(converter) Penalize ChatGPT content farm spam

This commit is contained in:
Viktor Lofgren 2024-01-03 16:51:26 +01:00
parent 1e06aee6a2
commit f599944942

View File

@ -11,6 +11,7 @@ import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.select.NodeVisitor;
import java.util.List;
import java.util.Set;
public class DocumentValuator {
@ -21,6 +22,7 @@ public class DocumentValuator {
int textLength) throws DisqualifiedException {
double scriptPenalty = getScriptPenalty(parsedDocument);
double chatGptPenalty = getChatGptContentFarmPenalty(parsedDocument);
int rawLength = crawledDocument.documentBody.length();
@ -30,7 +32,22 @@ public class DocumentValuator {
return Math.log(textLength / (double) (1+rawLength))*htmlStandard.scale
+ htmlStandard.offset
- scriptPenalty;
- scriptPenalty
- chatGptPenalty;
}
/**
 * Computes a score penalty for documents that look like AI-authored content farm spam.
 *
 * <p>The heuristic looks at the text of every {@code h1}, {@code h2} and {@code h3}
 * element and flags the document when any of them begins with the literal string
 * "Benefits of". The original author's observation is that easily 90% of modern
 * AI-authored content farm spam carries this exact string in one of its headings.
 *
 * @param parsedDocument the parsed HTML document whose headings are inspected
 * @return {@code 10} if at least one matching heading is found, otherwise {@code 0}
 */
private double getChatGptContentFarmPenalty(Document parsedDocument) {
    boolean hasSpamHeading = List.of("h1", "h2", "h3").stream()
            .flatMap(headingTag -> parsedDocument.getElementsByTag(headingTag).stream())
            .map(heading -> heading.text())
            // Prefix match only: the phrase must open the heading, not merely occur in it.
            .anyMatch(headingText -> headingText.startsWith("Benefits of"));

    return hasSpamHeading ? 10 : 0;
}