Categorize generators by type
This is a great quality signal! Add the type as document bitflags by category.
This commit is contained in:
parent
f140e7d7c7
commit
b5ef67ed28
@ -5,10 +5,10 @@ import java.util.EnumSet;
|
||||
public enum DocumentFlags {
|
||||
Javascript,
|
||||
PlainText,
|
||||
Ads,
|
||||
Tracking,
|
||||
Unused5,
|
||||
Unused6,
|
||||
GeneratorSpammy,
|
||||
GeneratorVintage,
|
||||
GeneratorBlog,
|
||||
GeneratorForumWiki,
|
||||
Unused7,
|
||||
Unused8,
|
||||
;
|
||||
|
@ -0,0 +1,13 @@
|
||||
package nu.marginalia.converting.model;
|
||||
|
||||
/**
 * Broad category of the tool named in a document's generator meta tag.
 * DocumentGenerator.of() picks the category from the first generator keyword,
 * and HtmlDocumentProcessorPlugin.documentFlags() translates it into DocumentFlags.
 */
public enum GeneratorType {
|
||||
// Fallback: no usable generator tag, or a name DocumentGenerator.of() does not recognise. Sets no flag.
UNKNOWN,
|
||||
// Names such as "frontpage", "dreamweaver", "staroffice", "netscape". Flagged as GeneratorVintage.
BOOMER_STATIC,
|
||||
// Names such as "hugo", "jekyll", "gatsby". Flagged as GeneratorBlog.
ZOOMER_STATIC,
|
||||
// Names such as "wordpress", "joomla", "drupal"; also used when several generator tags are present. Sets no flag.
CMS,
|
||||
// Names such as "wix.com", "one.com", "hubspot". Sets no flag.
SAAS,
|
||||
// Names such as "notepad", "bluefish", "arachnophilia". Flagged as GeneratorVintage.
MANUAL_RETRO,
|
||||
// Names such as "vim", "emacs", "vscode". Flagged as GeneratorBlog.
MANUAL_NEW,
|
||||
// Names such as "phpbb", "mediawiki", "doxygen". Flagged as GeneratorForumWiki.
DOCS_FORUM_WIKI,
|
||||
// Names such as "woocommerce", "shopify", "magento". Flagged as GeneratorSpammy.
ECOMMERCE_AND_SPAM
|
||||
}
|
@ -29,4 +29,5 @@ public class ProcessedDocumentDetails {
|
||||
public List<EdgeUrl> feedLinks;
|
||||
|
||||
public DocumentMetadata metadata;
|
||||
public GeneratorType generator;
|
||||
}
|
||||
|
@ -1,61 +1,65 @@
|
||||
package nu.marginalia.converting.processor.logic;
|
||||
|
||||
import nu.marginalia.converting.model.GeneratorType;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.jsoup.nodes.Document;
|
||||
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
|
||||
/** Extracts keywords and a generator type from the document's meta generator tag */
|
||||
public class DocumentGeneratorExtractor {
|
||||
private static final String defaultValue = "unset";
|
||||
|
||||
private final String defaultValue = "unset";
|
||||
/**
 * Reduces the document's meta generator tag to a DocumentGenerator.
 *
 * Returns DocumentGenerator.unset() when there is no generator tag, when the
 * trimmed content is blank or yields no tokens, or when more than three tokens
 * remain. Returns DocumentGenerator.multiple() when more than one generator
 * tag is present. Otherwise the content is split on space, comma, colon and
 * exclamation mark, and the result is built from the first token (plus a
 * "name_version" keyword when a second token exists), with a few
 * vendor-specific special cases handled in the switch below.
 *
 * @param doc the parsed HTML document
 * @return the generator keywords together with their GeneratorType
 */
public DocumentGenerator generatorCleaned(Document doc) {
|
||||
|
||||
public List<String> generatorCleaned(Document doc) {
|
||||
|
||||
String generator = doc
|
||||
.select("meta[name=generator]")
|
||||
.attr("content");
|
||||
var tags = doc.select("meta[name=generator]");
|
||||
if (tags.size() == 0) {
|
||||
return DocumentGenerator.unset();
|
||||
}
|
||||
if (tags.size() > 1) {
|
||||
return DocumentGenerator.multiple();
|
||||
}
|
||||
String generator = tags.attr("content");
|
||||
|
||||
// Remove leading or trailing junk from the generator string, "powered by" etc.
|
||||
generator = trim(generator);
|
||||
|
||||
if (generator.isBlank())
|
||||
return List.of(defaultValue);
|
||||
return DocumentGenerator.unset();
|
||||
|
||||
String[] parts = StringUtils.split(generator, " ,:!");
|
||||
if (parts.length == 0)
|
||||
return List.of(defaultValue);
|
||||
return DocumentGenerator.unset();
|
||||
|
||||
int slashIdx = parts[0].indexOf('/');
|
||||
if (slashIdx >= 0) {
|
||||
// mozilla and staroffice have a really weird format; keep only the part before the slash
|
||||
return List.of(parts[0].substring(0, slashIdx));
|
||||
return DocumentGenerator.of(parts[0].substring(0, slashIdx));
|
||||
}
|
||||
|
||||
if (parts.length > 3) {
|
||||
return List.of(defaultValue); // if it's still very long after trim(), it's probably a custom hand written message
|
||||
return DocumentGenerator.unset(); // if it's still very long after trim(), it's probably a custom hand written message
|
||||
}
|
||||
|
||||
switch (parts[0]) {
|
||||
// NOTE(review): '!' is one of the split separators above, so a token can apparently never
|
||||
// equal "joomla!" and this case looks unreachable -- confirm against StringUtils.split.
|
||||
case "joomla!":
|
||||
return List.of("joomla");
|
||||
return DocumentGenerator.of("joomla");
|
||||
case "plone":
|
||||
case "claris":
|
||||
case "one.com":
|
||||
case "wix.com":
|
||||
case "wpbakery":
|
||||
return List.of(parts[0]);
|
||||
return DocumentGenerator.of(parts[0]);
|
||||
// For these vendors the second token (the product name) is used instead of the first.
|
||||
// NOTE(review): parts[1] is read without checking parts.length > 1; a generator consisting
|
||||
// only of "adobe" or "microsoft" would throw ArrayIndexOutOfBoundsException here.
|
||||
case "adobe":
|
||||
case "microsoft":
|
||||
return List.of(parts[1]);
|
||||
return DocumentGenerator.of(parts[1]);
|
||||
}
|
||||
|
||||
// Default: the generator name, plus "name_version" when a second token is available.
|
||||
if (parts.length > 1) {
|
||||
return List.of(parts[0], parts[0] + "_" + truncVersion(parts[1]));
|
||||
return DocumentGenerator.of(parts[0], parts[0] + "_" + truncVersion(parts[1]));
|
||||
}
|
||||
else {
|
||||
return List.of(parts[0]);
|
||||
return DocumentGenerator.of(parts[0]);
|
||||
}
|
||||
}
|
||||
|
||||
@ -88,4 +92,60 @@ public class DocumentGeneratorExtractor {
|
||||
return part.substring(0, periodIdx);
|
||||
}
|
||||
|
||||
/**
 * Result of generator extraction: the category of the generator and the
 * keywords derived from the generator tag.
 *
 * @param type     category chosen from the first keyword
 * @param keywords generator keywords; List.of(defaultValue) when nothing usable was found
 */
public record DocumentGenerator(GeneratorType type, List<String> keywords) {
|
||||
/** Returns the "nothing usable" result: type UNKNOWN with defaultValue as the only keyword. */
public static DocumentGenerator unset() {
|
||||
return new DocumentGenerator(GeneratorType.UNKNOWN, List.of(defaultValue));
|
||||
}
|
||||
|
||||
/**
 * Builds a DocumentGenerator whose type is decided by parts[0] and whose
 * keywords are all of the given parts. Returns unset() when no parts are given.
 * Names not listed in the switch map to GeneratorType.UNKNOWN.
 */
public static DocumentGenerator of(String... parts) {
|
||||
if (parts.length == 0)
|
||||
return unset();
|
||||
|
||||
// The match is an exact, case-sensitive string comparison.
// NOTE(review): presumably the caller has already lower-cased the name -- confirm in trim().
final GeneratorType type = switch (parts[0]) {
|
||||
case "joomla", "wordpress", "drupal", "plone", "postnuke", "divi", "freeway", "unicity",
|
||||
"modx", "sitemagic", "agility", "edlio", "blogger", "slider", "slider_revolution", "gravcms",
|
||||
"typo3", "dotnetnuke", "cms", "coremedia", "dspace"
|
||||
-> GeneratorType.CMS;
|
||||
case "wix.com", "one.com", "wpbakery", "claris", "wordpress.com", "hubspot",
|
||||
"visual_composer", "mobirise", "everweb", "rapidweaver", "shorthand",
|
||||
"visual", "nitropack",
|
||||
/* these are not SAAS but close enough */
|
||||
"redux", "bootply"
|
||||
-> GeneratorType.SAAS;
|
||||
case "staroffice", "word", "frontpage", "dreamweaver", "mshtml",
|
||||
"iweb", "excel", "wordperfect", "netscape", "corel", "powerpoint",
|
||||
"openoffice.org", "openoffice", "latex2html", "lotus", "homesite",
|
||||
"trellix", "yahoo", "libreoffice", "opera", "stone's_webwriter",
|
||||
"pdf2htmlex", "nvu", "mozilla", "golive", "tenfingers", "publisher",
|
||||
"allaire", "neooffice"
|
||||
-> GeneratorType.BOOMER_STATIC;
|
||||
case "hugo", "jekyll", "hakyll", "gatsby", "react", "gridsome"
|
||||
-> GeneratorType.ZOOMER_STATIC;
|
||||
case "vi", "vim", "emacs", "orgmode", "hand", "vscode", "atom", "bbedit", "nano"
|
||||
-> GeneratorType.MANUAL_NEW;
|
||||
case "notepad.exe", "gedit", "me",
|
||||
"geany", "sublime", "notepad++", "author",
|
||||
"notepad", "namo", "arachnophilia", "scite",
|
||||
"alleycode", "htmlkit", "acehtml", "bluefish", "htmled", "cutehtml", "fileedit", "cocoa"
|
||||
-> GeneratorType.MANUAL_RETRO;
|
||||
case "vbulletin", "phpbb", "mybb", "nodebb", "flarum",
|
||||
"discourse", "mediawiki", "dokuwiki", "pandoc", "mkdocs", "sharepoint", "doxygen"
|
||||
-> GeneratorType.DOCS_FORUM_WIKI;
|
||||
case "woocommerce", "shopfactory", "prestashop", "magento", "shopify", "sitedirect", "seomatic"
|
||||
-> GeneratorType.ECOMMERCE_AND_SPAM;
|
||||
default
|
||||
-> GeneratorType.UNKNOWN;
|
||||
};
|
||||
|
||||
return new DocumentGenerator(type, List.of(parts));
|
||||
}
|
||||
|
||||
/** Result used when the document carries more than one generator tag: type CMS, keyword defaultValue. */
public static DocumentGenerator multiple() {
|
||||
// It's *generally* WordPress or the like that injects multiple generator tags
|
||||
|
||||
return new DocumentGenerator(GeneratorType.CMS, List.of(defaultValue));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
@ -2,6 +2,7 @@ package nu.marginalia.converting.processor.plugin;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.name.Named;
|
||||
import nu.marginalia.converting.model.GeneratorType;
|
||||
import nu.marginalia.converting.processor.MetaRobotsTag;
|
||||
import nu.marginalia.converting.processor.logic.dom.DomPruningFilter;
|
||||
import nu.marginalia.converting.processor.logic.dom.MeasureLengthVisitor;
|
||||
@ -136,7 +137,10 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
|
||||
ret.hashCode = dld.localitySensitiveHashCode();
|
||||
|
||||
PubDate pubDate = pubDateSniffer.getPubDate(crawledDocument.headers, url, doc, standard, true);
|
||||
EnumSet<DocumentFlags> documentFlags = htmlFeatures2DocumentFlags(features);
|
||||
|
||||
final var generatorParts = documentGeneratorExtractor.generatorCleaned(doc);
|
||||
|
||||
EnumSet<DocumentFlags> documentFlags = documentFlags(features, generatorParts.type());
|
||||
|
||||
ret.metadata = new DocumentMetadata(
|
||||
documentLengthLogic.getEncodedAverageLength(dld),
|
||||
@ -146,7 +150,9 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
|
||||
|
||||
ret.description = getDescription(doc, words.importantWords);
|
||||
|
||||
List<String> generatorParts = documentGeneratorExtractor.generatorCleaned(doc);
|
||||
|
||||
|
||||
ret.generator = generatorParts.type();
|
||||
|
||||
var tagWords = new MetaTagsBuilder()
|
||||
.addDomainCrawlData(crawledDomain)
|
||||
@ -154,7 +160,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
|
||||
.addUrl(url)
|
||||
.addFeatures(features)
|
||||
.addFormat(standard)
|
||||
.addGenerator(generatorParts)
|
||||
.addGenerator(generatorParts.keywords())
|
||||
.build();
|
||||
|
||||
words.addAllSyntheticTerms(tagWords);
|
||||
@ -168,17 +174,19 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
|
||||
return new DetailsWithWords(ret, words);
|
||||
}
|
||||
|
||||
private EnumSet<DocumentFlags> htmlFeatures2DocumentFlags(Set<HtmlFeature> features) {
|
||||
private EnumSet<DocumentFlags> documentFlags(Set<HtmlFeature> features, GeneratorType type) {
|
||||
EnumSet<DocumentFlags> flags = EnumSet.noneOf(DocumentFlags.class);
|
||||
|
||||
if (features.contains(HtmlFeature.ADVERTISEMENT)) {
|
||||
flags.add(DocumentFlags.Ads);
|
||||
}
|
||||
if (features.contains(HtmlFeature.JS)) {
|
||||
flags.add(DocumentFlags.Javascript);
|
||||
}
|
||||
if (features.contains(HtmlFeature.TRACKING)) {
|
||||
flags.add(DocumentFlags.Tracking);
|
||||
|
||||
switch (type) {
|
||||
case ECOMMERCE_AND_SPAM -> flags.add(DocumentFlags.GeneratorSpammy);
|
||||
case DOCS_FORUM_WIKI -> flags.add(DocumentFlags.GeneratorForumWiki);
|
||||
case ZOOMER_STATIC, MANUAL_NEW -> flags.add(DocumentFlags.GeneratorBlog);
|
||||
case MANUAL_RETRO, BOOMER_STATIC -> flags.add(DocumentFlags.GeneratorVintage);
|
||||
default -> {} // no flags
|
||||
}
|
||||
|
||||
return flags;
|
||||
|
Loading…
Reference in New Issue
Block a user