Categorize generators by type

This is a great quality signal!
Add the type as document bitflags by category.
This commit is contained in:
Viktor Lofgren 2023-06-22 16:04:37 +02:00
parent f140e7d7c7
commit b5ef67ed28
5 changed files with 111 additions and 29 deletions

View File

@ -5,10 +5,10 @@ import java.util.EnumSet;
public enum DocumentFlags {
Javascript,
PlainText,
Ads,
Tracking,
Unused5,
Unused6,
GeneratorSpammy,
GeneratorVintage,
GeneratorBlog,
GeneratorForumWiki,
Unused7,
Unused8,
;

View File

@ -0,0 +1,13 @@
package nu.marginalia.converting.model;
public enum GeneratorType {
UNKNOWN,
BOOMER_STATIC,
ZOOMER_STATIC,
CMS,
SAAS,
MANUAL_RETRO,
MANUAL_NEW,
DOCS_FORUM_WIKI,
ECOMMERCE_AND_SPAM
}

View File

@ -29,4 +29,5 @@ public class ProcessedDocumentDetails {
public List<EdgeUrl> feedLinks;
public DocumentMetadata metadata;
public GeneratorType generator;
}

View File

@ -1,61 +1,65 @@
package nu.marginalia.converting.processor.logic;
import nu.marginalia.converting.model.GeneratorType;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.nodes.Document;
import java.util.Collections;
import java.util.List;
/** Extract keywords for the document meta generator tag */
public class DocumentGeneratorExtractor {
private static final String defaultValue = "unset";
private final String defaultValue = "unset";
/**
 * Reads the document's {@code meta[name=generator]} tag and reduces its content to a
 * generator name, optionally accompanied by a {@code name_version} keyword.
 * <p>
 * NOTE(review): this span is a rendered diff without +/- markers, so two variants of the
 * method are interleaved line by line. Presumably the {@code List<String>} signature and
 * the {@code List.of(...)} returns are the pre-commit code, and the {@code DocumentGenerator}
 * signature and returns are the post-commit code -- confirm against the repository.
 *
 * @param doc the parsed HTML document to inspect
 * @return in the {@code DocumentGenerator} variant: {@code unset()} when no generator tag is
 *         present or its content is blank, empty after splitting, or longer than three tokens;
 *         {@code multiple()} when more than one generator tag is present; otherwise
 *         {@code DocumentGenerator.of(...)} built from the leading tokens
 */
public DocumentGenerator generatorCleaned(Document doc) {
public List<String> generatorCleaned(Document doc) {
String generator = doc
.select("meta[name=generator]")
.attr("content");
var tags = doc.select("meta[name=generator]");
// No generator tag at all
if (tags.size() == 0) {
return DocumentGenerator.unset();
}
// Several generator tags: handled as a special case rather than picking one of them
if (tags.size() > 1) {
return DocumentGenerator.multiple();
}
String generator = tags.attr("content");
// Remove leading or trailing junk from the generator string, "powered by" etc.
generator = trim(generator);
if (generator.isBlank())
return List.of(defaultValue);
return DocumentGenerator.unset();
// Tokenize on space, comma, colon and exclamation mark
String[] parts = StringUtils.split(generator, " ,:!");
if (parts.length == 0)
return List.of(defaultValue);
return DocumentGenerator.unset();
// A slash in the first token: keep only the text before it
int slashIdx = parts[0].indexOf('/');
if (slashIdx >= 0) {
// mozilla and staroffice have a really weird format
return List.of(parts[0].substring(0, slashIdx));
return DocumentGenerator.of(parts[0].substring(0, slashIdx));
}
if (parts.length > 3) {
return List.of(defaultValue); // if it's still very long after trim(), it's probably a custom hand written message
return DocumentGenerator.unset(); // if it's still very long after trim(), it's probably a custom hand written message
}
switch (parts[0]) {
// NOTE(review): '!' is one of the split separators above, so parts[0] presumably
// can never equal "joomla!" -- confirm against StringUtils.split and trim()
case "joomla!":
return List.of("joomla");
return DocumentGenerator.of("joomla");
// These generators are reported by name only; any following tokens are dropped
case "plone":
case "claris":
case "one.com":
case "wix.com":
case "wpbakery":
return List.of(parts[0]);
return DocumentGenerator.of(parts[0]);
// For these vendors the second token is used as the generator name.
// NOTE(review): parts[1] is read without a length check; a content value consisting
// of just "adobe" or "microsoft" would throw ArrayIndexOutOfBoundsException -- confirm
case "adobe":
case "microsoft":
return List.of(parts[1]);
return DocumentGenerator.of(parts[1]);
}
// With a second token present, also emit "<name>_<truncVersion(second token)>"
if (parts.length > 1) {
return List.of(parts[0], parts[0] + "_" + truncVersion(parts[1]));
return DocumentGenerator.of(parts[0], parts[0] + "_" + truncVersion(parts[1]));
}
else {
return List.of(parts[0]);
return DocumentGenerator.of(parts[0]);
}
}
@ -88,4 +92,60 @@ public class DocumentGeneratorExtractor {
return part.substring(0, periodIdx);
}
/**
 * The outcome of generator extraction: a {@link GeneratorType} category together with
 * the keywords derived from the generator string.
 *
 * @param type     category assigned to the generator
 * @param keywords keywords describing the generator
 */
public record DocumentGenerator(GeneratorType type, List<String> keywords) {

    /** Result used when no usable generator was found: UNKNOWN type, default keyword. */
    public static DocumentGenerator unset() {
        return withDefaultKeyword(GeneratorType.UNKNOWN);
    }

    /**
     * Result used when a document carries more than one generator tag.
     * Software in the WordPress family is the usual source of duplicated
     * generator tags, hence the CMS category.
     */
    public static DocumentGenerator multiple() {
        return withDefaultKeyword(GeneratorType.CMS);
    }

    /**
     * Builds a result from cleaned generator tokens. The first token selects the
     * category; all tokens become the keywords.
     *
     * @param parts generator tokens, the generator name first
     * @return the classified generator, or {@link #unset()} when no tokens are given
     */
    public static DocumentGenerator of(String... parts) {
        if (parts.length > 0) {
            return new DocumentGenerator(classify(parts[0]), List.of(parts));
        }
        return unset();
    }

    /** Pairs the given category with the enclosing class's default keyword. */
    private static DocumentGenerator withDefaultKeyword(GeneratorType category) {
        return new DocumentGenerator(category, List.of(defaultValue));
    }

    /** Maps a generator name onto its category; names not listed here are UNKNOWN. */
    private static GeneratorType classify(String name) {
        switch (name) {
            case "joomla", "wordpress", "drupal", "plone", "postnuke", "divi", "freeway",
                 "unicity", "modx", "sitemagic", "agility", "edlio", "blogger", "slider",
                 "slider_revolution", "gravcms", "typo3", "dotnetnuke", "cms", "coremedia",
                 "dspace":
                return GeneratorType.CMS;

            case "wix.com", "one.com", "wpbakery", "claris", "wordpress.com", "hubspot",
                 "visual_composer", "mobirise", "everweb", "rapidweaver", "shorthand",
                 "visual", "nitropack",
                 // not strictly SAAS, but close enough to be grouped with it
                 "redux", "bootply":
                return GeneratorType.SAAS;

            case "staroffice", "word", "frontpage", "dreamweaver", "mshtml", "iweb", "excel",
                 "wordperfect", "netscape", "corel", "powerpoint", "openoffice.org",
                 "openoffice", "latex2html", "lotus", "homesite", "trellix", "yahoo",
                 "libreoffice", "opera", "stone's_webwriter", "pdf2htmlex", "nvu", "mozilla",
                 "golive", "tenfingers", "publisher", "allaire", "neooffice":
                return GeneratorType.BOOMER_STATIC;

            case "hugo", "jekyll", "hakyll", "gatsby", "react", "gridsome":
                return GeneratorType.ZOOMER_STATIC;

            case "vi", "vim", "emacs", "orgmode", "hand", "vscode", "atom", "bbedit", "nano":
                return GeneratorType.MANUAL_NEW;

            case "notepad.exe", "gedit", "me", "geany", "sublime", "notepad++", "author",
                 "notepad", "namo", "arachnophilia", "scite", "alleycode", "htmlkit",
                 "acehtml", "bluefish", "htmled", "cutehtml", "fileedit", "cocoa":
                return GeneratorType.MANUAL_RETRO;

            case "vbulletin", "phpbb", "mybb", "nodebb", "flarum", "discourse", "mediawiki",
                 "dokuwiki", "pandoc", "mkdocs", "sharepoint", "doxygen":
                return GeneratorType.DOCS_FORUM_WIKI;

            case "woocommerce", "shopfactory", "prestashop", "magento", "shopify",
                 "sitedirect", "seomatic":
                return GeneratorType.ECOMMERCE_AND_SPAM;

            default:
                return GeneratorType.UNKNOWN;
        }
    }
}
}

View File

@ -2,6 +2,7 @@ package nu.marginalia.converting.processor.plugin;
import com.google.inject.Inject;
import com.google.inject.name.Named;
import nu.marginalia.converting.model.GeneratorType;
import nu.marginalia.converting.processor.MetaRobotsTag;
import nu.marginalia.converting.processor.logic.dom.DomPruningFilter;
import nu.marginalia.converting.processor.logic.dom.MeasureLengthVisitor;
@ -136,7 +137,10 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
ret.hashCode = dld.localitySensitiveHashCode();
PubDate pubDate = pubDateSniffer.getPubDate(crawledDocument.headers, url, doc, standard, true);
EnumSet<DocumentFlags> documentFlags = htmlFeatures2DocumentFlags(features);
final var generatorParts = documentGeneratorExtractor.generatorCleaned(doc);
EnumSet<DocumentFlags> documentFlags = documentFlags(features, generatorParts.type());
ret.metadata = new DocumentMetadata(
documentLengthLogic.getEncodedAverageLength(dld),
@ -146,7 +150,9 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
ret.description = getDescription(doc, words.importantWords);
List<String> generatorParts = documentGeneratorExtractor.generatorCleaned(doc);
ret.generator = generatorParts.type();
var tagWords = new MetaTagsBuilder()
.addDomainCrawlData(crawledDomain)
@ -154,7 +160,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
.addUrl(url)
.addFeatures(features)
.addFormat(standard)
.addGenerator(generatorParts)
.addGenerator(generatorParts.keywords())
.build();
words.addAllSyntheticTerms(tagWords);
@ -168,17 +174,19 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
return new DetailsWithWords(ret, words);
}
private EnumSet<DocumentFlags> htmlFeatures2DocumentFlags(Set<HtmlFeature> features) {
private EnumSet<DocumentFlags> documentFlags(Set<HtmlFeature> features, GeneratorType type) {
EnumSet<DocumentFlags> flags = EnumSet.noneOf(DocumentFlags.class);
if (features.contains(HtmlFeature.ADVERTISEMENT)) {
flags.add(DocumentFlags.Ads);
}
if (features.contains(HtmlFeature.JS)) {
flags.add(DocumentFlags.Javascript);
}
if (features.contains(HtmlFeature.TRACKING)) {
flags.add(DocumentFlags.Tracking);
switch (type) {
case ECOMMERCE_AND_SPAM -> flags.add(DocumentFlags.GeneratorSpammy);
case DOCS_FORUM_WIKI -> flags.add(DocumentFlags.GeneratorForumWiki);
case ZOOMER_STATIC, MANUAL_NEW -> flags.add(DocumentFlags.GeneratorBlog);
case MANUAL_RETRO, BOOMER_STATIC -> flags.add(DocumentFlags.GeneratorVintage);
default -> {} // no flags
}
return flags;