Add bits and keywords for generator classes (docs, forum, wiki).

This commit is contained in:
Viktor Lofgren 2023-06-23 21:35:28 +02:00
parent 4c627d0e1d
commit bd2c3855ed
5 changed files with 58 additions and 40 deletions

View File

@ -5,10 +5,10 @@ import java.util.EnumSet;
public enum DocumentFlags {
Javascript,
PlainText,
GeneratorSpammy,
GeneratorVintage,
GeneratorBlog,
GeneratorForumWiki,
GeneratorDocs,
GeneratorForum,
GeneratorWiki,
Unused6,
Unused7,
Unused8,
;

View File

@ -6,8 +6,9 @@ public enum GeneratorType {
ZOOMER_STATIC,
CMS,
SAAS,
MANUAL_RETRO,
MANUAL_NEW,
DOCS_FORUM_WIKI,
MANUAL,
FORUM,
WIKI,
DOCS,
ECOMMERCE_AND_SPAM
}

View File

@ -4,6 +4,8 @@ import nu.marginalia.converting.model.GeneratorType;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.nodes.Document;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
/** Extract keywords for the document meta generator tag */
@ -13,8 +15,10 @@ public class DocumentGeneratorExtractor {
public DocumentGenerator generatorCleaned(Document doc) {
var tags = doc.select("meta[name=generator]");
if (tags.size() == 0) {
return DocumentGenerator.unset();
// Some sites have a comment in the head instead of a meta tag
return fingerprintByComments(doc);
}
if (tags.size() > 1) {
return DocumentGenerator.multiple();
@ -22,7 +26,7 @@ public class DocumentGeneratorExtractor {
String generator = tags.attr("content");
// Remove leading or trailing junk from the generator string, "powered by" etc.
generator = trim(generator);
generator = removePrefixOrSuffix(generator);
if (generator.isBlank())
return DocumentGenerator.unset();
@ -63,11 +67,29 @@ public class DocumentGeneratorExtractor {
}
}
private String trim(String generator) {
/**
 * Fallback used when the document has no generator meta tag: looks through the
 * comments found under the document's head element for a known generator marker.
 *
 * @param doc the parsed document to inspect
 * @return a "javadoc" generator if any head comment contains the text
 *         "Generated by javadoc", otherwise the unset generator
 */
private DocumentGenerator fingerprintByComments(Document doc) {
    boolean generatedByJavadoc = doc.getElementsByTag("head")
            .comments()
            .stream()
            .anyMatch(headComment -> headComment.getData().contains("Generated by javadoc"));

    return generatedByJavadoc
            ? DocumentGenerator.of("javadoc")
            : DocumentGenerator.unset();
}
private String removePrefixOrSuffix(String generator) {
generator = generator.toLowerCase().trim();
if (generator.startsWith("powered by ")) {
generator = generator.substring("powered by ".length());
// strip common prefixes
for (String prefix : Arrays.asList("powered by ", "generated by ")) {
if (generator.startsWith(prefix)) {
generator = generator.substring(prefix.length());
break;
}
}
int dashIdx = generator.indexOf('-'); // Some strings have values like 'foobar 2.3 - the free online generator!'
@ -82,7 +104,8 @@ public class DocumentGeneratorExtractor {
}
// Censor exact version strings, being able to search by major version is enough
// for any non-blackhat purpose
// for any non-blackhat purpose; making sites searchable by their exact version string
// would be a security risk for the site owners.
private String truncVersion(String part) {
int periodIdx = part.indexOf('.', part.startsWith("0.") ? 2 : 0);
@ -101,6 +124,8 @@ public class DocumentGeneratorExtractor {
if (parts.length == 0)
return unset();
List<String> keywords = new ArrayList<>(List.of(parts));
final GeneratorType type = switch (parts[0]) {
case "joomla", "wordpress", "drupal", "plone", "postnuke", "divi", "freeway", "unicity",
"modx", "sitemagic", "agility", "edlio", "blogger", "slider", "slider_revolution", "gravcms",
@ -121,23 +146,29 @@ public class DocumentGeneratorExtractor {
-> GeneratorType.BOOMER_STATIC;
case "hugo", "jekyll", "hakyll", "gatsby", "react", "gridsome"
-> GeneratorType.ZOOMER_STATIC;
case "vi", "vim", "emacs", "orgmode", "hand", "vscode", "atom", "bbedit", "nano"
-> GeneratorType.MANUAL_NEW;
case "notepad.exe", "gedit", "me",
case "vi", "vim", "emacs", "orgmode", "hand", "vscode", "atom", "bbedit", "nano",
"notepad.exe", "gedit", "me",
"geany", "sublime", "notepad++", "author",
"notepad", "namo", "arachnophilia", "scite",
"alleycode", "htmlkit", "acehtml", "bluefish", "htmled", "cutehtml", "fileedit", "cocoa"
-> GeneratorType.MANUAL_RETRO;
case "vbulletin", "phpbb", "mybb", "nodebb", "flarum",
"discourse", "mediawiki", "dokuwiki", "pandoc", "mkdocs", "sharepoint", "doxygen"
-> GeneratorType.DOCS_FORUM_WIKI;
-> GeneratorType.MANUAL;
case "vbulletin", "phpbb", "mybb", "nodebb", "flarum", "discourse"
-> GeneratorType.FORUM;
case "mediawiki", "dokuwiki", "sharepoint"
-> GeneratorType.WIKI;
case "pandoc", "mkdocs", "doxygen", "javadoc"
-> GeneratorType.DOCS;
case "woocommerce", "shopfactory", "prestashop", "magento", "shopify", "sitedirect", "seomatic"
-> GeneratorType.ECOMMERCE_AND_SPAM;
default
-> GeneratorType.UNKNOWN;
};
return new DocumentGenerator(type, List.of(parts));
if (type != GeneratorType.UNKNOWN) {
keywords.add(type.name().toLowerCase());
}
return new DocumentGenerator(type, keywords);
}
public static DocumentGenerator multiple() {

View File

@ -182,10 +182,9 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
}
switch (type) {
case ECOMMERCE_AND_SPAM -> flags.add(DocumentFlags.GeneratorSpammy);
case DOCS_FORUM_WIKI -> flags.add(DocumentFlags.GeneratorForumWiki);
case ZOOMER_STATIC, MANUAL_NEW -> flags.add(DocumentFlags.GeneratorBlog);
case MANUAL_RETRO, BOOMER_STATIC -> flags.add(DocumentFlags.GeneratorVintage);
case DOCS -> flags.add(DocumentFlags.GeneratorDocs);
case FORUM -> flags.add(DocumentFlags.GeneratorForum);
case WIKI -> flags.add(DocumentFlags.GeneratorWiki);
default -> {} // no flags
}

View File

@ -64,26 +64,13 @@ public class RankingSearchSet implements SearchSet {
@Override
public boolean contains(int urlId, long documentMetadata) {
// For ranked search sets, exclude excessively commercial sites
// TODO: Maybe this particular check should be moved up to the search service and be opt-in?
if (DocumentMetadata.hasFlags(documentMetadata, DocumentFlags.GeneratorSpammy.asBit())) {
return false;
}
// This is the main check
if (set.contains(urlId) || set.isEmpty()) {
return true;
}
// For the rest, let through some domains that are not in the set based on the generator tag
if (identifier == SearchSetIdentifier.SMALLWEB) {
return DocumentMetadata.hasFlags(documentMetadata, DocumentFlags.GeneratorBlog.asBit());
}
if (identifier == SearchSetIdentifier.RETRO) {
return DocumentMetadata.hasFlags(documentMetadata, DocumentFlags.GeneratorVintage.asBit());
}
return DocumentMetadata.hasFlags(documentMetadata, DocumentFlags.GeneratorForumWiki.asBit());
// TODO
return false;
}
public void write() throws IOException {