Add bits and keywords for generator classes (docs, forum, wiki).
This commit is contained in:
parent
4c627d0e1d
commit
bd2c3855ed
@ -5,10 +5,10 @@ import java.util.EnumSet;
|
||||
public enum DocumentFlags {
|
||||
Javascript,
|
||||
PlainText,
|
||||
GeneratorSpammy,
|
||||
GeneratorVintage,
|
||||
GeneratorBlog,
|
||||
GeneratorForumWiki,
|
||||
GeneratorDocs,
|
||||
GeneratorForum,
|
||||
GeneratorWiki,
|
||||
Unused6,
|
||||
Unused7,
|
||||
Unused8,
|
||||
;
|
||||
|
@ -6,8 +6,9 @@ public enum GeneratorType {
|
||||
ZOOMER_STATIC,
|
||||
CMS,
|
||||
SAAS,
|
||||
MANUAL_RETRO,
|
||||
MANUAL_NEW,
|
||||
DOCS_FORUM_WIKI,
|
||||
MANUAL,
|
||||
FORUM,
|
||||
WIKI,
|
||||
DOCS,
|
||||
ECOMMERCE_AND_SPAM
|
||||
}
|
@ -4,6 +4,8 @@ import nu.marginalia.converting.model.GeneratorType;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.jsoup.nodes.Document;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
/** Extracts keywords from the document's meta generator tag */
|
||||
@ -13,8 +15,10 @@ public class DocumentGeneratorExtractor {
|
||||
public DocumentGenerator generatorCleaned(Document doc) {
|
||||
|
||||
var tags = doc.select("meta[name=generator]");
|
||||
|
||||
if (tags.size() == 0) {
|
||||
return DocumentGenerator.unset();
|
||||
// Some sites have a comment in the head instead of a meta tag
|
||||
return fingerprintByComments(doc);
|
||||
}
|
||||
if (tags.size() > 1) {
|
||||
return DocumentGenerator.multiple();
|
||||
@ -22,7 +26,7 @@ public class DocumentGeneratorExtractor {
|
||||
String generator = tags.attr("content");
|
||||
|
||||
// Remove leading or trailing junk from the generator string, "powered by" etc.
|
||||
generator = trim(generator);
|
||||
generator = removePrefixOrSuffix(generator);
|
||||
|
||||
if (generator.isBlank())
|
||||
return DocumentGenerator.unset();
|
||||
@ -63,11 +67,29 @@ public class DocumentGeneratorExtractor {
|
||||
}
|
||||
}
|
||||
|
||||
private String trim(String generator) {
|
||||
// Fallback when there is no generator meta tag: fingerprint the generator from comments in the head
|
||||
private DocumentGenerator fingerprintByComments(Document doc) {
|
||||
|
||||
for (var comment : doc.getElementsByTag("head").comments()) {
|
||||
if (comment.getData().contains("Generated by javadoc")) {
|
||||
return DocumentGenerator.of("javadoc");
|
||||
}
|
||||
}
|
||||
|
||||
return DocumentGenerator.unset();
|
||||
}
|
||||
|
||||
private String removePrefixOrSuffix(String generator) {
|
||||
|
||||
generator = generator.toLowerCase().trim();
|
||||
if (generator.startsWith("powered by ")) {
|
||||
generator = generator.substring("powered by ".length());
|
||||
|
||||
// strip common prefixes
|
||||
for (String prefix : Arrays.asList("powered by ", "generated by ")) {
|
||||
|
||||
if (generator.startsWith(prefix)) {
|
||||
generator = generator.substring(prefix.length());
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
int dashIdx = generator.indexOf('-'); // Some strings have values like 'foobar 2.3 - the free online generator!'
|
||||
@ -82,7 +104,8 @@ public class DocumentGeneratorExtractor {
|
||||
}
|
||||
|
||||
// Censor exact version strings, being able to search by major version is enough
|
||||
// for any non-blackhat purpose
|
||||
// for any non-blackhat purpose; creating a directory with exact version strings
|
||||
// is a security risk for the site owner.
|
||||
private String truncVersion(String part) {
|
||||
int periodIdx = part.indexOf('.', part.startsWith("0.") ? 2 : 0);
|
||||
|
||||
@ -101,6 +124,8 @@ public class DocumentGeneratorExtractor {
|
||||
if (parts.length == 0)
|
||||
return unset();
|
||||
|
||||
List<String> keywords = new ArrayList<>(List.of(parts));
|
||||
|
||||
final GeneratorType type = switch (parts[0]) {
|
||||
case "joomla", "wordpress", "drupal", "plone", "postnuke", "divi", "freeway", "unicity",
|
||||
"modx", "sitemagic", "agility", "edlio", "blogger", "slider", "slider_revolution", "gravcms",
|
||||
@ -121,23 +146,29 @@ public class DocumentGeneratorExtractor {
|
||||
-> GeneratorType.BOOMER_STATIC;
|
||||
case "hugo", "jekyll", "hakyll", "gatsby", "react", "gridsome"
|
||||
-> GeneratorType.ZOOMER_STATIC;
|
||||
case "vi", "vim", "emacs", "orgmode", "hand", "vscode", "atom", "bbedit", "nano"
|
||||
-> GeneratorType.MANUAL_NEW;
|
||||
case "notepad.exe", "gedit", "me",
|
||||
case "vi", "vim", "emacs", "orgmode", "hand", "vscode", "atom", "bbedit", "nano",
|
||||
"notepad.exe", "gedit", "me",
|
||||
"geany", "sublime", "notepad++", "author",
|
||||
"notepad", "namo", "arachnophilia", "scite",
|
||||
"alleycode", "htmlkit", "acehtml", "bluefish", "htmled", "cutehtml", "fileedit", "cocoa"
|
||||
-> GeneratorType.MANUAL_RETRO;
|
||||
case "vbulletin", "phpbb", "mybb", "nodebb", "flarum",
|
||||
"discourse", "mediawiki", "dokuwiki", "pandoc", "mkdocs", "sharepoint", "doxygen"
|
||||
-> GeneratorType.DOCS_FORUM_WIKI;
|
||||
-> GeneratorType.MANUAL;
|
||||
case "vbulletin", "phpbb", "mybb", "nodebb", "flarum", "discourse"
|
||||
-> GeneratorType.FORUM;
|
||||
case "mediawiki", "dokuwiki", "sharepoint"
|
||||
-> GeneratorType.WIKI;
|
||||
case "pandoc", "mkdocs", "doxygen", "javadoc"
|
||||
-> GeneratorType.DOCS;
|
||||
case "woocommerce", "shopfactory", "prestashop", "magento", "shopify", "sitedirect", "seomatic"
|
||||
-> GeneratorType.ECOMMERCE_AND_SPAM;
|
||||
default
|
||||
-> GeneratorType.UNKNOWN;
|
||||
};
|
||||
|
||||
return new DocumentGenerator(type, List.of(parts));
|
||||
if (type != GeneratorType.UNKNOWN) {
|
||||
keywords.add(type.name().toLowerCase());
|
||||
}
|
||||
|
||||
return new DocumentGenerator(type, keywords);
|
||||
}
|
||||
|
||||
public static DocumentGenerator multiple() {
|
||||
|
@ -182,10 +182,9 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
|
||||
}
|
||||
|
||||
switch (type) {
|
||||
case ECOMMERCE_AND_SPAM -> flags.add(DocumentFlags.GeneratorSpammy);
|
||||
case DOCS_FORUM_WIKI -> flags.add(DocumentFlags.GeneratorForumWiki);
|
||||
case ZOOMER_STATIC, MANUAL_NEW -> flags.add(DocumentFlags.GeneratorBlog);
|
||||
case MANUAL_RETRO, BOOMER_STATIC -> flags.add(DocumentFlags.GeneratorVintage);
|
||||
case DOCS -> flags.add(DocumentFlags.GeneratorDocs);
|
||||
case FORUM -> flags.add(DocumentFlags.GeneratorForum);
|
||||
case WIKI -> flags.add(DocumentFlags.GeneratorWiki);
|
||||
default -> {} // no flags
|
||||
}
|
||||
|
||||
|
@ -64,26 +64,13 @@ public class RankingSearchSet implements SearchSet {
|
||||
|
||||
@Override
|
||||
public boolean contains(int urlId, long documentMetadata) {
|
||||
// For ranked search sets, exclude excessively commercial sites
|
||||
// TODO: Maybe this particular check should be moved up to the search service and be opt-in?
|
||||
if (DocumentMetadata.hasFlags(documentMetadata, DocumentFlags.GeneratorSpammy.asBit())) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// This is the main check
|
||||
if (set.contains(urlId) || set.isEmpty()) {
|
||||
return true;
|
||||
}
|
||||
|
||||
// For the rest, let through some domains that are not in the set based on the generator tag
|
||||
if (identifier == SearchSetIdentifier.SMALLWEB) {
|
||||
return DocumentMetadata.hasFlags(documentMetadata, DocumentFlags.GeneratorBlog.asBit());
|
||||
}
|
||||
if (identifier == SearchSetIdentifier.RETRO) {
|
||||
return DocumentMetadata.hasFlags(documentMetadata, DocumentFlags.GeneratorVintage.asBit());
|
||||
}
|
||||
|
||||
return DocumentMetadata.hasFlags(documentMetadata, DocumentFlags.GeneratorForumWiki.asBit());
|
||||
// TODO
|
||||
return false;
|
||||
}
|
||||
|
||||
public void write() throws IOException {
|
||||
|
Loading…
Reference in New Issue
Block a user