Improved publishing date heuristics

This commit is contained in:
vlofgren 2022-10-29 11:20:01 +02:00
parent 68cde1c3d8
commit 217584126c
10 changed files with 247 additions and 23 deletions

View File

@ -6,8 +6,10 @@ import java.time.format.DateTimeFormatter;
public record PubDate(String dateIso8601, int year) {
// First year we'll believe something can have been published on the web
// ... Tim Berners Lee's recipe collection or something
public static final int MIN_YEAR = 1989;
// Cut off at 1995 to reduce the false positive rate; bona fide documents
// from before then are so few that almost all hits are wrong
public static final int MIN_YEAR = 1995;
// Last year we'll believe something can be published in
public static final int MAX_YEAR = LocalDate.now().getYear() + 1;

View File

@ -9,15 +9,11 @@ import java.time.ZonedDateTime;
import java.time.format.DateTimeFormatter;
import java.util.Optional;
import java.util.OptionalInt;
import java.util.Random;
import java.util.concurrent.ThreadLocalRandom;
import java.util.regex.Pattern;
public class PubDateParser {
// ThreadLocalRandom lacks a few methods we need out of Random
private static ThreadLocal<Random> localRandom = ThreadLocal.withInitial(Random::new);
public static Optional<PubDate> attemptParseDate(String date) {
return Optional.ofNullable(date)
.filter(str -> str.length() >= 4 && str.length() < 32)
@ -66,7 +62,7 @@ public class PubDateParser {
return Optional.of(new PubDate(null, guessYear(min, max)));
}
if (max > PubDate.MIN_YEAR)
if (max >= PubDate.MIN_YEAR)
return Optional.of(new PubDate(null, max));
else
return Optional.empty();
@ -98,7 +94,7 @@ public class PubDateParser {
return Optional.of(new PubDate(null, guessYear(min, max, guess)));
}
if (max > PubDate.MIN_YEAR)
if (max >= PubDate.MIN_YEAR)
return Optional.of(new PubDate(null, max));
else
return Optional.empty();

View File

@ -21,12 +21,15 @@ public class PubDateSniffer {
heuristics.add(new PubDateHeuristicRDFaTag());
// The more questionable heuristics should be kept below this line
heuristics.add(new PubDateHeuristicUrlPatternPass1());
heuristics.add(new PubDateHeuristicUrlPattern());
heuristics.add(new PubDateHeuristicDOMParsingPass1());
heuristics.add(new PubDateHeuristicHtml5AnyTimeTag());
heuristics.add(new PubDateHeuristicDOMParsing());
heuristics.add(new PubDateHeuristicLastModified());
heuristics.add(new PubDateHeuristicDOMParsingPass2());
heuristics.add(new PubDateHeuristicUrlPatternPass2());
heuristics.add(new PubDateHeuristicLastModified());
// This is complete guesswork
heuristics.add(new PubDateHeuristicGuessFromHtmlStandard());

View File

@ -15,14 +15,14 @@ import org.jsoup.select.NodeFilter;
import java.util.Optional;
public class PubDateHeuristicDOMParsing implements PubDateHeuristic {
public class PubDateHeuristicDOMParsingPass1 implements PubDateHeuristic {
@Override
public Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, EdgeHtmlStandard htmlStandard) {
if (effortLevel == PubDateEffortLevel.LOW)
return Optional.empty();
DateExtractingNodeVisitor filter = new DateExtractingNodeVisitor(htmlStandard);
DateExtractingNodeVisitorPass filter = new DateExtractingNodeVisitorPass(htmlStandard);
document.filter(filter);
@ -30,11 +30,11 @@ public class PubDateHeuristicDOMParsing implements PubDateHeuristic {
}
private static class DateExtractingNodeVisitor implements NodeFilter {
private static class DateExtractingNodeVisitorPass implements NodeFilter {
public PubDate pubDate;
private final EdgeHtmlStandard htmlStandard;
private DateExtractingNodeVisitor(EdgeHtmlStandard htmlStandard) {
private DateExtractingNodeVisitorPass(EdgeHtmlStandard htmlStandard) {
this.htmlStandard = htmlStandard;
}
@ -53,11 +53,12 @@ public class PubDateHeuristicDOMParsing implements PubDateHeuristic {
public void onTextNode(TextNode tn) {
String text = tn.getWholeText();
if (isCandidatForCopyrightNotice(text)) {
if (text.length() < 32 && isCandidatForCopyrightNotice(text)) {
parse(text);
}
}
public void onElementNode(Element el) {
if (hasCommonClass(el)) {
parse(el.text());
@ -89,7 +90,7 @@ public class PubDateHeuristicDOMParsing implements PubDateHeuristic {
|| classes.contains("byline")
|| classes.contains("author")
|| classes.contains("submitted")
|| classes.contains("footer-info-lastmod"); // mediawiki
|| el.id().contains("footer-info-lastmod"); // mediawiki
}
public void tryParsePhpBBDate(Element el) {
@ -144,5 +145,4 @@ public class PubDateHeuristicDOMParsing implements PubDateHeuristic {
}
}

View File

@ -0,0 +1,123 @@
package nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.heuristic;
import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDate;
import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateEffortLevel;
import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateHeuristic;
import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateParser;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard;
import org.jetbrains.annotations.NotNull;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import org.jsoup.select.NodeFilter;
import java.util.Optional;
public class PubDateHeuristicDOMParsingPass2 implements PubDateHeuristic {
/**
 * Runs a {@code DateExtractingNodeVisitor} over the document and returns
 * whatever publication date it recorded, if any.
 *
 * Always yields an empty result at {@code PubDateEffortLevel.LOW}.
 * The {@code headers} and {@code url} arguments are not consulted here.
 */
@Override
public Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, EdgeHtmlStandard htmlStandard) {
    if (PubDateEffortLevel.LOW == effortLevel) {
        return Optional.empty();
    }

    final var visitor = new DateExtractingNodeVisitor(htmlStandard);
    document.filter(visitor);

    // The visitor leaves pubDate null when no text node produced a date
    final PubDate found = visitor.pubDate;
    return Optional.ofNullable(found);
}
/**
 * Node filter that feeds candidate text nodes to {@code PubDateParser} and
 * keeps the first date that parses. Once a date has been recorded the
 * traversal is stopped.
 */
private static class DateExtractingNodeVisitor implements NodeFilter {
    /** The extracted date, or null while nothing has been found. */
    public PubDate pubDate;

    private final EdgeHtmlStandard htmlStandard;

    private DateExtractingNodeVisitor(EdgeHtmlStandard htmlStandard) {
        this.htmlStandard = htmlStandard;
    }

    /** Inspects text nodes on the way down; requests STOP as soon as a date is known. */
    @NotNull
    @Override
    public FilterResult head(@NotNull Node node, int depth) {
        if (node instanceof TextNode textNode) {
            onTextNode(textNode);
        }

        return hasPubDate() ? FilterResult.STOP : FilterResult.CONTINUE;
    }

    /** Nothing to do on the way back up. */
    @NotNull
    @Override
    public FilterResult tail(@NotNull Node node, int depth) {
        return FilterResult.CONTINUE;
    }

    /** Attempts a parse only for text that passes the isPossibleCandidate pre-filter. */
    public void onTextNode(TextNode tn) {
        final String wholeText = tn.getWholeText();

        if (!isPossibleCandidate(wholeText)) {
            return;
        }

        parse(wholeText);
    }

    public boolean hasPubDate() {
        return null != pubDate;
    }

    public void setPubDate(PubDate pubDate) {
        this.pubDate = pubDate;
    }

    /**
     * Looks for a year in the text; when the HTML standard is known, its
     * yearGuess is handed to the parser as well.
     */
    private void parse(String text) {
        final boolean standardKnown = htmlStandard != EdgeHtmlStandard.UNKNOWN;

        if (standardKnown) {
            PubDateParser
                    .dateFromHighestYearLookingSubstringWithGuess(text, htmlStandard.yearGuess)
                    .ifPresent(this::setPubDate);
            return;
        }

        PubDateParser
                .dateFromHighestYearLookingSubstring(text)
                .ifPresent(this::setPubDate);
    }
}
/**
 * Quick pre-filter deciding whether a text snippet is worth handing to the
 * date parser.
 *
 * Returns true when the text is between 4 and 23 characters long and
 * contains a run of exactly four digits that is delimited on each side by
 * either the edge of the string or a break character (see isGoodBreak).
 *
 * This is roughly the regex {@code (^|[ ./\-,])\d{4}([ ./\-,]|$)}, written
 * out by hand because a regex was found to be too slow here.
 */
public static boolean isPossibleCandidate(String text) {
    final int length = text.length();
    if (length < 4 || length >= 24) {
        return false;
    }

    int pos = 0;
    while (pos < length) {
        if (!Character.isDigit(text.charAt(pos))) {
            pos++;
            continue;
        }

        // pos sits on the first digit of a run; advance to one past its end
        final int runStart = pos;
        do {
            pos++;
        } while (pos < length && Character.isDigit(text.charAt(pos)));

        final boolean fourDigits = (pos - runStart) == 4;
        final boolean breakBefore = runStart == 0 || isGoodBreak(text.charAt(runStart - 1));
        final boolean breakAfter = pos == length || isGoodBreak(text.charAt(pos));

        if (fourDigits && breakBefore && breakAfter) {
            return true;
        }
    }

    return false;
}

/**
 * True for '.', '/', '-', ',' and for Unicode space characters.
 * Note that Character.isSpaceChar does not cover control characters such
 * as newline or tab, so those do not count as breaks.
 */
private static boolean isGoodBreak(char c) {
    return switch (c) {
        case '.', '/', '-', ',' -> true;
        default -> Character.isSpaceChar(c);
    };
}
}

View File

@ -21,7 +21,7 @@ public class PubDateHeuristicHtml5AnyTimeTag implements PubDateHeuristic {
return maybeDate;
}
maybeDate = PubDateParser.attemptParseDate(tag.text());
maybeDate = PubDateParser.attemptParseDate(tag.wholeText());
if (maybeDate.isPresent()) {
return maybeDate;
}

View File

@ -0,0 +1,45 @@
package nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.heuristic;
import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDate;
import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateEffortLevel;
import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateHeuristic;
import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateParser;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard;
import org.jsoup.nodes.Document;
import java.util.Optional;
import java.util.OptionalInt;
import java.util.regex.Pattern;
/**
 * First-pass URL heuristic: looks for a four-digit path segment such as
 * {@code /2014/} and accepts it as the publication year when it falls
 * between {@code MIN_URL_PATTERN_YEAR} and {@code PubDate.MAX_YEAR}.
 */
public class PubDateHeuristicUrlPatternPass1 implements PubDateHeuristic {

    private static final Pattern yearUrlPattern = Pattern.compile("/\\d{4}/");

    // False positive rate is much higher in the 1990s, only include 2000s+ in pass 1
    private static final int MIN_URL_PATTERN_YEAR = 2000;

    /**
     * Scans the URL path for year-like segments and returns the first one
     * in the accepted range. Only {@code url} is consulted; the remaining
     * arguments are ignored.
     *
     * @return a PubDate carrying only the year, or empty if no segment qualifies
     */
    @Override
    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, EdgeHtmlStandard htmlStandard) {
        final String urlString = url.path;

        var matcher = yearUrlPattern.matcher(urlString);

        // Each match consumes its trailing slash, which is also the leading
        // slash of the next segment. Resume the search ON that slash
        // (end() - 1) so that adjacent segments like /1999/2005/ are both
        // examined; resuming at end() would skip /2005/ entirely.
        for (int i = 0; i < urlString.length() && matcher.find(i); i = matcher.end() - 1) {

            // Strip the surrounding slashes to get the four digits
            String segment = urlString.substring(matcher.start() + 1, matcher.end() - 1);

            OptionalInt year = PubDateParser.parseYearString(segment);

            if (year.isEmpty())
                continue;

            int y = year.getAsInt();
            if (y >= MIN_URL_PATTERN_YEAR && y <= PubDate.MAX_YEAR) {
                return Optional.of(new PubDate(null, y));
            }
        }

        return Optional.empty();
    }
}

View File

@ -12,7 +12,7 @@ import java.util.Optional;
import java.util.OptionalInt;
import java.util.regex.Pattern;
public class PubDateHeuristicUrlPattern implements PubDateHeuristic {
public class PubDateHeuristicUrlPatternPass2 implements PubDateHeuristic {
private static final Pattern yearUrlPattern = Pattern.compile("/\\d{4}/");

View File

@ -4,13 +4,14 @@ public enum EdgeHtmlStandard {
PLAIN(0, 1, 1993),
UNKNOWN(0, 1, 2000),
HTML123(0, 1, 1997),
HTML4(-0.1, 1.05, 2008),
XHTML(-0.1, 1.05, 2005),
HTML4(-0.1, 1.05, 2006),
XHTML(-0.1, 1.05, 2006),
HTML5(0.5, 1.1, 2018);
public final double offset;
public final double scale;
public int yearGuess;
public final int yearGuess;
EdgeHtmlStandard(double offset, double scale, int yearGuess) {
this.offset = offset;

View File

@ -2,12 +2,16 @@ package nu.marginalia.wmsa.edge.converting.processor.logic;
import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateParser;
import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateSniffer;
import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.heuristic.PubDateHeuristicDOMParsingPass2;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard;
import org.jsoup.Jsoup;
import org.junit.jupiter.api.Test;
import java.io.IOException;
import java.net.URISyntaxException;
import java.nio.file.Files;
import java.nio.file.Path;
import static org.junit.jupiter.api.Assertions.*;
@ -53,6 +57,10 @@ class PubDateSnifferTest {
assertEquals("2018-10-21", ret.get().dateIso8601());
assertEquals(2018, ret.get().year());
ret = PubDateParser.attemptParseDate("July 13, 2006");
assertTrue(ret.isPresent());
assertEquals(2006, ret.get().year());
}
@ -89,6 +97,39 @@ class PubDateSnifferTest {
assertEquals("2022-08-24", ret.dateIso8601());
}
// A <time> element whose datetime attribute and text both read
// "July 13, 2006" should be sniffed as the year 2006.
@Test
public void testHtml5C() throws URISyntaxException {
    final var pageUrl = new EdgeUrl("https://www.example.com/");
    final var page = Jsoup.parse("""
            <!doctype html>
            <html>
            <time class="published" datetime="July 13, 2006">July 13, 2006</time>
            Wow, sure lor 'em boss
            </article>
            """);

    final var sniffed = dateSniffer.getPubDate("", pageUrl, page, EdgeHtmlStandard.UNKNOWN, true);

    assertFalse(sniffed.isEmpty());
    assertEquals(2006, sniffed.year());
}
// Regression check against two saved pages that previously produced wrong
// years. The fixtures are referenced by absolute paths under a developer's
// home directory (presumably not checked in), so the test is skipped rather
// than failed when they are not available on the current machine.
@Test
public void testProblemCases() throws IOException, URISyntaxException {
    final Path switchToLinux = Path.of("/home/vlofgren/Code/tmp-data/The Switch to Linux Begins .html");
    final Path blackHat = Path.of("/home/vlofgren/Code/tmp-data/Black Hat USA 2010 Understanding and Deploying DNSSEC by Paul Wouters and Patrick Nauber.html");

    // Fully qualified to avoid needing an extra import in this file
    org.junit.jupiter.api.Assumptions.assumeTrue(
            Files.isReadable(switchToLinux) && Files.isReadable(blackHat),
            "local fixture files are not available");

    var ret = dateSniffer.getPubDate("",
            new EdgeUrl("https://www.example.com/"),
            Jsoup.parse(Files.readString(switchToLinux)), EdgeHtmlStandard.HTML5, true);
    assertFalse(ret.isEmpty());
    assertEquals(2006, ret.year());

    ret = dateSniffer.getPubDate("",
            new EdgeUrl("https://www.example.com/"),
            Jsoup.parse(Files.readString(blackHat)), EdgeHtmlStandard.XHTML, true);
    assertFalse(ret.isEmpty());
    assertEquals(2010, ret.year());
}
@Test
public void testGuessYear() {
System.out.println(PubDateParser.guessYear(2010, 2020));
@ -168,6 +209,8 @@ class PubDateSnifferTest {
assertFalse(ret.isEmpty());
assertEquals("2022-02-03", ret.dateIso8601());
}
@Test
public void testDOM() throws URISyntaxException {
var ret = dateSniffer.getPubDate("",
@ -183,6 +226,17 @@ class PubDateSnifferTest {
assertEquals(2015, ret.year());
}
// Pins the behaviour of the candidate pre-filter: a run of exactly four
// digits must be bounded by a break character or the edge of the string.
@Test
public void testCandidate() {
    // Year at the end of the text, preceded by a space
    assertTrue(PubDateHeuristicDOMParsingPass2.isPossibleCandidate("(C) 2007"));
    // Year followed by '-'
    assertTrue(PubDateHeuristicDOMParsingPass2.isPossibleCandidate("(C) 2007-01-01"));
    // Year preceded by '.'
    assertTrue(PubDateHeuristicDOMParsingPass2.isPossibleCandidate("(C) 01-01.2007"));
    // '$' is not an accepted break before the digits
    assertFalse(PubDateHeuristicDOMParsingPass2.isPossibleCandidate("Only $1999"));
    // A letter directly after the digits disqualifies the run
    assertFalse(PubDateHeuristicDOMParsingPass2.isPossibleCandidate("1998B"));
    // Year at the start of the text, followed by a space
    assertTrue(PubDateHeuristicDOMParsingPass2.isPossibleCandidate("2010 black hat ™"));
}
@Test
public void testOldInvision() throws URISyntaxException {
var ret = dateSniffer.getPubDate("",