Improved publishing date heuristics
This commit is contained in:
parent
68cde1c3d8
commit
217584126c
@ -6,8 +6,10 @@ import java.time.format.DateTimeFormatter;
|
||||
public record PubDate(String dateIso8601, int year) {
|
||||
|
||||
// First year we'll believe something can have been published on the web
|
||||
// ... Tim Berners Lee's recipe collection or something
|
||||
public static final int MIN_YEAR = 1989;
|
||||
// cut off at 1995 to reduce the false positive error rate; the number of bona fide
|
||||
// documents from earlier years is so small that almost all hits are wrong
|
||||
|
||||
public static final int MIN_YEAR = 1995;
|
||||
|
||||
// Last year we'll believe something can be published in
|
||||
public static final int MAX_YEAR = LocalDate.now().getYear() + 1;
|
||||
|
@ -9,15 +9,11 @@ import java.time.ZonedDateTime;
|
||||
import java.time.format.DateTimeFormatter;
|
||||
import java.util.Optional;
|
||||
import java.util.OptionalInt;
|
||||
import java.util.Random;
|
||||
import java.util.concurrent.ThreadLocalRandom;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
public class PubDateParser {
|
||||
|
||||
// ThreadLocalRandom lacks a few methods we need out of Random
|
||||
private static ThreadLocal<Random> localRandom = ThreadLocal.withInitial(Random::new);
|
||||
|
||||
public static Optional<PubDate> attemptParseDate(String date) {
|
||||
return Optional.ofNullable(date)
|
||||
.filter(str -> str.length() >= 4 && str.length() < 32)
|
||||
@ -66,7 +62,7 @@ public class PubDateParser {
|
||||
return Optional.of(new PubDate(null, guessYear(min, max)));
|
||||
}
|
||||
|
||||
if (max > PubDate.MIN_YEAR)
|
||||
if (max >= PubDate.MIN_YEAR)
|
||||
return Optional.of(new PubDate(null, max));
|
||||
else
|
||||
return Optional.empty();
|
||||
@ -98,7 +94,7 @@ public class PubDateParser {
|
||||
return Optional.of(new PubDate(null, guessYear(min, max, guess)));
|
||||
}
|
||||
|
||||
if (max > PubDate.MIN_YEAR)
|
||||
if (max >= PubDate.MIN_YEAR)
|
||||
return Optional.of(new PubDate(null, max));
|
||||
else
|
||||
return Optional.empty();
|
||||
|
@ -21,12 +21,15 @@ public class PubDateSniffer {
|
||||
heuristics.add(new PubDateHeuristicRDFaTag());
|
||||
|
||||
// The more questionable heuristics should be kept below this line
|
||||
heuristics.add(new PubDateHeuristicUrlPatternPass1());
|
||||
|
||||
heuristics.add(new PubDateHeuristicUrlPattern());
|
||||
heuristics.add(new PubDateHeuristicDOMParsingPass1());
|
||||
heuristics.add(new PubDateHeuristicHtml5AnyTimeTag());
|
||||
heuristics.add(new PubDateHeuristicDOMParsing());
|
||||
heuristics.add(new PubDateHeuristicLastModified());
|
||||
|
||||
heuristics.add(new PubDateHeuristicDOMParsingPass2());
|
||||
heuristics.add(new PubDateHeuristicUrlPatternPass2());
|
||||
|
||||
heuristics.add(new PubDateHeuristicLastModified());
|
||||
// This is complete guesswork
|
||||
|
||||
heuristics.add(new PubDateHeuristicGuessFromHtmlStandard());
|
||||
|
@ -15,14 +15,14 @@ import org.jsoup.select.NodeFilter;
|
||||
|
||||
import java.util.Optional;
|
||||
|
||||
public class PubDateHeuristicDOMParsing implements PubDateHeuristic {
|
||||
public class PubDateHeuristicDOMParsingPass1 implements PubDateHeuristic {
|
||||
|
||||
@Override
|
||||
public Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, EdgeHtmlStandard htmlStandard) {
|
||||
if (effortLevel == PubDateEffortLevel.LOW)
|
||||
return Optional.empty();
|
||||
|
||||
DateExtractingNodeVisitor filter = new DateExtractingNodeVisitor(htmlStandard);
|
||||
DateExtractingNodeVisitorPass filter = new DateExtractingNodeVisitorPass(htmlStandard);
|
||||
|
||||
document.filter(filter);
|
||||
|
||||
@ -30,11 +30,11 @@ public class PubDateHeuristicDOMParsing implements PubDateHeuristic {
|
||||
}
|
||||
|
||||
|
||||
private static class DateExtractingNodeVisitor implements NodeFilter {
|
||||
private static class DateExtractingNodeVisitorPass implements NodeFilter {
|
||||
public PubDate pubDate;
|
||||
private final EdgeHtmlStandard htmlStandard;
|
||||
|
||||
private DateExtractingNodeVisitor(EdgeHtmlStandard htmlStandard) {
|
||||
private DateExtractingNodeVisitorPass(EdgeHtmlStandard htmlStandard) {
|
||||
this.htmlStandard = htmlStandard;
|
||||
}
|
||||
|
||||
@ -53,11 +53,12 @@ public class PubDateHeuristicDOMParsing implements PubDateHeuristic {
|
||||
public void onTextNode(TextNode tn) {
|
||||
String text = tn.getWholeText();
|
||||
|
||||
if (isCandidatForCopyrightNotice(text)) {
|
||||
if (text.length() < 32 && isCandidatForCopyrightNotice(text)) {
|
||||
parse(text);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public void onElementNode(Element el) {
|
||||
if (hasCommonClass(el)) {
|
||||
parse(el.text());
|
||||
@ -89,7 +90,7 @@ public class PubDateHeuristicDOMParsing implements PubDateHeuristic {
|
||||
|| classes.contains("byline")
|
||||
|| classes.contains("author")
|
||||
|| classes.contains("submitted")
|
||||
|| classes.contains("footer-info-lastmod"); // mediawiki
|
||||
|| el.id().contains("footer-info-lastmod"); // mediawiki
|
||||
}
|
||||
|
||||
public void tryParsePhpBBDate(Element el) {
|
||||
@ -144,5 +145,4 @@ public class PubDateHeuristicDOMParsing implements PubDateHeuristic {
|
||||
|
||||
}
|
||||
|
||||
|
||||
}
|
@ -0,0 +1,123 @@
|
||||
package nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.heuristic;
|
||||
|
||||
import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDate;
|
||||
import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateEffortLevel;
|
||||
import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateHeuristic;
|
||||
import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateParser;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeUrl;
|
||||
import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard;
|
||||
import org.jetbrains.annotations.NotNull;
|
||||
import org.jsoup.nodes.Document;
|
||||
import org.jsoup.nodes.Node;
|
||||
import org.jsoup.nodes.TextNode;
|
||||
import org.jsoup.select.NodeFilter;
|
||||
|
||||
import java.util.Optional;
|
||||
|
||||
public class PubDateHeuristicDOMParsingPass2 implements PubDateHeuristic {
|
||||
|
||||
@Override
|
||||
public Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, EdgeHtmlStandard htmlStandard) {
|
||||
if (effortLevel == PubDateEffortLevel.LOW)
|
||||
return Optional.empty();
|
||||
|
||||
DateExtractingNodeVisitor filter = new DateExtractingNodeVisitor(htmlStandard);
|
||||
|
||||
document.filter(filter);
|
||||
|
||||
return Optional.ofNullable(filter.pubDate);
|
||||
}
|
||||
|
||||
|
||||
private static class DateExtractingNodeVisitor implements NodeFilter {
|
||||
public PubDate pubDate;
|
||||
private final EdgeHtmlStandard htmlStandard;
|
||||
|
||||
private DateExtractingNodeVisitor(EdgeHtmlStandard htmlStandard) {
|
||||
this.htmlStandard = htmlStandard;
|
||||
}
|
||||
|
||||
@NotNull
|
||||
@Override
|
||||
public FilterResult head(@NotNull Node node, int depth) {
|
||||
if (node instanceof TextNode tn) onTextNode(tn);
|
||||
|
||||
if (hasPubDate()) {
|
||||
return FilterResult.STOP;
|
||||
}
|
||||
return FilterResult.CONTINUE;
|
||||
}
|
||||
|
||||
public void onTextNode(TextNode tn) {
|
||||
String text = tn.getWholeText();
|
||||
|
||||
if (isPossibleCandidate(text)) {
|
||||
parse(text);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public boolean hasPubDate() {
|
||||
return pubDate != null;
|
||||
}
|
||||
public void setPubDate(PubDate pubDate) {
|
||||
this.pubDate = pubDate;
|
||||
}
|
||||
|
||||
@NotNull
|
||||
@Override
|
||||
public FilterResult tail(@NotNull Node node, int depth) {
|
||||
return FilterResult.CONTINUE;
|
||||
}
|
||||
|
||||
private void parse(String text) {
|
||||
if (htmlStandard == EdgeHtmlStandard.UNKNOWN) {
|
||||
PubDateParser
|
||||
.dateFromHighestYearLookingSubstring(text)
|
||||
.ifPresent(this::setPubDate);
|
||||
}
|
||||
else {
|
||||
PubDateParser
|
||||
.dateFromHighestYearLookingSubstringWithGuess(text, htmlStandard.yearGuess)
|
||||
.ifPresent(this::setPubDate);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
// This is basically the regex (^|[ ./\-])(\d{4})([ ./\-]$), but
|
||||
// unchecked regexes are too slow
|
||||
|
||||
public static boolean isPossibleCandidate(String text) {
|
||||
if (text.length() >= 4 && text.length() < 24) {
|
||||
int ct = 0;
|
||||
char prevC = ' ';
|
||||
boolean goodStart = true;
|
||||
for (int i = 0; i < text.length(); i++) {
|
||||
char c = text.charAt(i);
|
||||
if (Character.isDigit(c)) {
|
||||
if (ct++ == 0) {
|
||||
goodStart = isGoodBreak(prevC);
|
||||
}
|
||||
}
|
||||
else {
|
||||
if (ct == 4 && goodStart && isGoodBreak(c)) return true;
|
||||
else {
|
||||
ct = 0;
|
||||
}
|
||||
}
|
||||
prevC = c;
|
||||
}
|
||||
|
||||
if (ct == 4 && goodStart)
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
private static boolean isGoodBreak(char c) {
|
||||
return "./-,".indexOf(c) >= 0 || Character.isSpaceChar(c);
|
||||
}
|
||||
|
||||
}
|
@ -21,7 +21,7 @@ public class PubDateHeuristicHtml5AnyTimeTag implements PubDateHeuristic {
|
||||
return maybeDate;
|
||||
}
|
||||
|
||||
maybeDate = PubDateParser.attemptParseDate(tag.text());
|
||||
maybeDate = PubDateParser.attemptParseDate(tag.wholeText());
|
||||
if (maybeDate.isPresent()) {
|
||||
return maybeDate;
|
||||
}
|
||||
|
@ -0,0 +1,45 @@
|
||||
package nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.heuristic;
|
||||
|
||||
import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDate;
|
||||
import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateEffortLevel;
|
||||
import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateHeuristic;
|
||||
import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateParser;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeUrl;
|
||||
import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard;
|
||||
import org.jsoup.nodes.Document;
|
||||
|
||||
import java.util.Optional;
|
||||
import java.util.OptionalInt;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
public class PubDateHeuristicUrlPatternPass1 implements PubDateHeuristic {
|
||||
|
||||
private static final Pattern yearUrlPattern = Pattern.compile("/\\d{4}/");
|
||||
|
||||
// False positive rate is much higher in the 1990s, only include 2000s+ in pass 1
|
||||
private static final int MIN_URL_PATTERN_YEAR = 2000;
|
||||
|
||||
@Override
|
||||
public Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, EdgeHtmlStandard htmlStandard) {
|
||||
final String urlString = url.path;
|
||||
|
||||
var matcher = yearUrlPattern.matcher(urlString);
|
||||
|
||||
for (int i = 0; i < urlString.length() && matcher.find(i); i = matcher.end()) {
|
||||
|
||||
String segment = urlString.substring(matcher.start() + 1, matcher.end() - 1);
|
||||
|
||||
OptionalInt year = PubDateParser.parseYearString(segment);
|
||||
|
||||
if (year.isEmpty())
|
||||
continue;
|
||||
|
||||
int y = year.getAsInt();
|
||||
if (y >= MIN_URL_PATTERN_YEAR && y <= PubDate.MAX_YEAR) {
|
||||
return Optional.of(new PubDate(null, y));
|
||||
}
|
||||
}
|
||||
|
||||
return Optional.empty();
|
||||
}
|
||||
}
|
@ -12,7 +12,7 @@ import java.util.Optional;
|
||||
import java.util.OptionalInt;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
public class PubDateHeuristicUrlPattern implements PubDateHeuristic {
|
||||
public class PubDateHeuristicUrlPatternPass2 implements PubDateHeuristic {
|
||||
|
||||
private static final Pattern yearUrlPattern = Pattern.compile("/\\d{4}/");
|
||||
|
@ -4,13 +4,14 @@ public enum EdgeHtmlStandard {
|
||||
PLAIN(0, 1, 1993),
|
||||
UNKNOWN(0, 1, 2000),
|
||||
HTML123(0, 1, 1997),
|
||||
HTML4(-0.1, 1.05, 2008),
|
||||
XHTML(-0.1, 1.05, 2005),
|
||||
HTML4(-0.1, 1.05, 2006),
|
||||
XHTML(-0.1, 1.05, 2006),
|
||||
HTML5(0.5, 1.1, 2018);
|
||||
|
||||
public final double offset;
|
||||
public final double scale;
|
||||
public int yearGuess;
|
||||
|
||||
public final int yearGuess;
|
||||
|
||||
EdgeHtmlStandard(double offset, double scale, int yearGuess) {
|
||||
this.offset = offset;
|
||||
|
@ -2,12 +2,16 @@ package nu.marginalia.wmsa.edge.converting.processor.logic;
|
||||
|
||||
import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateParser;
|
||||
import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateSniffer;
|
||||
import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.heuristic.PubDateHeuristicDOMParsingPass2;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeUrl;
|
||||
import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard;
|
||||
import org.jsoup.Jsoup;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.net.URISyntaxException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.*;
|
||||
|
||||
@ -53,6 +57,10 @@ class PubDateSnifferTest {
|
||||
assertEquals("2018-10-21", ret.get().dateIso8601());
|
||||
assertEquals(2018, ret.get().year());
|
||||
|
||||
ret = PubDateParser.attemptParseDate("July 13, 2006");
|
||||
assertTrue(ret.isPresent());
|
||||
assertEquals(2006, ret.get().year());
|
||||
|
||||
}
|
||||
|
||||
|
||||
@ -89,6 +97,39 @@ class PubDateSnifferTest {
|
||||
assertEquals("2022-08-24", ret.dateIso8601());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testHtml5C() throws URISyntaxException {
|
||||
var ret = dateSniffer.getPubDate("",
|
||||
new EdgeUrl("https://www.example.com/"),
|
||||
Jsoup.parse("""
|
||||
<!doctype html>
|
||||
<html>
|
||||
<time class="published" datetime="July 13, 2006">July 13, 2006</time>
|
||||
Wow, sure lor 'em boss
|
||||
</article>
|
||||
"""), EdgeHtmlStandard.UNKNOWN, true);
|
||||
|
||||
assertFalse(ret.isEmpty());
|
||||
assertEquals(2006, ret.year());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testProblemCases() throws IOException, URISyntaxException {
|
||||
var ret = dateSniffer.getPubDate("",
|
||||
new EdgeUrl("https://www.example.com/"),
|
||||
Jsoup.parse(Files.readString(Path.of("/home/vlofgren/Code/tmp-data/The Switch to Linux Begins .html"))), EdgeHtmlStandard.HTML5, true);
|
||||
|
||||
assertFalse(ret.isEmpty());
|
||||
assertEquals(2006, ret.year());
|
||||
|
||||
ret = dateSniffer.getPubDate("",
|
||||
new EdgeUrl("https://www.example.com/"),
|
||||
Jsoup.parse(Files.readString(Path.of("/home/vlofgren/Code/tmp-data/Black Hat USA 2010 Understanding and Deploying DNSSEC by Paul Wouters and Patrick Nauber.html"))), EdgeHtmlStandard.XHTML, true);
|
||||
|
||||
assertFalse(ret.isEmpty());
|
||||
assertEquals(2010, ret.year());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testGuessYear() {
|
||||
System.out.println(PubDateParser.guessYear(2010, 2020));
|
||||
@ -168,6 +209,8 @@ class PubDateSnifferTest {
|
||||
assertFalse(ret.isEmpty());
|
||||
assertEquals("2022-02-03", ret.dateIso8601());
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testDOM() throws URISyntaxException {
|
||||
var ret = dateSniffer.getPubDate("",
|
||||
@ -183,6 +226,17 @@ class PubDateSnifferTest {
|
||||
assertEquals(2015, ret.year());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testCandidate() {
|
||||
System.out.println(PubDateHeuristicDOMParsingPass2.isPossibleCandidate("(C) 2007"));
|
||||
System.out.println(PubDateHeuristicDOMParsingPass2.isPossibleCandidate("(C) 2007-01-01"));
|
||||
System.out.println(PubDateHeuristicDOMParsingPass2.isPossibleCandidate("(C) 01-01.2007"));
|
||||
System.out.println(PubDateHeuristicDOMParsingPass2.isPossibleCandidate("Only $1999"));
|
||||
System.out.println(PubDateHeuristicDOMParsingPass2.isPossibleCandidate("1998B"));
|
||||
System.out.println(PubDateHeuristicDOMParsingPass2.isPossibleCandidate("1998B"));
|
||||
System.out.println(PubDateHeuristicDOMParsingPass2.isPossibleCandidate("2010 black hat ™"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testOldInvision() throws URISyntaxException {
|
||||
var ret = dateSniffer.getPubDate("",
|
||||
|
Loading…
Reference in New Issue
Block a user