Improved publishing date heuristics
commit 217584126c (parent 68cde1c3d8)
@@ -6,8 +6,10 @@ import java.time.format.DateTimeFormatter;
 public record PubDate(String dateIso8601, int year) {
 
     // First year we'll believe something can have been published on the web
-    // ... Tim Berners Lee's recipe collection or something
-    public static final int MIN_YEAR = 1989;
+    // cut off at 1995 to reduce false positive error rate; number of bona fide
+    // documents from these years are so few almost all hits are wrong
+    public static final int MIN_YEAR = 1995;
 
     // Last year we'll believe something can be published in
     public static final int MAX_YEAR = LocalDate.now().getYear() + 1;
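Not part of the commit: a minimal sketch of how these two bounds are presumably meant to gate a candidate year before it is accepted. The real checks live in PubDateParser; plausibleYear is a hypothetical helper named here only for illustration.

    // Illustration only: hypothetical helper showing the intended use of the bounds.
    static boolean plausibleYear(int year) {
        return year >= PubDate.MIN_YEAR && year <= PubDate.MAX_YEAR;
    }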
@@ -9,15 +9,11 @@ import java.time.ZonedDateTime;
 import java.time.format.DateTimeFormatter;
 import java.util.Optional;
 import java.util.OptionalInt;
-import java.util.Random;
 import java.util.concurrent.ThreadLocalRandom;
 import java.util.regex.Pattern;
 
 public class PubDateParser {
 
-    // ThreadLocalRandom lacks a few methods we need out of Random
-    private static ThreadLocal<Random> localRandom = ThreadLocal.withInitial(Random::new);
-
     public static Optional<PubDate> attemptParseDate(String date) {
         return Optional.ofNullable(date)
                 .filter(str -> str.length() >= 4 && str.length() < 32)
@@ -66,7 +62,7 @@ public class PubDateParser {
             return Optional.of(new PubDate(null, guessYear(min, max)));
         }
 
-        if (max > PubDate.MIN_YEAR)
+        if (max >= PubDate.MIN_YEAR)
             return Optional.of(new PubDate(null, max));
         else
             return Optional.empty();
@@ -98,7 +94,7 @@ public class PubDateParser {
             return Optional.of(new PubDate(null, guessYear(min, max, guess)));
         }
 
-        if (max > PubDate.MIN_YEAR)
+        if (max >= PubDate.MIN_YEAR)
             return Optional.of(new PubDate(null, max));
         else
             return Optional.empty();
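The switch from > to >= matters exactly at the new cutoff: with MIN_YEAR raised to 1995, a page whose strongest year evidence is precisely 1995 would otherwise be discarded. A sketch of the boundary case, for illustration only and not code from the commit:

    int max = 1995;                            // strongest year evidence found on the page
    boolean before = max >  PubDate.MIN_YEAR;  // false: the boundary year is rejected
    boolean after  = max >= PubDate.MIN_YEAR;  // true: the boundary year is accepted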
@@ -21,12 +21,15 @@ public class PubDateSniffer {
         heuristics.add(new PubDateHeuristicRDFaTag());
 
         // The more questionable heuristics should be kept below this line
-        heuristics.add(new PubDateHeuristicUrlPattern());
+        heuristics.add(new PubDateHeuristicUrlPatternPass1());
+
+        heuristics.add(new PubDateHeuristicDOMParsingPass1());
         heuristics.add(new PubDateHeuristicHtml5AnyTimeTag());
-        heuristics.add(new PubDateHeuristicDOMParsing());
-        heuristics.add(new PubDateHeuristicLastModified());
 
+        heuristics.add(new PubDateHeuristicDOMParsingPass2());
+        heuristics.add(new PubDateHeuristicUrlPatternPass2());
+
+        heuristics.add(new PubDateHeuristicLastModified());
         // This is complete guesswork
 
         heuristics.add(new PubDateHeuristicGuessFromHtmlStandard());
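The ordering above matters because the heuristics are presumably tried in list order with the first non-empty result winning: the precise tag-based passes sit above the line, the riskier URL and DOM passes come next in two stages, and the outright guesswork runs last. A minimal sketch of that assumed dispatch shape (the actual loop lives in PubDateSniffer, and this is not code from the commit):

    // Assumed shape of the dispatch loop, for illustration only.
    for (PubDateHeuristic heuristic : heuristics) {
        Optional<PubDate> maybe = heuristic.apply(effortLevel, headers, url, document, htmlStandard);
        if (maybe.isPresent())
            return maybe.get();
    }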
@@ -15,14 +15,14 @@ import org.jsoup.select.NodeFilter;
 
 import java.util.Optional;
 
-public class PubDateHeuristicDOMParsing implements PubDateHeuristic {
+public class PubDateHeuristicDOMParsingPass1 implements PubDateHeuristic {
 
     @Override
     public Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, EdgeHtmlStandard htmlStandard) {
         if (effortLevel == PubDateEffortLevel.LOW)
             return Optional.empty();
 
-        DateExtractingNodeVisitor filter = new DateExtractingNodeVisitor(htmlStandard);
+        DateExtractingNodeVisitorPass filter = new DateExtractingNodeVisitorPass(htmlStandard);
 
         document.filter(filter);
 
@@ -30,11 +30,11 @@ public class PubDateHeuristicDOMParsing implements PubDateHeuristic {
     }
 
 
-    private static class DateExtractingNodeVisitor implements NodeFilter {
+    private static class DateExtractingNodeVisitorPass implements NodeFilter {
         public PubDate pubDate;
         private final EdgeHtmlStandard htmlStandard;
 
-        private DateExtractingNodeVisitor(EdgeHtmlStandard htmlStandard) {
+        private DateExtractingNodeVisitorPass(EdgeHtmlStandard htmlStandard) {
             this.htmlStandard = htmlStandard;
         }
 
@@ -53,11 +53,12 @@ public class PubDateHeuristicDOMParsing implements PubDateHeuristic {
         public void onTextNode(TextNode tn) {
             String text = tn.getWholeText();
 
-            if (isCandidatForCopyrightNotice(text)) {
+            if (text.length() < 32 && isCandidatForCopyrightNotice(text)) {
                 parse(text);
             }
         }
 
+
         public void onElementNode(Element el) {
             if (hasCommonClass(el)) {
                 parse(el.text());
@@ -89,7 +90,7 @@ public class PubDateHeuristicDOMParsing implements PubDateHeuristic {
                     || classes.contains("byline")
                     || classes.contains("author")
                     || classes.contains("submitted")
-                    || classes.contains("footer-info-lastmod"); // mediawiki
+                    || el.id().contains("footer-info-lastmod"); // mediawiki
         }
 
         public void tryParsePhpBBDate(Element el) {
@@ -144,5 +145,4 @@ public class PubDateHeuristicDOMParsing implements PubDateHeuristic {
 
     }
 
-
 }
@@ -0,0 +1,123 @@
+package nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.heuristic;
+
+import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDate;
+import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateEffortLevel;
+import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateHeuristic;
+import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateParser;
+import nu.marginalia.wmsa.edge.model.EdgeUrl;
+import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard;
+import org.jetbrains.annotations.NotNull;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Node;
+import org.jsoup.nodes.TextNode;
+import org.jsoup.select.NodeFilter;
+
+import java.util.Optional;
+
+public class PubDateHeuristicDOMParsingPass2 implements PubDateHeuristic {
+
+    @Override
+    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, EdgeHtmlStandard htmlStandard) {
+        if (effortLevel == PubDateEffortLevel.LOW)
+            return Optional.empty();
+
+        DateExtractingNodeVisitor filter = new DateExtractingNodeVisitor(htmlStandard);
+
+        document.filter(filter);
+
+        return Optional.ofNullable(filter.pubDate);
+    }
+
+
+    private static class DateExtractingNodeVisitor implements NodeFilter {
+        public PubDate pubDate;
+        private final EdgeHtmlStandard htmlStandard;
+
+        private DateExtractingNodeVisitor(EdgeHtmlStandard htmlStandard) {
+            this.htmlStandard = htmlStandard;
+        }
+
+        @NotNull
+        @Override
+        public FilterResult head(@NotNull Node node, int depth) {
+            if (node instanceof TextNode tn) onTextNode(tn);
+
+            if (hasPubDate()) {
+                return FilterResult.STOP;
+            }
+            return FilterResult.CONTINUE;
+        }
+
+        public void onTextNode(TextNode tn) {
+            String text = tn.getWholeText();
+
+            if (isPossibleCandidate(text)) {
+                parse(text);
+            }
+        }
+
+
+        public boolean hasPubDate() {
+            return pubDate != null;
+        }
+        public void setPubDate(PubDate pubDate) {
+            this.pubDate = pubDate;
+        }
+
+        @NotNull
+        @Override
+        public FilterResult tail(@NotNull Node node, int depth) {
+            return FilterResult.CONTINUE;
+        }
+
+        private void parse(String text) {
+            if (htmlStandard == EdgeHtmlStandard.UNKNOWN) {
+                PubDateParser
+                        .dateFromHighestYearLookingSubstring(text)
+                        .ifPresent(this::setPubDate);
+            }
+            else {
+                PubDateParser
+                        .dateFromHighestYearLookingSubstringWithGuess(text, htmlStandard.yearGuess)
+                        .ifPresent(this::setPubDate);
+            }
+        }
+
+
+    }
+
+    // This is basically the regex (^|[ ./\-])(\d{4})([ ./\-]$), but
+    // unchecked regexes are too slow
+
+    public static boolean isPossibleCandidate(String text) {
+        if (text.length() >= 4 && text.length() < 24) {
+            int ct = 0;
+            char prevC = ' ';
+            boolean goodStart = true;
+            for (int i = 0; i < text.length(); i++) {
+                char c = text.charAt(i);
+                if (Character.isDigit(c)) {
+                    if (ct++ == 0) {
+                        goodStart = isGoodBreak(prevC);
+                    }
+                }
+                else {
+                    if (ct == 4 && goodStart && isGoodBreak(c)) return true;
+                    else {
+                        ct = 0;
+                    }
+                }
+                prevC = c;
+            }
+
+            if (ct == 4 && goodStart)
+                return true;
+        }
+        return false;
+    }
+
+    private static boolean isGoodBreak(char c) {
+        return "./-,".indexOf(c) >= 0 || Character.isSpaceChar(c);
+    }
+
+}
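The comment in the new file describes isPossibleCandidate() as a hand-rolled stand-in for a regex. For comparison only, here is a regex-based equivalent under the assumption that the intended pattern is "a run of exactly four digits delimited by the string boundaries or one of space, '.', '/', '-', ','"; the scanner above avoids compiling and running such a pattern on every text node. The class name is made up for this sketch.

    import java.util.regex.Pattern;

    class YearCandidateRegexSketch {
        private static final Pattern YEAR_CANDIDATE =
                Pattern.compile("(?:^|[ ./\\-,])(\\d{4})(?:$|[ ./\\-,])");

        // Slower but equivalent-in-spirit check; illustration only.
        static boolean isPossibleCandidate(String text) {
            return text.length() >= 4 && text.length() < 24
                    && YEAR_CANDIDATE.matcher(text).find();
        }
    }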
@@ -21,7 +21,7 @@ public class PubDateHeuristicHtml5AnyTimeTag implements PubDateHeuristic {
                 return maybeDate;
             }
 
-            maybeDate = PubDateParser.attemptParseDate(tag.text());
+            maybeDate = PubDateParser.attemptParseDate(tag.wholeText());
             if (maybeDate.isPresent()) {
                 return maybeDate;
             }
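For context on the one-line change above: in jsoup, Element.text() returns normalized, whitespace-collapsed text while Element.wholeText() preserves the text content as written, which is presumably why the parser is now handed the unnormalized form. A small illustration, not part of the commit:

    var time = Jsoup.parse("<time datetime=\"2006-07-13\">July 13,\n2006</time>")
            .selectFirst("time");
    System.out.println(time.text());      // "July 13, 2006"  (normalized)
    System.out.println(time.wholeText()); // "July 13,\n2006" (raw, as in the markup)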
@@ -0,0 +1,45 @@
+package nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.heuristic;
+
+import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDate;
+import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateEffortLevel;
+import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateHeuristic;
+import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateParser;
+import nu.marginalia.wmsa.edge.model.EdgeUrl;
+import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard;
+import org.jsoup.nodes.Document;
+
+import java.util.Optional;
+import java.util.OptionalInt;
+import java.util.regex.Pattern;
+
+public class PubDateHeuristicUrlPatternPass1 implements PubDateHeuristic {
+
+    private static final Pattern yearUrlPattern = Pattern.compile("/\\d{4}/");
+
+    // False positive rate is much higher in the 1990s, only include 2000s+ in pass 1
+    private static final int MIN_URL_PATTERN_YEAR = 2000;
+
+    @Override
+    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, EdgeHtmlStandard htmlStandard) {
+        final String urlString = url.path;
+
+        var matcher = yearUrlPattern.matcher(urlString);
+
+        for (int i = 0; i < urlString.length() && matcher.find(i); i = matcher.end()) {
+
+            String segment = urlString.substring(matcher.start() + 1, matcher.end() - 1);
+
+            OptionalInt year = PubDateParser.parseYearString(segment);
+
+            if (year.isEmpty())
+                continue;
+
+            int y = year.getAsInt();
+            if (y >= MIN_URL_PATTERN_YEAR && y <= PubDate.MAX_YEAR) {
+                return Optional.of(new PubDate(null, y));
+            }
+        }
+
+        return Optional.empty();
+    }
+}
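Not part of the commit: a self-contained sketch of the year-segment scan above, useful for seeing which paths pass 1 accepts and which it leaves for pass 2. The class and method names are made up for illustration; the real code defers the digit parsing and bounds to PubDateParser and PubDate.

    import java.util.regex.Matcher;
    import java.util.regex.Pattern;

    class UrlYearSketch {
        private static final Pattern YEAR_SEGMENT = Pattern.compile("/(\\d{4})/");

        // Returns the first year-looking path segment within [minYear, maxYear], or -1.
        static int firstYearSegment(String path, int minYear, int maxYear) {
            Matcher m = YEAR_SEGMENT.matcher(path);
            for (int i = 0; i < path.length() && m.find(i); i = m.end()) {
                int year = Integer.parseInt(m.group(1));
                if (year >= minYear && year <= maxYear)
                    return year;
            }
            return -1;
        }

        public static void main(String[] args) {
            System.out.println(firstYearSegment("/blog/2014/06/hello.html", 2000, 2024));  // 2014
            System.out.println(firstYearSegment("/archive/1997/index.html", 2000, 2024));  // -1, left for pass 2
        }
    }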
@@ -12,7 +12,7 @@ import java.util.Optional;
 import java.util.OptionalInt;
 import java.util.regex.Pattern;
 
-public class PubDateHeuristicUrlPattern implements PubDateHeuristic {
+public class PubDateHeuristicUrlPatternPass2 implements PubDateHeuristic {
 
     private static final Pattern yearUrlPattern = Pattern.compile("/\\d{4}/");
 
@@ -4,13 +4,14 @@ public enum EdgeHtmlStandard {
     PLAIN(0, 1, 1993),
     UNKNOWN(0, 1, 2000),
     HTML123(0, 1, 1997),
-    HTML4(-0.1, 1.05, 2008),
-    XHTML(-0.1, 1.05, 2005),
+    HTML4(-0.1, 1.05, 2006),
+    XHTML(-0.1, 1.05, 2006),
     HTML5(0.5, 1.1, 2018);
 
     public final double offset;
     public final double scale;
-    public int yearGuess;
+
+    public final int yearGuess;
 
     EdgeHtmlStandard(double offset, double scale, int yearGuess) {
         this.offset = offset;
@@ -2,12 +2,16 @@ package nu.marginalia.wmsa.edge.converting.processor.logic;
 
 import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateParser;
 import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateSniffer;
+import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.heuristic.PubDateHeuristicDOMParsingPass2;
 import nu.marginalia.wmsa.edge.model.EdgeUrl;
 import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard;
 import org.jsoup.Jsoup;
 import org.junit.jupiter.api.Test;
 
+import java.io.IOException;
 import java.net.URISyntaxException;
+import java.nio.file.Files;
+import java.nio.file.Path;
 
 import static org.junit.jupiter.api.Assertions.*;
 
@@ -53,6 +57,10 @@ class PubDateSnifferTest {
         assertEquals("2018-10-21", ret.get().dateIso8601());
         assertEquals(2018, ret.get().year());
 
+        ret = PubDateParser.attemptParseDate("July 13, 2006");
+        assertTrue(ret.isPresent());
+        assertEquals(2006, ret.get().year());
+
     }
 
 
@@ -89,6 +97,39 @@ class PubDateSnifferTest {
         assertEquals("2022-08-24", ret.dateIso8601());
     }
 
+    @Test
+    public void testHtml5C() throws URISyntaxException {
+        var ret = dateSniffer.getPubDate("",
+                new EdgeUrl("https://www.example.com/"),
+                Jsoup.parse("""
+                        <!doctype html>
+                        <html>
+                        <time class="published" datetime="July 13, 2006">July 13, 2006</time>
+                        Wow, sure lor 'em boss
+                        </article>
+                        """), EdgeHtmlStandard.UNKNOWN, true);
+
+        assertFalse(ret.isEmpty());
+        assertEquals(2006, ret.year());
+    }
+
+    @Test
+    public void testProblemCases() throws IOException, URISyntaxException {
+        var ret = dateSniffer.getPubDate("",
+                new EdgeUrl("https://www.example.com/"),
+                Jsoup.parse(Files.readString(Path.of("/home/vlofgren/Code/tmp-data/The Switch to Linux Begins .html"))), EdgeHtmlStandard.HTML5, true);
+
+        assertFalse(ret.isEmpty());
+        assertEquals(2006, ret.year());
+
+        ret = dateSniffer.getPubDate("",
+                new EdgeUrl("https://www.example.com/"),
+                Jsoup.parse(Files.readString(Path.of("/home/vlofgren/Code/tmp-data/Black Hat USA 2010 Understanding and Deploying DNSSEC by Paul Wouters and Patrick Nauber.html"))), EdgeHtmlStandard.XHTML, true);
+
+        assertFalse(ret.isEmpty());
+        assertEquals(2010, ret.year());
+    }
+
     @Test
     public void testGuessYear() {
         System.out.println(PubDateParser.guessYear(2010, 2020));
@@ -168,6 +209,8 @@ class PubDateSnifferTest {
         assertFalse(ret.isEmpty());
         assertEquals("2022-02-03", ret.dateIso8601());
     }
 
+
+
     @Test
     public void testDOM() throws URISyntaxException {
         var ret = dateSniffer.getPubDate("",
@@ -183,6 +226,17 @@ class PubDateSnifferTest {
         assertEquals(2015, ret.year());
     }
 
+    @Test
+    public void testCandidate() {
+        System.out.println(PubDateHeuristicDOMParsingPass2.isPossibleCandidate("(C) 2007"));
+        System.out.println(PubDateHeuristicDOMParsingPass2.isPossibleCandidate("(C) 2007-01-01"));
+        System.out.println(PubDateHeuristicDOMParsingPass2.isPossibleCandidate("(C) 01-01.2007"));
+        System.out.println(PubDateHeuristicDOMParsingPass2.isPossibleCandidate("Only $1999"));
+        System.out.println(PubDateHeuristicDOMParsingPass2.isPossibleCandidate("1998B"));
+        System.out.println(PubDateHeuristicDOMParsingPass2.isPossibleCandidate("1998B"));
+        System.out.println(PubDateHeuristicDOMParsingPass2.isPossibleCandidate("2010 black hat ™"));
+    }
+
     @Test
     public void testOldInvision() throws URISyntaxException {
         var ret = dateSniffer.getPubDate("",