diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/ParquetSerializableCrawlDataStream.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/ParquetSerializableCrawlDataStream.java index 94fafe29..a5fa2d0d 100644 --- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/ParquetSerializableCrawlDataStream.java +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/ParquetSerializableCrawlDataStream.java @@ -103,6 +103,8 @@ public class ParquetSerializableCrawlDataStream implements AutoCloseable, Serial )); } + private CrawledDocumentParquetRecord previousRecord = null; + private void createDocumentRecord(CrawledDocumentParquetRecord nextRecord) { String bodyString = ""; CrawlerDocumentStatus status = CrawlerDocumentStatus.OK; @@ -130,6 +132,24 @@ public class ParquetSerializableCrawlDataStream implements AutoCloseable, Serial status = CrawlerDocumentStatus.ERROR; } + String etag = nextRecord.etagHeader; + String lastModified = nextRecord.lastModifiedHeader; + + // If we have a previous record for the same URL, and it was a 304, and this one is a 200, we'll use the ETag and Last-Modified + // from the previous record, as it's not guaranteed the reference copy will have the same headers due to a bug + // in the crawler. The bug is fixed, but we still need to support old crawls. + // + // This was added on 2024-01-18, so we can remove it in a few months. 
+ + if (previousRecord != null + && previousRecord.url.equals(nextRecord.url) + && previousRecord.httpStatus == 304 + && nextRecord.httpStatus == 200) + { + etag = previousRecord.etagHeader; + lastModified = previousRecord.lastModifiedHeader; + } + nextQ.add(new CrawledDocument("", nextRecord.url, nextRecord.contentType, @@ -144,11 +164,14 @@ public class ParquetSerializableCrawlDataStream implements AutoCloseable, Serial null, "", nextRecord.cookies, - nextRecord.lastModifiedHeader, - nextRecord.etagHeader)); + lastModified, + etag)); + + previousRecord = nextRecord; } public void close() throws IOException { + previousRecord = null; } @Override diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcRecorder.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcRecorder.java index 9bd14ab6..aa9837cf 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcRecorder.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcRecorder.java @@ -2,7 +2,6 @@ package nu.marginalia.crawl.retreival.fetcher.warc; import nu.marginalia.crawl.retreival.DomainProber; import nu.marginalia.crawl.retreival.fetcher.ContentTags; -import nu.marginalia.crawl.retreival.revisit.DocumentWithReference; import nu.marginalia.crawling.body.HttpFetchResult; import nu.marginalia.crawl.retreival.fetcher.socket.IpInterceptingNetworkInterceptor; import nu.marginalia.model.EdgeDomain; @@ -255,15 +254,6 @@ public class WarcRecorder implements AutoCloseable { } } - /** - * Flag the given URL as skipped by the crawler, so that it will not be retried. - * Which URLs were skipped is still important when resynchronizing on the WARC file, - * so that the crawler can avoid re-fetching them. 
- */ - public void flagAsSkipped(EdgeUrl url, String contentType, int statusCode, String documentBody) { - saveOldResponse(url, contentType, statusCode, documentBody, ContentTags.empty()); - } - /** * Write a reference copy of the given document data. This is used when the crawler provides * an E-Tag or Last-Modified header, and the server responds with a 304 Not Modified. In this diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/revisit/CrawlerRevisitor.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/revisit/CrawlerRevisitor.java index c0577925..a21a06df 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/revisit/CrawlerRevisitor.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/revisit/CrawlerRevisitor.java @@ -5,6 +5,7 @@ import nu.marginalia.crawl.retreival.CrawlDataReference; import nu.marginalia.crawl.retreival.CrawlDelayTimer; import nu.marginalia.crawl.retreival.CrawlerRetreiver; import nu.marginalia.crawl.retreival.DomainCrawlFrontier; +import nu.marginalia.crawl.retreival.fetcher.ContentTags; import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder; import nu.marginalia.crawling.model.CrawledDocument; import nu.marginalia.model.EdgeUrl; @@ -84,7 +85,12 @@ public class CrawlerRevisitor { } // Add a WARC record so we don't repeat this - warcRecorder.flagAsSkipped(url, doc.contentType, doc.httpStatus, doc.documentBody); + warcRecorder.writeReferenceCopy(url, + doc.contentType, + doc.httpStatus, + doc.documentBody, + new ContentTags(doc.etagMaybe, doc.lastModifiedMaybe) + ); } else { // GET the document with the stored document as a reference diff --git a/code/processes/crawling-process/src/test/java/nu/marginalia/crawl/retreival/fetcher/WarcRecorderTest.java b/code/processes/crawling-process/src/test/java/nu/marginalia/crawl/retreival/fetcher/WarcRecorderTest.java index b0385182..206bf798 100644 --- 
a/code/processes/crawling-process/src/test/java/nu/marginalia/crawl/retreival/fetcher/WarcRecorderTest.java +++ b/code/processes/crawling-process/src/test/java/nu/marginalia/crawl/retreival/fetcher/WarcRecorderTest.java @@ -73,10 +73,11 @@ class WarcRecorderTest { public void flagAsSkipped() throws IOException, URISyntaxException { try (var recorder = new WarcRecorder(fileNameWarc)) { - recorder.flagAsSkipped(new EdgeUrl("https://www.marginalia.nu/"), + recorder.writeReferenceCopy(new EdgeUrl("https://www.marginalia.nu/"), "text/html", 200, - "test"); + "test", + ContentTags.empty()); } try (var reader = new WarcReader(fileNameWarc)) { @@ -95,10 +96,11 @@ class WarcRecorderTest { public void flagAsSkippedNullBody() throws IOException, URISyntaxException { try (var recorder = new WarcRecorder(fileNameWarc)) { - recorder.flagAsSkipped(new EdgeUrl("https://www.marginalia.nu/"), + recorder.writeReferenceCopy(new EdgeUrl("https://www.marginalia.nu/"), "text/html", 200, - null); + null, + ContentTags.empty()); } } @@ -106,10 +108,11 @@ class WarcRecorderTest { @Test public void testSaveImport() throws URISyntaxException, IOException { try (var recorder = new WarcRecorder(fileNameWarc)) { - recorder.flagAsSkipped(new EdgeUrl("https://www.marginalia.nu/"), + recorder.writeReferenceCopy(new EdgeUrl("https://www.marginalia.nu/"), "text/html", 200, - "test"); + "test", + ContentTags.empty()); } try (var reader = new WarcReader(fileNameWarc)) {