(crawler) Fix a bug where reference copies of crawl data were written without ETag and Last-Modified headers

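This commit also adds a band-aid to ParquetSerializableCrawlDataStream that recovers the ETag and Last-Modified values from the preceding 304 record for the same URL, so that crawl data written before the fix can still be read correctly. The band-aid can be removed in a few months.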
This commit is contained in:
Viktor Lofgren 2024-01-18 16:02:27 +01:00
parent 964419803a
commit 22c8fb3f59
4 changed files with 41 additions and 19 deletions

View File

@ -103,6 +103,8 @@ public class ParquetSerializableCrawlDataStream implements AutoCloseable, Serial
));
}
private CrawledDocumentParquetRecord previousRecord = null;
private void createDocumentRecord(CrawledDocumentParquetRecord nextRecord) {
String bodyString = "";
CrawlerDocumentStatus status = CrawlerDocumentStatus.OK;
@ -130,6 +132,24 @@ public class ParquetSerializableCrawlDataStream implements AutoCloseable, Serial
status = CrawlerDocumentStatus.ERROR;
}
String etag = nextRecord.etagHeader;
String lastModified = nextRecord.lastModifiedHeader;
// If we have a previous record, and it was a 304, and this one is a 200, we'll use the ETag and Last-Modified
// from the previous record, as it's not guaranteed the reference copy will have the same headers due to a bug
// in the crawler. The bug is fixed, but we still need to support old crawls.
//
// This was added on 2024-01-18, so we can remove it in a few months.
if (previousRecord != null
&& previousRecord.url.equals(nextRecord.url)
&& previousRecord.httpStatus == 304
&& nextRecord.httpStatus == 200)
{
etag = previousRecord.etagHeader;
lastModified = previousRecord.lastModifiedHeader;
}
nextQ.add(new CrawledDocument("",
nextRecord.url,
nextRecord.contentType,
@ -144,11 +164,14 @@ public class ParquetSerializableCrawlDataStream implements AutoCloseable, Serial
null,
"",
nextRecord.cookies,
nextRecord.lastModifiedHeader,
nextRecord.etagHeader));
lastModified,
etag));
previousRecord = nextRecord;
}
/**
 * Closes this stream by clearing the remembered {@code previousRecord}.
 *
 * @throws IOException declared in the signature; the statement visible here does not throw it
 */
public void close() throws IOException {
// Drop the reference to the last record seen by createDocumentRecord
previousRecord = null;
}
@Override

View File

@ -2,7 +2,6 @@ package nu.marginalia.crawl.retreival.fetcher.warc;
import nu.marginalia.crawl.retreival.DomainProber;
import nu.marginalia.crawl.retreival.fetcher.ContentTags;
import nu.marginalia.crawl.retreival.revisit.DocumentWithReference;
import nu.marginalia.crawling.body.HttpFetchResult;
import nu.marginalia.crawl.retreival.fetcher.socket.IpInterceptingNetworkInterceptor;
import nu.marginalia.model.EdgeDomain;
@ -255,15 +254,6 @@ public class WarcRecorder implements AutoCloseable {
}
}
/**
* Flag the given URL as skipped by the crawler, so that it will not be retried.
* Which URLs were skipped is still important when resynchronizing on the WARC file,
* so that the crawler can avoid re-fetching them.
* <p>
* This delegates to {@code saveOldResponse} and always passes {@code ContentTags.empty()},
* so no content tags are forwarded along with the saved response.
*
* @param url          the URL being flagged as skipped
* @param contentType  content type forwarded to {@code saveOldResponse}
* @param statusCode   HTTP status code forwarded to {@code saveOldResponse}
* @param documentBody document body forwarded to {@code saveOldResponse}
*/
public void flagAsSkipped(EdgeUrl url, String contentType, int statusCode, String documentBody) {
saveOldResponse(url, contentType, statusCode, documentBody, ContentTags.empty());
}
/**
* Write a reference copy of the given document data. This is used when the crawler provides
* an E-Tag or Last-Modified header, and the server responds with a 304 Not Modified. In this

View File

@ -5,6 +5,7 @@ import nu.marginalia.crawl.retreival.CrawlDataReference;
import nu.marginalia.crawl.retreival.CrawlDelayTimer;
import nu.marginalia.crawl.retreival.CrawlerRetreiver;
import nu.marginalia.crawl.retreival.DomainCrawlFrontier;
import nu.marginalia.crawl.retreival.fetcher.ContentTags;
import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
import nu.marginalia.crawling.model.CrawledDocument;
import nu.marginalia.model.EdgeUrl;
@ -84,7 +85,12 @@ public class CrawlerRevisitor {
}
// Add a WARC record so we don't repeat this
warcRecorder.flagAsSkipped(url, doc.contentType, doc.httpStatus, doc.documentBody);
warcRecorder.writeReferenceCopy(url,
doc.contentType,
doc.httpStatus,
doc.documentBody,
new ContentTags(doc.etagMaybe, doc.lastModifiedMaybe)
);
}
else {
// GET the document with the stored document as a reference

View File

@ -73,10 +73,11 @@ class WarcRecorderTest {
public void flagAsSkipped() throws IOException, URISyntaxException {
try (var recorder = new WarcRecorder(fileNameWarc)) {
recorder.flagAsSkipped(new EdgeUrl("https://www.marginalia.nu/"),
recorder.writeReferenceCopy(new EdgeUrl("https://www.marginalia.nu/"),
"text/html",
200,
"<?doctype html><html><body>test</body></html>");
"<?doctype html><html><body>test</body></html>",
ContentTags.empty());
}
try (var reader = new WarcReader(fileNameWarc)) {
@ -95,10 +96,11 @@ class WarcRecorderTest {
public void flagAsSkippedNullBody() throws IOException, URISyntaxException {
try (var recorder = new WarcRecorder(fileNameWarc)) {
recorder.flagAsSkipped(new EdgeUrl("https://www.marginalia.nu/"),
recorder.writeReferenceCopy(new EdgeUrl("https://www.marginalia.nu/"),
"text/html",
200,
null);
null,
ContentTags.empty());
}
}
@ -106,10 +108,11 @@ class WarcRecorderTest {
@Test
public void testSaveImport() throws URISyntaxException, IOException {
try (var recorder = new WarcRecorder(fileNameWarc)) {
recorder.flagAsSkipped(new EdgeUrl("https://www.marginalia.nu/"),
recorder.writeReferenceCopy(new EdgeUrl("https://www.marginalia.nu/"),
"text/html",
200,
"<?doctype html><html><body>test</body></html>");
"<?doctype html><html><body>test</body></html>",
ContentTags.empty());
}
try (var reader = new WarcReader(fileNameWarc)) {