(crawler) Fix a bug where reference copies of crawl data were written without etag and last-modified
This commit also adds a band-aid to ParquetSerializableCrawlDataStream that fetches the etag and last-modified values from the 304 entity when reading older crawl data. The band-aid can be removed in a few months.
This commit is contained in:
parent
964419803a
commit
22c8fb3f59
@ -103,6 +103,8 @@ public class ParquetSerializableCrawlDataStream implements AutoCloseable, Serial
|
||||
));
|
||||
}
|
||||
|
||||
private CrawledDocumentParquetRecord previousRecord = null;
|
||||
|
||||
private void createDocumentRecord(CrawledDocumentParquetRecord nextRecord) {
|
||||
String bodyString = "";
|
||||
CrawlerDocumentStatus status = CrawlerDocumentStatus.OK;
|
||||
@ -130,6 +132,24 @@ public class ParquetSerializableCrawlDataStream implements AutoCloseable, Serial
|
||||
status = CrawlerDocumentStatus.ERROR;
|
||||
}
|
||||
|
||||
String etag = nextRecord.etagHeader;
|
||||
String lastModified = nextRecord.lastModifiedHeader;
|
||||
|
||||
// If we have a previous record, and it was a 304, and this one is a 200, we'll use the ETag and Last-Modified
|
||||
// from the previous record, as it's not guaranteed the reference copy will have the same headers due to a bug
|
||||
// in the crawler. The bug is fixed, but we still need to support old crawls.
|
||||
//
|
||||
// This workaround was added on 2024-01-18, so it can be removed in a few months.
|
||||
|
||||
if (previousRecord != null
|
||||
&& previousRecord.url.equals(nextRecord.url)
|
||||
&& previousRecord.httpStatus == 304
|
||||
&& nextRecord.httpStatus == 200)
|
||||
{
|
||||
etag = previousRecord.etagHeader;
|
||||
lastModified = previousRecord.lastModifiedHeader;
|
||||
}
|
||||
|
||||
nextQ.add(new CrawledDocument("",
|
||||
nextRecord.url,
|
||||
nextRecord.contentType,
|
||||
@ -144,11 +164,14 @@ public class ParquetSerializableCrawlDataStream implements AutoCloseable, Serial
|
||||
null,
|
||||
"",
|
||||
nextRecord.cookies,
|
||||
nextRecord.lastModifiedHeader,
|
||||
nextRecord.etagHeader));
|
||||
lastModified,
|
||||
etag));
|
||||
|
||||
previousRecord = nextRecord;
|
||||
}
|
||||
|
||||
public void close() throws IOException {
|
||||
previousRecord = null;
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -2,7 +2,6 @@ package nu.marginalia.crawl.retreival.fetcher.warc;
|
||||
|
||||
import nu.marginalia.crawl.retreival.DomainProber;
|
||||
import nu.marginalia.crawl.retreival.fetcher.ContentTags;
|
||||
import nu.marginalia.crawl.retreival.revisit.DocumentWithReference;
|
||||
import nu.marginalia.crawling.body.HttpFetchResult;
|
||||
import nu.marginalia.crawl.retreival.fetcher.socket.IpInterceptingNetworkInterceptor;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
@ -255,15 +254,6 @@ public class WarcRecorder implements AutoCloseable {
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Flag the given URL as skipped by the crawler, so that it will not be retried.
|
||||
* Which URLs were skipped is still important when resynchronizing on the WARC file,
|
||||
* so that the crawler can avoid re-fetching them.
|
||||
*/
|
||||
public void flagAsSkipped(EdgeUrl url, String contentType, int statusCode, String documentBody) {
|
||||
saveOldResponse(url, contentType, statusCode, documentBody, ContentTags.empty());
|
||||
}
|
||||
|
||||
/**
|
||||
* Write a reference copy of the given document data. This is used when the crawler provides
|
||||
* an E-Tag or Last-Modified header, and the server responds with a 304 Not Modified. In this
|
||||
|
@ -5,6 +5,7 @@ import nu.marginalia.crawl.retreival.CrawlDataReference;
|
||||
import nu.marginalia.crawl.retreival.CrawlDelayTimer;
|
||||
import nu.marginalia.crawl.retreival.CrawlerRetreiver;
|
||||
import nu.marginalia.crawl.retreival.DomainCrawlFrontier;
|
||||
import nu.marginalia.crawl.retreival.fetcher.ContentTags;
|
||||
import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
|
||||
import nu.marginalia.crawling.model.CrawledDocument;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
@ -84,7 +85,12 @@ public class CrawlerRevisitor {
|
||||
}
|
||||
|
||||
// Add a WARC record so we don't repeat this
|
||||
warcRecorder.flagAsSkipped(url, doc.contentType, doc.httpStatus, doc.documentBody);
|
||||
warcRecorder.writeReferenceCopy(url,
|
||||
doc.contentType,
|
||||
doc.httpStatus,
|
||||
doc.documentBody,
|
||||
new ContentTags(doc.etagMaybe, doc.lastModifiedMaybe)
|
||||
);
|
||||
}
|
||||
else {
|
||||
// GET the document with the stored document as a reference
|
||||
|
@ -73,10 +73,11 @@ class WarcRecorderTest {
|
||||
public void flagAsSkipped() throws IOException, URISyntaxException {
|
||||
|
||||
try (var recorder = new WarcRecorder(fileNameWarc)) {
|
||||
recorder.flagAsSkipped(new EdgeUrl("https://www.marginalia.nu/"),
|
||||
recorder.writeReferenceCopy(new EdgeUrl("https://www.marginalia.nu/"),
|
||||
"text/html",
|
||||
200,
|
||||
"<?doctype html><html><body>test</body></html>");
|
||||
"<?doctype html><html><body>test</body></html>",
|
||||
ContentTags.empty());
|
||||
}
|
||||
|
||||
try (var reader = new WarcReader(fileNameWarc)) {
|
||||
@ -95,10 +96,11 @@ class WarcRecorderTest {
|
||||
public void flagAsSkippedNullBody() throws IOException, URISyntaxException {
|
||||
|
||||
try (var recorder = new WarcRecorder(fileNameWarc)) {
|
||||
recorder.flagAsSkipped(new EdgeUrl("https://www.marginalia.nu/"),
|
||||
recorder.writeReferenceCopy(new EdgeUrl("https://www.marginalia.nu/"),
|
||||
"text/html",
|
||||
200,
|
||||
null);
|
||||
null,
|
||||
ContentTags.empty());
|
||||
}
|
||||
|
||||
}
|
||||
@ -106,10 +108,11 @@ class WarcRecorderTest {
|
||||
@Test
|
||||
public void testSaveImport() throws URISyntaxException, IOException {
|
||||
try (var recorder = new WarcRecorder(fileNameWarc)) {
|
||||
recorder.flagAsSkipped(new EdgeUrl("https://www.marginalia.nu/"),
|
||||
recorder.writeReferenceCopy(new EdgeUrl("https://www.marginalia.nu/"),
|
||||
"text/html",
|
||||
200,
|
||||
"<?doctype html><html><body>test</body></html>");
|
||||
"<?doctype html><html><body>test</body></html>",
|
||||
ContentTags.empty());
|
||||
}
|
||||
|
||||
try (var reader = new WarcReader(fileNameWarc)) {
|
||||
|
Loading…
Reference in New Issue
Block a user