(crawler) Write etags and last-modified on reference copy
This commit also fixes a test that broke with a previous change.
This commit is contained in:
parent
67ef2b45fa
commit
f18f82e229
@ -283,7 +283,7 @@ public class CrawlerRetreiver implements AutoCloseable {
|
||||
else if (fetchedDoc instanceof HttpFetchResult.Result304Raw && reference.doc() != null) {
|
||||
var doc = reference.doc();
|
||||
|
||||
warcRecorder.writeReferenceCopy(top, doc.contentType, doc.httpStatus, doc.documentBody);
|
||||
warcRecorder.writeReferenceCopy(top, doc.contentType, doc.httpStatus, doc.documentBody, contentTags);
|
||||
|
||||
fetchedDoc = new HttpFetchResult.Result304ReplacedWithReference(doc.url,
|
||||
new ContentType(doc.contentType, "UTF-8"),
|
||||
|
@ -1,6 +1,8 @@
|
||||
package nu.marginalia.crawl.retreival.fetcher.warc;
|
||||
|
||||
import nu.marginalia.crawl.retreival.DomainProber;
|
||||
import nu.marginalia.crawl.retreival.fetcher.ContentTags;
|
||||
import nu.marginalia.crawl.retreival.revisit.DocumentWithReference;
|
||||
import nu.marginalia.crawling.body.HttpFetchResult;
|
||||
import nu.marginalia.crawl.retreival.fetcher.socket.IpInterceptingNetworkInterceptor;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
@ -198,7 +200,7 @@ public class WarcRecorder implements AutoCloseable {
|
||||
writer.write(item);
|
||||
}
|
||||
|
||||
private void saveOldResponse(EdgeUrl url, String contentType, int statusCode, String documentBody) {
|
||||
private void saveOldResponse(EdgeUrl url, String contentType, int statusCode, String documentBody, ContentTags contentTags) {
|
||||
try {
|
||||
WarcDigestBuilder responseDigestBuilder = new WarcDigestBuilder();
|
||||
WarcDigestBuilder payloadDigestBuilder = new WarcDigestBuilder();
|
||||
@ -212,13 +214,19 @@ public class WarcRecorder implements AutoCloseable {
|
||||
bytes = documentBody.getBytes();
|
||||
}
|
||||
|
||||
String fakeHeaders = STR."""
|
||||
Content-Type: \{contentType}
|
||||
Content-Length: \{bytes.length}
|
||||
Content-Encoding: UTF-8
|
||||
""";
|
||||
StringJoiner fakeHeadersBuilder = new StringJoiner("\n");
|
||||
|
||||
String header = WarcProtocolReconstructor.getResponseHeader(fakeHeaders, statusCode);
|
||||
fakeHeadersBuilder.add(STR."Content-Type: \{contentType}");
|
||||
fakeHeadersBuilder.add(STR."Content-Length: \{bytes.length}");
|
||||
fakeHeadersBuilder.add(STR."Content-Encoding: UTF-8");
|
||||
if (contentTags.etag() != null) {
|
||||
fakeHeadersBuilder.add(STR."ETag: \{contentTags.etag()}");
|
||||
}
|
||||
if (contentTags.lastMod() != null) {
|
||||
fakeHeadersBuilder.add(STR."Last-Modified: \{contentTags.lastMod()}");
|
||||
}
|
||||
|
||||
String header = WarcProtocolReconstructor.getResponseHeader(fakeHeadersBuilder.toString(), statusCode);
|
||||
ResponseDataBuffer responseDataBuffer = new ResponseDataBuffer();
|
||||
responseDataBuffer.put(header);
|
||||
|
||||
@ -253,7 +261,7 @@ public class WarcRecorder implements AutoCloseable {
|
||||
* so that the crawler can avoid re-fetching them.
|
||||
*/
|
||||
public void flagAsSkipped(EdgeUrl url, String contentType, int statusCode, String documentBody) {
|
||||
saveOldResponse(url, contentType, statusCode, documentBody);
|
||||
saveOldResponse(url, contentType, statusCode, documentBody, ContentTags.empty());
|
||||
}
|
||||
|
||||
/**
|
||||
@ -261,8 +269,8 @@ public class WarcRecorder implements AutoCloseable {
|
||||
* an E-Tag or Last-Modified header, and the server responds with a 304 Not Modified. In this
|
||||
* scenario we want to record the data as it was in the previous crawl, but not re-fetch it.
|
||||
*/
|
||||
public void writeReferenceCopy(EdgeUrl url, String contentType, int statusCode, String documentBody) {
|
||||
saveOldResponse(url, contentType, statusCode, documentBody);
|
||||
public void writeReferenceCopy(EdgeUrl url, String contentType, int statusCode, String documentBody, ContentTags ctags) {
|
||||
saveOldResponse(url, contentType, statusCode, documentBody, ctags);
|
||||
}
|
||||
|
||||
public void writeWarcinfoHeader(String ip, EdgeDomain domain, DomainProber.ProbeResult result) throws IOException {
|
||||
|
@ -48,6 +48,8 @@ public class DocumentWithReferenceTest {
|
||||
CrawledDocument doc = CrawledDocument.builder()
|
||||
.etagMaybe("12345")
|
||||
.lastModifiedMaybe("67890")
|
||||
.documentBody("Test")
|
||||
.httpStatus(200)
|
||||
.build(); // assume lastModified and eTag are not null
|
||||
CrawlDataReference reference = new CrawlDataReference();
|
||||
|
||||
@ -70,6 +72,8 @@ public class DocumentWithReferenceTest {
|
||||
Etag: 12345
|
||||
Last-Modified: 67890
|
||||
""")
|
||||
.documentBody("Test")
|
||||
.httpStatus(200)
|
||||
.build(); // assume lastModified and eTag are not null
|
||||
CrawlDataReference reference = new CrawlDataReference();
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user