diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriter.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriter.java
index 5a993fda..26ba8fe2 100644
--- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriter.java
+++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriter.java
@@ -124,7 +124,7 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable {
                 domain,
                 response.target(),
                 fetchOk.ipAddress(),
-                false, // FIXME
+                WarcXCookieInformationHeader.hasCookies(response),
                 fetchOk.statusCode(),
                 contentType,
                 bodyBytes)
diff --git a/code/process-models/crawling-model/src/main/java/org/netpreserve/jwarc/WarcXCookieInformationHeader.java b/code/process-models/crawling-model/src/main/java/org/netpreserve/jwarc/WarcXCookieInformationHeader.java
new file mode 100644
index 00000000..7d983580
--- /dev/null
+++ b/code/process-models/crawling-model/src/main/java/org/netpreserve/jwarc/WarcXCookieInformationHeader.java
@@ -0,0 +1,49 @@
+package org.netpreserve.jwarc;
+
+import okhttp3.HttpUrl;
+import okhttp3.OkHttpClient;
+
+/** Encapsulates out-of-band information about whether a website uses cookies,
+ * using a non-standard WARC header "X-Has-Cookies".
+ * <p>
+ * The flag is sticky: once {@code update} has observed cookies it is never
+ * cleared again for the lifetime of the instance.
+ */
+public class WarcXCookieInformationHeader {
+    private static final String HEADER_NAME = "X-Has-Cookies";
+    private static final String VALUE_TRUE = "1";
+    private static final String VALUE_FALSE = "0";
+
+    private boolean hasCookies = false;
+
+    /** Sets the flag if the client's cookie jar returns any cookies for the given URL. */
+    public void update(OkHttpClient client, HttpUrl url) {
+        if (!hasCookies) {
+            hasCookies = !client.cookieJar().loadForRequest(url).isEmpty();
+        }
+    }
+
+    /** Returns true if any call to update has observed cookies. */
+    public boolean hasCookies() {
+        return hasCookies;
+    }
+
+    /** Adds the "X-Has-Cookies" header, "1" or "0", to a response record being built. */
+    public void paint(WarcResponse.Builder builder) {
+        builder.addHeader(HEADER_NAME, headerValue());
+    }
+
+    /** Adds the "X-Has-Cookies" header, "1" or "0", to a response reference being built. */
+    public void paint(WarcXResponseReference.Builder builder) {
+        builder.addHeader(HEADER_NAME, headerValue());
+    }
+
+    /** Returns true if the record's headers contain "X-Has-Cookies" with the value "1". */
+    public static boolean hasCookies(WarcRecord record) {
+        return record.headers().contains(HEADER_NAME, VALUE_TRUE);
+    }
+
+    private String headerValue() {
+        return hasCookies ? VALUE_TRUE : VALUE_FALSE;
+    }
+}
diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcRecorder.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcRecorder.java
index 5ccfacb5..e31585ef 100644
--- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcRecorder.java
+++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcRecorder.java
@@ -43,6 +43,11 @@ public class WarcRecorder implements AutoCloseable {
     // in some way
     private final String warcRecorderVersion = "1.0";
 
+    // We need to know if the site uses cookies so this can be reported among the search results
+    // -- flip this to true if we see any cookies. This information will also be painted on any
+    // revisited pages. It's not 100% perfect and a bit order dependent, but it's good enough.
+    private final WarcXCookieInformationHeader cookieInformation = new WarcXCookieInformationHeader();
+
     /**
      * Create a new WarcRecorder that will write to the given file
      *
@@ -86,7 +91,7 @@ public class WarcRecorder implements AutoCloseable {
 
         ResponseDataBuffer responseDataBuffer = new ResponseDataBuffer();
 
-        boolean hasCookies = !client.cookieJar().loadForRequest(request.url()).isEmpty();
+        cookieInformation.update(client, request.url());
 
         try (var response = call.execute()) {
             var body = response.body();
@@ -140,10 +145,11 @@ public class WarcRecorder implements AutoCloseable {
 
             WarcResponse.Builder responseBuilder = new WarcResponse.Builder(responseUri)
                     .blockDigest(responseDigestBuilder.build())
-                    .addHeader("X-Has-Cookies", hasCookies ? "1" : "0")
                     .date(date)
                     .body(MediaType.HTTP_RESPONSE, responseDataBuffer.copyBytes());
 
+            cookieInformation.paint(responseBuilder);
+
             if (ip != null) responseBuilder.ipAddress(InetAddress.getByName(ip));
 
             responseBuilder.payloadDigest(payloadDigestBuilder.build());
@@ -215,16 +221,20 @@ public class WarcRecorder implements AutoCloseable {
             payloadDigestBuilder.update(bytes, bytes.length);
             responseDataBuffer.put(bytes, 0, bytes.length);
 
-            WarcXResponseReference reference = new WarcXResponseReference.Builder(url.asURI())
+            WarcXResponseReference.Builder builder = new WarcXResponseReference.Builder(url.asURI())
                     .blockDigest(responseDigestBuilder.build())
                     .payloadDigest(payloadDigestBuilder.build())
                     .date(Instant.now())
-                    .body(MediaType.HTTP_RESPONSE, responseDataBuffer.copyBytes())
-                    .build();
+                    .body(MediaType.HTTP_RESPONSE, responseDataBuffer.copyBytes());
+
+            cookieInformation.paint(builder);
+
+            var reference = builder.build();
 
             reference.http(); // force HTTP header to be parsed before body is consumed so that caller can use it
 
             writer.write(reference);
+
         } catch (URISyntaxException | IOException | NoSuchAlgorithmException e) {
             throw new RuntimeException(e);
         }