(warc) Use a non-standard WARC header to convey information about whether a website uses cookies

This information is then propagated to the parquet file as a boolean.

For documents that are copied from the reference, use whatever value we last saw.  This isn't 100% deterministic and may result in false negatives, but it lets websites that used cookies and have since stopped repent, and have the change reflected in the search engine more quickly.
This commit is contained in:
Viktor Lofgren 2023-12-15 16:37:53 +01:00
parent 9fea22b90d
commit fa81e5b8ee
3 changed files with 51 additions and 6 deletions

View File

@ -124,7 +124,7 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable {
domain,
response.target(),
fetchOk.ipAddress(),
false, // FIXME
WarcXCookieInformationHeader.hasCookies(response),
fetchOk.statusCode(),
contentType,
bodyBytes)

View File

@ -0,0 +1,35 @@
package org.netpreserve.jwarc;
import okhttp3.HttpUrl;
import okhttp3.OkHttpClient;
/**
 * Tracks whether a website has been observed using cookies and conveys that
 * out-of-band through the non-standard WARC header "X-Has-Cookies".
 *
 * <p>The flag starts out {@code false} and is sticky: once an
 * {@link #update(OkHttpClient, HttpUrl)} call has found a cookie, later calls
 * never reset it.
 */
public class WarcXCookieInformationHeader {
    /** Name of the non-standard WARC header carrying the cookie flag. */
    private static final String HEADER_NAME = "X-Has-Cookies";

    /** True once any cookie has been seen by {@link #update(OkHttpClient, HttpUrl)}. */
    private boolean hasCookies = false;

    /**
     * Records whether the client's cookie jar holds any cookie for the given URL.
     * The jar is only consulted while the flag is still unset.
     *
     * @param client the HTTP client whose cookie jar is inspected
     * @param url    the URL for which cookies are loaded from the jar
     */
    public void update(OkHttpClient client, HttpUrl url) {
        if (hasCookies) {
            // Already known to use cookies; nothing more to learn.
            return;
        }

        hasCookies = !client.cookieJar().loadForRequest(url).isEmpty();
    }

    /**
     * @return true if a cookie has been seen by any earlier call to
     *         {@link #update(OkHttpClient, HttpUrl)}
     */
    public boolean hasCookies() {
        return hasCookies;
    }

    /**
     * Adds the "X-Has-Cookies" header, reflecting the current flag, to a response builder.
     *
     * @param builder the WARC response builder to add the header to
     */
    public void paint(WarcResponse.Builder builder) {
        builder.addHeader(HEADER_NAME, headerValue());
    }

    /**
     * Adds the "X-Has-Cookies" header, reflecting the current flag, to a
     * response-reference builder.
     *
     * @param builder the WARC response-reference builder to add the header to
     */
    public void paint(WarcXResponseReference.Builder builder) {
        builder.addHeader(HEADER_NAME, headerValue());
    }

    /**
     * Checks a WARC record for the cookie flag by asking its headers whether
     * "X-Has-Cookies" contains the value "1".
     *
     * @param record the WARC record to inspect
     * @return the result of the header lookup
     */
    public static boolean hasCookies(WarcRecord record) {
        return record.headers().contains(HEADER_NAME, "1");
    }

    /** Header value for the current flag: "1" when cookies were seen, otherwise "0". */
    private String headerValue() {
        if (hasCookies) {
            return "1";
        }
        return "0";
    }
}

View File

@ -43,6 +43,11 @@ public class WarcRecorder implements AutoCloseable {
// in some way
private final String warcRecorderVersion = "1.0";
// We need to know if the site uses cookies so this can be reported among the search results
// -- flip this to true if we see any cookies. This information will also be painted on any
// revisited pages. It's not 100% perfect and is a bit order-dependent, but it's good enough.
private final WarcXCookieInformationHeader cookieInformation = new WarcXCookieInformationHeader();
/**
* Create a new WarcRecorder that will write to the given file
*
@ -86,7 +91,7 @@ public class WarcRecorder implements AutoCloseable {
ResponseDataBuffer responseDataBuffer = new ResponseDataBuffer();
boolean hasCookies = !client.cookieJar().loadForRequest(request.url()).isEmpty();
cookieInformation.update(client, request.url());
try (var response = call.execute()) {
var body = response.body();
@ -140,10 +145,11 @@ public class WarcRecorder implements AutoCloseable {
WarcResponse.Builder responseBuilder = new WarcResponse.Builder(responseUri)
.blockDigest(responseDigestBuilder.build())
.addHeader("X-Has-Cookies", hasCookies ? "1" : "0")
.date(date)
.body(MediaType.HTTP_RESPONSE, responseDataBuffer.copyBytes());
cookieInformation.paint(responseBuilder);
if (ip != null) responseBuilder.ipAddress(InetAddress.getByName(ip));
responseBuilder.payloadDigest(payloadDigestBuilder.build());
@ -215,16 +221,20 @@ public class WarcRecorder implements AutoCloseable {
payloadDigestBuilder.update(bytes, bytes.length);
responseDataBuffer.put(bytes, 0, bytes.length);
WarcXResponseReference reference = new WarcXResponseReference.Builder(url.asURI())
WarcXResponseReference.Builder builder = new WarcXResponseReference.Builder(url.asURI())
.blockDigest(responseDigestBuilder.build())
.payloadDigest(payloadDigestBuilder.build())
.date(Instant.now())
.body(MediaType.HTTP_RESPONSE, responseDataBuffer.copyBytes())
.build();
.body(MediaType.HTTP_RESPONSE, responseDataBuffer.copyBytes());
cookieInformation.paint(builder);
var reference = builder.build();
reference.http(); // force HTTP header to be parsed before body is consumed so that caller can use it
writer.write(reference);
} catch (URISyntaxException | IOException | NoSuchAlgorithmException e) {
throw new RuntimeException(e);
}