From 929caed0b972f3a073a744d7532aa2055b42e67e Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Fri, 9 Feb 2024 20:07:01 +0100 Subject: [PATCH] (warc) Improve WARC standard adherence The WARC specification says the records should transparently remove compression. This was not done, leading to the WARC typically being a bit of a gzip-Matryoshka. --- .../fetcher/warc/WarcDigestBuilder.java | 4 ++++ .../retreival/fetcher/warc/WarcRecorder.java | 16 +++++++++------- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcDigestBuilder.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcDigestBuilder.java index 6fd020b4..69381c2a 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcDigestBuilder.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcDigestBuilder.java @@ -19,6 +19,10 @@ class WarcDigestBuilder { update(bytes, bytes.length); } + public void update(byte[] bytes) { + update(bytes, bytes.length); + } + public void update(byte[] buffer, int n) { update(buffer, 0, n); } diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcRecorder.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcRecorder.java index 6326104a..b9d44310 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcRecorder.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcRecorder.java @@ -17,6 +17,7 @@ import java.io.InputStream; import java.net.InetAddress; import java.net.URI; import java.net.URISyntaxException; +import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.nio.file.Path; import java.security.NoSuchAlgorithmException; @@ -91,15 +92,15 @@ public class WarcRecorder implements AutoCloseable { try (var response = call.execute(); WarcInputBuffer inputBuffer = WarcInputBuffer.forResponse(response)) { - String responseHeaders = WarcProtocolReconstructor.getResponseHeader(response, inputBuffer.size()); + byte[] responseHeaders = WarcProtocolReconstructor.getResponseHeader(response, inputBuffer.size()).getBytes(StandardCharsets.UTF_8); - ResponseDataBuffer responseDataBuffer = new ResponseDataBuffer(inputBuffer.size() + responseHeaders.length()); + ResponseDataBuffer responseDataBuffer = new ResponseDataBuffer(inputBuffer.size() + responseHeaders.length); InputStream inputStream = inputBuffer.read(); ip = IpInterceptingNetworkInterceptor.getIpFromResponse(response); responseDataBuffer.put(responseHeaders); - responseDataBuffer.updateDigest(responseDigestBuilder, 0, responseDataBuffer.length()); + responseDataBuffer.updateDigest(responseDigestBuilder, 0, responseHeaders.length); int dataStart = responseDataBuffer.pos(); @@ -201,8 +202,10 @@ public class WarcRecorder implements AutoCloseable { fakeHeadersBuilder.add(STR."Last-Modified: \{contentTags.lastMod()}"); } - String header = WarcProtocolReconstructor.getResponseHeader(fakeHeadersBuilder.toString(), statusCode); - ResponseDataBuffer responseDataBuffer = new ResponseDataBuffer(bytes.length + header.length()); + byte[] header = WarcProtocolReconstructor + .getResponseHeader(fakeHeadersBuilder.toString(), statusCode) + .getBytes(StandardCharsets.UTF_8); + ResponseDataBuffer responseDataBuffer = new ResponseDataBuffer(bytes.length + header.length); responseDataBuffer.put(header); responseDigestBuilder.update(header); @@ -335,8 +338,7 @@ public class WarcRecorder implements AutoCloseable { return length; } - public void put(String s) { - byte[] bytes = s.getBytes(); + public void put(byte[] bytes) { put(bytes, 0, bytes.length); }