(warc) Improve WARC standard adherence

The WARC specification says that records should have compression transparently removed. This was not done, so the WARC typically ended up being a bit of a gzip-Matryoshka (gzip nested within gzip).
This commit is contained in:
Viktor Lofgren 2024-02-09 20:07:01 +01:00
parent 8340aa2b6c
commit 929caed0b9
2 changed files with 13 additions and 7 deletions

View File

@ -19,6 +19,10 @@ class WarcDigestBuilder {
update(bytes, bytes.length); update(bytes, bytes.length);
} }
public void update(byte[] bytes) {
update(bytes, bytes.length);
}
public void update(byte[] buffer, int n) { public void update(byte[] buffer, int n) {
update(buffer, 0, n); update(buffer, 0, n);
} }

View File

@ -17,6 +17,7 @@ import java.io.InputStream;
import java.net.InetAddress; import java.net.InetAddress;
import java.net.URI; import java.net.URI;
import java.net.URISyntaxException; import java.net.URISyntaxException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files; import java.nio.file.Files;
import java.nio.file.Path; import java.nio.file.Path;
import java.security.NoSuchAlgorithmException; import java.security.NoSuchAlgorithmException;
@ -91,15 +92,15 @@ public class WarcRecorder implements AutoCloseable {
try (var response = call.execute(); try (var response = call.execute();
WarcInputBuffer inputBuffer = WarcInputBuffer.forResponse(response)) WarcInputBuffer inputBuffer = WarcInputBuffer.forResponse(response))
{ {
String responseHeaders = WarcProtocolReconstructor.getResponseHeader(response, inputBuffer.size()); byte[] responseHeaders = WarcProtocolReconstructor.getResponseHeader(response, inputBuffer.size()).getBytes(StandardCharsets.UTF_8);
ResponseDataBuffer responseDataBuffer = new ResponseDataBuffer(inputBuffer.size() + responseHeaders.length()); ResponseDataBuffer responseDataBuffer = new ResponseDataBuffer(inputBuffer.size() + responseHeaders.length);
InputStream inputStream = inputBuffer.read(); InputStream inputStream = inputBuffer.read();
ip = IpInterceptingNetworkInterceptor.getIpFromResponse(response); ip = IpInterceptingNetworkInterceptor.getIpFromResponse(response);
responseDataBuffer.put(responseHeaders); responseDataBuffer.put(responseHeaders);
responseDataBuffer.updateDigest(responseDigestBuilder, 0, responseDataBuffer.length()); responseDataBuffer.updateDigest(responseDigestBuilder, 0, responseHeaders.length);
int dataStart = responseDataBuffer.pos(); int dataStart = responseDataBuffer.pos();
@ -201,8 +202,10 @@ public class WarcRecorder implements AutoCloseable {
fakeHeadersBuilder.add(STR."Last-Modified: \{contentTags.lastMod()}"); fakeHeadersBuilder.add(STR."Last-Modified: \{contentTags.lastMod()}");
} }
String header = WarcProtocolReconstructor.getResponseHeader(fakeHeadersBuilder.toString(), statusCode); byte[] header = WarcProtocolReconstructor
ResponseDataBuffer responseDataBuffer = new ResponseDataBuffer(bytes.length + header.length()); .getResponseHeader(fakeHeadersBuilder.toString(), statusCode)
.getBytes(StandardCharsets.UTF_8);
ResponseDataBuffer responseDataBuffer = new ResponseDataBuffer(bytes.length + header.length);
responseDataBuffer.put(header); responseDataBuffer.put(header);
responseDigestBuilder.update(header); responseDigestBuilder.update(header);
@ -335,8 +338,7 @@ public class WarcRecorder implements AutoCloseable {
return length; return length;
} }
public void put(String s) { public void put(byte[] bytes) {
byte[] bytes = s.getBytes();
put(bytes, 0, bytes.length); put(bytes, 0, bytes.length);
} }