(warc) Improve WARC standard adherence
The WARC specification says that compression should be transparently removed from the records. This was not done, leading to the WARC typically being a bit of a gzip Matryoshka, with compressed data nested inside compressed data.
This commit is contained in:
parent
8340aa2b6c
commit
929caed0b9
@ -19,6 +19,10 @@ class WarcDigestBuilder {
|
|||||||
update(bytes, bytes.length);
|
update(bytes, bytes.length);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void update(byte[] bytes) {
|
||||||
|
update(bytes, bytes.length);
|
||||||
|
}
|
||||||
|
|
||||||
public void update(byte[] buffer, int n) {
|
public void update(byte[] buffer, int n) {
|
||||||
update(buffer, 0, n);
|
update(buffer, 0, n);
|
||||||
}
|
}
|
||||||
|
@ -17,6 +17,7 @@ import java.io.InputStream;
|
|||||||
import java.net.InetAddress;
|
import java.net.InetAddress;
|
||||||
import java.net.URI;
|
import java.net.URI;
|
||||||
import java.net.URISyntaxException;
|
import java.net.URISyntaxException;
|
||||||
|
import java.nio.charset.StandardCharsets;
|
||||||
import java.nio.file.Files;
|
import java.nio.file.Files;
|
||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
import java.security.NoSuchAlgorithmException;
|
import java.security.NoSuchAlgorithmException;
|
||||||
@ -91,15 +92,15 @@ public class WarcRecorder implements AutoCloseable {
|
|||||||
try (var response = call.execute();
|
try (var response = call.execute();
|
||||||
WarcInputBuffer inputBuffer = WarcInputBuffer.forResponse(response))
|
WarcInputBuffer inputBuffer = WarcInputBuffer.forResponse(response))
|
||||||
{
|
{
|
||||||
String responseHeaders = WarcProtocolReconstructor.getResponseHeader(response, inputBuffer.size());
|
byte[] responseHeaders = WarcProtocolReconstructor.getResponseHeader(response, inputBuffer.size()).getBytes(StandardCharsets.UTF_8);
|
||||||
|
|
||||||
ResponseDataBuffer responseDataBuffer = new ResponseDataBuffer(inputBuffer.size() + responseHeaders.length());
|
ResponseDataBuffer responseDataBuffer = new ResponseDataBuffer(inputBuffer.size() + responseHeaders.length);
|
||||||
InputStream inputStream = inputBuffer.read();
|
InputStream inputStream = inputBuffer.read();
|
||||||
|
|
||||||
ip = IpInterceptingNetworkInterceptor.getIpFromResponse(response);
|
ip = IpInterceptingNetworkInterceptor.getIpFromResponse(response);
|
||||||
|
|
||||||
responseDataBuffer.put(responseHeaders);
|
responseDataBuffer.put(responseHeaders);
|
||||||
responseDataBuffer.updateDigest(responseDigestBuilder, 0, responseDataBuffer.length());
|
responseDataBuffer.updateDigest(responseDigestBuilder, 0, responseHeaders.length);
|
||||||
|
|
||||||
int dataStart = responseDataBuffer.pos();
|
int dataStart = responseDataBuffer.pos();
|
||||||
|
|
||||||
@ -201,8 +202,10 @@ public class WarcRecorder implements AutoCloseable {
|
|||||||
fakeHeadersBuilder.add(STR."Last-Modified: \{contentTags.lastMod()}");
|
fakeHeadersBuilder.add(STR."Last-Modified: \{contentTags.lastMod()}");
|
||||||
}
|
}
|
||||||
|
|
||||||
String header = WarcProtocolReconstructor.getResponseHeader(fakeHeadersBuilder.toString(), statusCode);
|
byte[] header = WarcProtocolReconstructor
|
||||||
ResponseDataBuffer responseDataBuffer = new ResponseDataBuffer(bytes.length + header.length());
|
.getResponseHeader(fakeHeadersBuilder.toString(), statusCode)
|
||||||
|
.getBytes(StandardCharsets.UTF_8);
|
||||||
|
ResponseDataBuffer responseDataBuffer = new ResponseDataBuffer(bytes.length + header.length);
|
||||||
responseDataBuffer.put(header);
|
responseDataBuffer.put(header);
|
||||||
|
|
||||||
responseDigestBuilder.update(header);
|
responseDigestBuilder.update(header);
|
||||||
@ -335,8 +338,7 @@ public class WarcRecorder implements AutoCloseable {
|
|||||||
return length;
|
return length;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void put(String s) {
|
public void put(byte[] bytes) {
|
||||||
byte[] bytes = s.getBytes();
|
|
||||||
put(bytes, 0, bytes.length);
|
put(bytes, 0, bytes.length);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user