(warc) Add fields for ETag and Last-Modified headers to the new crawl data formats
Temporarily modify the CrawledDocument model so it supports both the "big string" style headers field used by the old formats and the explicit ETag/Last-Modified fields used by the new formats. This is a bit awkward to deal with, but necessary until we migrate off the old formats entirely. The commit also adds a few tests for this logic.
parent 126ac3816f
commit 3a56a06c4f
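To make the dual representation described above concrete: the accessors added in this commit prefer the new explicit fields and only fall back to scanning the legacy headers blob. Below is a self-contained toy sketch of that resolution order; the field and method names mirror the diff, but the class itself is illustrative and not project code.

// Toy stand-in for the CrawledDocument compatibility shim: the explicit field
// from the new formats wins; otherwise the legacy "big string" headers blob is scanned.
public class HeaderShimSketch {
    String etagMaybe;   // populated when the document comes from the new parquet format
    String headers;     // populated when the document comes from the old json format

    String getEtag() {
        if (etagMaybe != null) {
            return etagMaybe;
        }
        return getHeader("ETag");
    }

    String getHeader(String name) {
        if (headers == null) {
            return null;
        }
        String prefix = name + ":";
        for (String line : headers.split("\n")) {
            // case-insensitive "ETag:" / "Last-Modified:" prefix match, value is the trimmed remainder
            if (line.regionMatches(true, 0, prefix, 0, prefix.length())) {
                return line.substring(prefix.length()).trim();
            }
        }
        return null;
    }

    public static void main(String[] args) {
        HeaderShimSketch legacy = new HeaderShimSketch();
        legacy.headers = "Content-Type: text/html\nETag: \"abc123\"";
        System.out.println(legacy.getEtag());   // "abc123", recovered from the blob

        HeaderShimSketch modern = new HeaderShimSketch();
        modern.etagMaybe = "\"def456\"";
        System.out.println(modern.getEtag());   // "def456", explicit field takes precedence
    }
}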
@@ -118,7 +118,9 @@ public class ParquetSerializableCrawlDataStream implements AutoCloseable, Serial
                 nextRecord.url,
                 null,
                 "",
-                nextRecord.cookies));
+                nextRecord.cookies,
+                nextRecord.lastModifiedHeader,
+                nextRecord.etagHeader));
     }

     public void close() throws IOException {
@@ -82,6 +82,8 @@ public class WarcSerializableCrawlDataStream implements AutoCloseable, Serializa
             return;
         }

+        var httpHeaders = http.headers();
+
         var parsedBody = DocumentBodyExtractor.asString(HttpFetchResult.importWarc(response));
         if (parsedBody instanceof DocumentBodyResult.Error<String> error) {
             next = new CrawledDocument(
@@ -98,7 +100,9 @@ public class WarcSerializableCrawlDataStream implements AutoCloseable, Serializa
                     "",
                     "",
                     "",
-                    WarcXCookieInformationHeader.hasCookies(response)
+                    WarcXCookieInformationHeader.hasCookies(response),
+                    null,
+                    null
             );
         } else if (parsedBody instanceof DocumentBodyResult.Ok<String> ok) {
             next = new CrawledDocument(
@@ -115,7 +119,9 @@ public class WarcSerializableCrawlDataStream implements AutoCloseable, Serializa
                     "",
                     "",
                     "",
-                    WarcXCookieInformationHeader.hasCookies(response));
+                    WarcXCookieInformationHeader.hasCookies(response),
+                    httpHeaders.first("Last-Modified").orElse(""),
+                    httpHeaders.first("ETag").orElse(""));
         } else {
             // unreachable
             throw new IllegalStateException("Unknown body type: " + parsedBody);
@@ -5,6 +5,8 @@ import lombok.Builder;
 import lombok.ToString;
 import nu.marginalia.bigstring.BigString;
 import nu.marginalia.model.EdgeUrl;
+import org.apache.commons.lang3.StringUtils;
+import org.jetbrains.annotations.Nullable;

 @Builder
 @AllArgsConstructor
@@ -21,7 +23,10 @@ public class CrawledDocument implements SerializableCrawlData {
     public String crawlerStatus;
     public String crawlerStatusDesc;
+
+    @Nullable
+    @Deprecated // use getETag() or getLastModified() instead
     public String headers;

     public String documentBody;

     @Deprecated
@@ -38,6 +43,51 @@ public class CrawledDocument implements SerializableCrawlData {
      * information may come in CrawledDomain instead */
     public Boolean hasCookies = false;

+    public String lastModifiedMaybe;
+    public String etagMaybe;
+
+    @Nullable
+    private String getHeader(String header) {
+        if (headers == null) {
+            return null;
+        }
+
+        String headerString = header + ":";
+
+        String[] headersLines = StringUtils.split(headers, '\n');
+        for (String headerLine : headersLines) {
+            if (StringUtils.startsWithIgnoreCase(headerLine, headerString)) {
+                return headerLine.substring(headerString.length()).trim();
+            }
+        }
+
+        return null;
+    }
+
+    /** Returns the ETag header, or null if not present;
+     * <p>
+     * this is a compatibility shim between the old json format, which saves headers in a long string
+     * and the new parquet format which saves only the ETag and Last-Modified headers in separate columns
+     * */
+    public String getEtag() {
+        if (etagMaybe != null) {
+            return etagMaybe;
+        }
+        return getHeader("ETag");
+    }
+
+    /** Returns the Last-Modified header, or null if not present
+     * <p>
+     * this is a compatibility shim between the old json format, which saves headers in a long string
+     * and the new parquet format which saves only the ETag and Last-Modified headers in separate columns
+     * */
+    public String getLastModified() {
+        if (lastModifiedMaybe != null) {
+            return lastModifiedMaybe;
+        }
+        return getHeader("Last-Modified");
+    }
+
     public static final String SERIAL_IDENTIFIER = "// DOCUMENT";
     @Override
     public String getSerialIdentifier() {
@@ -29,6 +29,9 @@ public class CrawledDocumentParquetRecord {
     public String contentType;
     public byte[] body;

+    public String etagHeader;
+    public String lastModifiedHeader;
+
     public static Hydrator<CrawledDocumentParquetRecord, CrawledDocumentParquetRecord> newHydrator() {
         return new CrawledDocumentParquetRecordHydrator();
     }
@@ -46,7 +49,9 @@ public class CrawledDocumentParquetRecord {
             Types.required(INT32).named("httpStatus"),
             Types.required(INT64).named("epochSeconds"),
             Types.required(BINARY).as(stringType()).named("contentType"),
-            Types.required(BINARY).named("body")
+            Types.required(BINARY).named("body"),
+            Types.optional(BINARY).as(stringType()).named("etagHeader"),
+            Types.optional(BINARY).as(stringType()).named("lastModifiedHeader")
     );

@@ -60,6 +65,9 @@ public class CrawledDocumentParquetRecord {
             case "contentType" -> contentType = (String) value;
             case "body" -> body = (byte[]) value;
             case "epochSeconds" -> timestamp = Instant.ofEpochSecond((Long) value);
+            case "etagHeader" -> etagHeader = (String) value;
+            case "lastModifiedHeader" -> lastModifiedHeader = (String) value;
+
             default -> throw new UnsupportedOperationException("Unknown heading '" + heading + '"');
         }
         return this;
@@ -74,6 +82,12 @@ public class CrawledDocumentParquetRecord {
         valueWriter.write("cookies", cookies);
         valueWriter.write("contentType", contentType);
         valueWriter.write("body", body);
+        if (etagHeader != null) {
+            valueWriter.write("etagHeader", etagHeader);
+        }
+        if (lastModifiedHeader != null) {
+            valueWriter.write("lastModifiedHeader", lastModifiedHeader);
+        }
     }
 }
@@ -131,11 +131,15 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable {
             return;
         }

+
+
         byte[] bodyBytes;
         String contentType;

         var body = DocumentBodyExtractor.asBytes(result);

+        var headers = fetchOk.headers();
+
         if (body instanceof DocumentBodyResult.Ok<byte[]> bodyOk) {
             bodyBytes = bodyOk.body();
             contentType = bodyOk.contentType().toString();
@@ -153,7 +157,9 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable {
                 fetchOk.statusCode(),
                 response.date(),
                 contentType,
-                bodyBytes)
+                bodyBytes,
+                headers.get("ETag"),
+                headers.get("Last-Modified"))
         );
     }

@@ -170,7 +176,9 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable {
                 0,
                 date,
                 "x-marginalia/advisory;state=redirect",
-                new byte[0]
+                new byte[0],
+                null,
+                null
         );
     }
     private CrawledDocumentParquetRecord forDomainError(String domain, Instant date, String ip, String errorStatus) {
@@ -181,7 +189,9 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable {
                 0,
                 date,
                 "x-marginalia/advisory;state=error",
-                errorStatus.getBytes()
+                errorStatus.getBytes(),
+                null,
+                null
         );
     }
@@ -193,7 +203,9 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable {
                 0,
                 date,
                 errorStatus,
-                new byte[0]
+                new byte[0],
+                null,
+                null
         );
     }
@@ -0,0 +1,101 @@
+package nu.marginalia.crawling.model;
+
+import org.junit.jupiter.api.Test;
+
+import static org.junit.jupiter.api.Assertions.*;
+
+class CrawledDocumentTest {
+
+    /** These tests are AI-generated hence have kinda inconsistent naming */
+
+    @Test
+    void getEtagShouldReturnEtagIfPresent() {
+        CrawledDocument crawledDocument = CrawledDocument.builder()
+                .etagMaybe("12345")
+                .build();
+
+        // Etag is present, method should return it.
+        String etag = crawledDocument.getEtag();
+        assertEquals("12345", etag);
+    }
+
+    @Test
+    void getEtagShouldReturnNullIfEtagIsAbsentAndHeadersAreNull() {
+        CrawledDocument crawledDocument = CrawledDocument.builder()
+                .etagMaybe(null)
+                .headers(null)
+                .build();
+
+        // Etag and headers are absent, method should return null.
+        String etag = crawledDocument.getEtag();
+        assertNull(etag);
+    }
+
+    @Test
+    void getEtagShouldReturnNullIfEtagIsAbsentAndHeadersDoNotContainEtag() {
+        CrawledDocument crawledDocument = CrawledDocument.builder()
+                .etagMaybe(null)
+                .headers("Some irrelevant headers")
+                .build();
+
+        // Headers do not contain an ETag, method should return null.
+        String etag = crawledDocument.getEtag();
+        assertNull(etag);
+    }
+
+    @Test
+    void getEtagShouldReturnEtagFromHeadersIfPresent() {
+        CrawledDocument crawledDocument = CrawledDocument.builder()
+                .etagMaybe(null)
+                .headers("ETag: 67890")
+                .build();
+
+        // Headers contain an ETag, method should return it.
+        String etag = crawledDocument.getEtag();
+        assertEquals("67890", etag);
+    }
+
+    @Test
+    public void testGetLastModified_withLastModifiedDateInHeaders() {
+        // Arrange
+        String lastModifiedDate = "Wed, 21 Oct 2015 07:28:00 GMT";
+        CrawledDocument crawledDocument = CrawledDocument.builder()
+                .headers("Last-Modified: " + lastModifiedDate)
+                .build();
+
+        // Act
+        String actualLastModifiedDate = crawledDocument.getLastModified();
+
+        // Assert
+        assertEquals(lastModifiedDate, actualLastModifiedDate);
+    }
+
+    @Test
+    public void testGetLastModified_withoutLastModifiedDateInHeaders() {
+        // Arrange
+        CrawledDocument crawledDocument = CrawledDocument.builder()
+                .headers("Some-Other-Header: Some value")
+                .build();
+
+        // Act
+        String actualLastModifiedDate = crawledDocument.getLastModified();
+
+        // Assert
+        assertNull(actualLastModifiedDate);
+    }
+
+    @Test
+    public void testGetLastModified_withLastModifiedDateInField() {
+        // Arrange
+        String lastModifiedDate = "Wed, 21 Oct 2015 07:28:00 GMT";
+        CrawledDocument crawledDocument = CrawledDocument.builder()
+                .lastModifiedMaybe(lastModifiedDate)
+                .build();
+
+        // Act
+        String actualLastModifiedDate = crawledDocument.getLastModified();
+
+        // Assert
+        assertEquals(lastModifiedDate, actualLastModifiedDate);
+    }
+}
@@ -38,7 +38,8 @@ class CrawledDocumentParquetRecordFileWriterTest {
                 200,
                 Instant.now(),
                 "text/html",
-                "hello world".getBytes());
+                "hello world".getBytes(),
+                null, null);

         try (var writer = new CrawledDocumentParquetRecordFileWriter(tempFile)) {
             writer.write(original);
@@ -51,7 +51,9 @@ public class SideloaderProcessing {
                 url,
                 "",
                 "SIDELOAD",
-                false
+                false,
+                null,
+                null
         );

         var ret = new ProcessedDocument();
@@ -116,7 +116,9 @@ public class ConvertingIntegrationTest {
                     "https://memex.marginalia.nu/" + file,
                     null,
                     "",
-                    false
+                    false,
+                    null,
+                    null
             );
             docs.add(doc);
         }
@@ -49,22 +49,11 @@ public record DocumentWithReference(
         if (null == doc)
             return ContentTags.empty();

-        String headers = doc.headers;
-        if (headers == null)
+        String lastmod = doc.getLastModified();
+        String etag = doc.getEtag();
+
+        if (lastmod == null && etag == null) {
             return ContentTags.empty();
+        }

-        String[] headersLines = headers.split("\n");
-
-        String lastmod = null;
-        String etag = null;
-
-        for (String line : headersLines) {
-            if (line.toLowerCase().startsWith("etag:")) {
-                etag = line.substring(5).trim();
-            }
-            if (line.toLowerCase().startsWith("last-modified:")) {
-                lastmod = line.substring(14).trim();
-            }
-        }
-
         return new ContentTags(etag, lastmod);
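The ContentTags assembled here feed the crawler's revisit logic: in standard HTTP terms, an ETag is replayed as an If-None-Match header and a Last-Modified value as If-Modified-Since, letting an unchanged page come back as 304 Not Modified instead of a full body. The standalone sketch below illustrates that mapping with plain java.net.http; the URL and tag values are placeholders and Marginalia's own fetcher is not used.

import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;

// Standalone illustration of a conditional revisit using ETag / Last-Modified.
public class ConditionalFetchSketch {
    public static void main(String[] args) throws Exception {
        String etag = "\"12345\"";                              // remembered from a previous crawl
        String lastModified = "Wed, 21 Oct 2015 07:28:00 GMT";  // likewise

        HttpRequest request = HttpRequest.newBuilder(URI.create("https://example.com/page"))
                .header("If-None-Match", etag)              // server compares against the current ETag
                .header("If-Modified-Since", lastModified)  // or against the modification date
                .GET()
                .build();

        HttpResponse<byte[]> response = HttpClient.newHttpClient()
                .send(request, HttpResponse.BodyHandlers.ofByteArray());

        if (response.statusCode() == 304) {
            System.out.println("Unchanged since last crawl; reuse the stored body");
        } else {
            System.out.println("Changed or unconditional; got " + response.body().length + " bytes");
        }
    }
}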
@@ -0,0 +1,86 @@
+package nu.marginalia.crawl.retreival.revisit;
+
+import nu.marginalia.crawl.retreival.CrawlDataReference;
+import nu.marginalia.crawl.retreival.fetcher.ContentTags;
+import nu.marginalia.crawling.model.CrawledDocument;
+import org.junit.jupiter.api.Test;
+
+import static org.junit.jupiter.api.Assertions.*;
+
+public class DocumentWithReferenceTest {
+
+    // test case for when doc is null
+    @Test
+    public void getContentTags_docIsNull() {
+        // set up test data
+        CrawledDocument doc = null;
+        CrawlDataReference reference = new CrawlDataReference();
+
+        DocumentWithReference documentWithReference = new DocumentWithReference(doc, reference);
+
+        // execute method under test
+        ContentTags contentTags = documentWithReference.getContentTags();
+
+        // verify that returned content tags is empty
+        assertTrue(contentTags.isEmpty());
+    }
+
+    // test case for when doc is not null, and lastModified and eTag are null
+    @Test
+    public void getContentTags_lastModifiedAndETagIsNull() {
+        // set up test data
+        CrawledDocument doc = CrawledDocument.builder().build(); // both lastModified and eTag are null
+        CrawlDataReference reference = new CrawlDataReference();
+
+        DocumentWithReference documentWithReference = new DocumentWithReference(doc, reference);
+
+        // execute method under test
+        ContentTags contentTags = documentWithReference.getContentTags();
+
+        // verify that returned content tags is empty
+        assertTrue(contentTags.isEmpty());
+    }
+
+    // test case for when doc is not null, and lastModified and eTag are not null
+    @Test
+    public void getContentTags_lastModifiedAndETagAreNotNull_NewCrawlData() {
+        // set up test data
+        CrawledDocument doc = CrawledDocument.builder()
+                .etagMaybe("12345")
+                .lastModifiedMaybe("67890")
+                .build(); // assume lastModified and eTag are not null
+        CrawlDataReference reference = new CrawlDataReference();
+
+        DocumentWithReference documentWithReference = new DocumentWithReference(doc, reference);
+
+        // execute method under test
+        ContentTags contentTags = documentWithReference.getContentTags();
+
+        // verify that returned content tags is present
+        assertFalse(contentTags.isEmpty());
+        assertEquals("12345", contentTags.etag());
+        assertEquals("67890", contentTags.lastMod());
+    }
+
+    @Test
+    public void getContentTags_lastModifiedAndETagAreNotNull_LegacyCrawlData() {
+        // set up test data
+        CrawledDocument doc = CrawledDocument.builder()
+                .headers("""
+                        Etag: 12345
+                        Last-Modified: 67890
+                        """)
+                .build(); // assume lastModified and eTag are not null
+        CrawlDataReference reference = new CrawlDataReference();
+
+        DocumentWithReference documentWithReference = new DocumentWithReference(doc, reference);
+
+        // execute method under test
+        ContentTags contentTags = documentWithReference.getContentTags();
+
+        // verify that returned content tags is present
+        assertFalse(contentTags.isEmpty());
+        assertEquals("12345", contentTags.etag());
+        assertEquals("67890", contentTags.lastMod());
+    }
+}
@@ -234,6 +234,8 @@ class CrawlerRetreiverTest {
         }
         var stream = CrawledDomainReader.createDataStream(tempFile);

+        System.out.println("---");
+
         CrawledDomain domain = (CrawledDomain) data.get(CrawledDomain.class).get(0);
         domain.doc = data.get(CrawledDocument.class).stream().map(CrawledDocument.class::cast).collect(Collectors.toList());
         try (var recorder = new WarcRecorder(tempFile2)) {
@@ -244,8 +246,6 @@ class CrawlerRetreiverTest {
             Assertions.fail(ex);
         }

-        new GZIPInputStream(Files.newInputStream(tempFile2)).transferTo(System.out);
-
         try (var reader = new WarcReader(tempFile2)) {
             WarcXResponseReference.register(reader);

@@ -270,7 +270,7 @@ class CrawlerRetreiverTest {
                 System.out.println(dr.domain + "/" + dr.crawlerStatus);
             }
             else if (doc instanceof CrawledDocument dc) {
-                System.out.println(dc.url + "/" + dc.crawlerStatus + "/" + dc.httpStatus);
+                System.out.println(dc.url + "/" + dc.crawlerStatus + "/" + dc.httpStatus + "/" + dc.timestamp);
             }
         }
     } catch (Exception e) {