(warc) Add fields for the ETag and Last-Modified headers to the new crawl data formats

Make some temporary modifications to the CrawledDocument model to support both a "big string" style headers field, as in the old formats, and explicit ETag/Last-Modified fields, as in the new formats. This is a bit awkward to deal with, but it's a necessity until we migrate off the old formats entirely.

The commit also adds a few tests covering this logic.
Viktor Lofgren 2023-12-18 17:45:54 +01:00
parent 126ac3816f
commit 3a56a06c4f
12 changed files with 294 additions and 29 deletions
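In essence, the temporary shim described in the commit message gives CrawledDocument explicit etagMaybe/lastModifiedMaybe fields for the new formats, plus accessors that fall back to scanning the legacy newline-separated header blob for old-format data. A condensed sketch of that pattern (the class name here is just a stand-in; the real logic is in the CrawledDocument diff below):

// Condensed sketch of the dual-source lookup this commit adds to CrawledDocument.
// New-format (parquet) readers populate the explicit field; old-format (json) data
// only carries the raw header blob, which is scanned on demand.
class HeaderShimSketch {
    String etagMaybe;   // set directly by the new parquet-backed readers
    String headers;     // legacy "big string" of raw response headers

    String getEtag() {
        if (etagMaybe != null)
            return etagMaybe;           // new format: dedicated column
        return getHeader("ETag");       // old format: parse the blob
    }

    private String getHeader(String name) {
        if (headers == null)
            return null;
        String prefix = name + ":";
        for (String line : headers.split("\n")) {
            if (line.regionMatches(true, 0, prefix, 0, prefix.length()))
                return line.substring(prefix.length()).trim();
        }
        return null;
    }
}

The real accessors in the diff below use StringUtils for the case-insensitive prefix match, but the flow is the same.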

View File

@@ -118,7 +118,9 @@ public class ParquetSerializableCrawlDataStream implements AutoCloseable, Serial
nextRecord.url,
null,
"",
-nextRecord.cookies));
+nextRecord.cookies,
+nextRecord.lastModifiedHeader,
+nextRecord.etagHeader));
}
public void close() throws IOException {

View File

@@ -82,6 +82,8 @@ public class WarcSerializableCrawlDataStream implements AutoCloseable, Serializa
return;
}
var httpHeaders = http.headers();
var parsedBody = DocumentBodyExtractor.asString(HttpFetchResult.importWarc(response));
if (parsedBody instanceof DocumentBodyResult.Error<String> error) {
next = new CrawledDocument(
@@ -98,7 +100,9 @@ public class WarcSerializableCrawlDataStream implements AutoCloseable, Serializa
"",
"",
"",
-WarcXCookieInformationHeader.hasCookies(response)
+WarcXCookieInformationHeader.hasCookies(response),
+null,
+null
);
} else if (parsedBody instanceof DocumentBodyResult.Ok<String> ok) {
next = new CrawledDocument(
@@ -115,7 +119,9 @@ public class WarcSerializableCrawlDataStream implements AutoCloseable, Serializa
"",
"",
"",
-WarcXCookieInformationHeader.hasCookies(response));
+WarcXCookieInformationHeader.hasCookies(response),
+httpHeaders.first("Last-Modified").orElse(""),
+httpHeaders.first("ETag").orElse(""));
} else {
// unreachable
throw new IllegalStateException("Unknown body type: " + parsedBody);

View File

@@ -5,6 +5,8 @@ import lombok.Builder;
import lombok.ToString;
import nu.marginalia.bigstring.BigString;
import nu.marginalia.model.EdgeUrl;
import org.apache.commons.lang3.StringUtils;
import org.jetbrains.annotations.Nullable;
@Builder
@AllArgsConstructor
@@ -21,7 +23,10 @@ public class CrawledDocument implements SerializableCrawlData {
public String crawlerStatus;
public String crawlerStatusDesc;
@Nullable
@Deprecated // use getETag() or getLastModified() instead
public String headers;
public String documentBody;
@Deprecated
@@ -38,6 +43,51 @@
* information may come in CrawledDomain instead */
public Boolean hasCookies = false;
public String lastModifiedMaybe;
public String etagMaybe;
@Nullable
private String getHeader(String header) {
if (headers == null) {
return null;
}
String headerString = header + ":";
String[] headersLines = StringUtils.split(headers, '\n');
for (String headerLine : headersLines) {
if (StringUtils.startsWithIgnoreCase(headerLine, headerString)) {
return headerLine.substring(headerString.length()).trim();
}
}
return null;
}
/** Returns the ETag header, or null if not present;
* <p>
* this is a compatibility shim between the old json format, which saves headers in a long string
* and the new parquet format which saves only the ETag and Last-Modified headers in separate columns
* */
public String getEtag() {
if (etagMaybe != null) {
return etagMaybe;
}
return getHeader("ETag");
}
/** Returns the Last-Modified header, or null if not present
* <p>
* this is a compatibility shim between the old json format, which saves headers in a long string
* and the new parquet format which saves only the ETag and Last-Modified headers in separate columns
* */
public String getLastModified() {
if (lastModifiedMaybe != null) {
return lastModifiedMaybe;
}
return getHeader("Last-Modified");
}
public static final String SERIAL_IDENTIFIER = "// DOCUMENT";
@Override
public String getSerialIdentifier() {

View File

@@ -29,6 +29,9 @@ public class CrawledDocumentParquetRecord {
public String contentType;
public byte[] body;
public String etagHeader;
public String lastModifiedHeader;
public static Hydrator<CrawledDocumentParquetRecord, CrawledDocumentParquetRecord> newHydrator() {
return new CrawledDocumentParquetRecordHydrator();
}
@@ -46,7 +49,9 @@ public class CrawledDocumentParquetRecord {
Types.required(INT32).named("httpStatus"),
Types.required(INT64).named("epochSeconds"),
Types.required(BINARY).as(stringType()).named("contentType"),
Types.required(BINARY).named("body")
Types.required(BINARY).named("body"),
Types.optional(BINARY).as(stringType()).named("etagHeader"),
Types.optional(BINARY).as(stringType()).named("lastModifiedHeader")
);
@@ -60,6 +65,9 @@ public class CrawledDocumentParquetRecord {
case "contentType" -> contentType = (String) value;
case "body" -> body = (byte[]) value;
case "epochSeconds" -> timestamp = Instant.ofEpochSecond((Long) value);
case "etagHeader" -> etagHeader = (String) value;
case "lastModifiedHeader" -> lastModifiedHeader = (String) value;
default -> throw new UnsupportedOperationException("Unknown heading '" + heading + "'");
}
return this;
@@ -74,6 +82,12 @@ public class CrawledDocumentParquetRecord {
valueWriter.write("cookies", cookies);
valueWriter.write("contentType", contentType);
valueWriter.write("body", body);
if (etagHeader != null) {
valueWriter.write("etagHeader", etagHeader);
}
if (lastModifiedHeader != null) {
valueWriter.write("lastModifiedHeader", lastModifiedHeader);
}
}
}

View File

@@ -131,11 +131,15 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable {
return;
}
byte[] bodyBytes;
String contentType;
var body = DocumentBodyExtractor.asBytes(result);
var headers = fetchOk.headers();
if (body instanceof DocumentBodyResult.Ok<byte[]> bodyOk) {
bodyBytes = bodyOk.body();
contentType = bodyOk.contentType().toString();
@@ -153,7 +157,9 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable {
fetchOk.statusCode(),
response.date(),
contentType,
-bodyBytes)
+bodyBytes,
+headers.get("ETag"),
+headers.get("Last-Modified"))
);
}
@@ -170,7 +176,9 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable {
0,
date,
"x-marginalia/advisory;state=redirect",
-new byte[0]
+new byte[0],
+null,
+null
);
}
private CrawledDocumentParquetRecord forDomainError(String domain, Instant date, String ip, String errorStatus) {
@@ -181,7 +189,9 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable {
0,
date,
"x-marginalia/advisory;state=error",
-errorStatus.getBytes()
+errorStatus.getBytes(),
+null,
+null
);
}
@@ -193,7 +203,9 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable {
0,
date,
errorStatus,
-new byte[0]
+new byte[0],
+null,
+null
);
}

View File

@@ -0,0 +1,101 @@
package nu.marginalia.crawling.model;
import org.junit.jupiter.api.Test;
import static org.junit.jupiter.api.Assertions.*;
class CrawledDocumentTest {
/** These tests are AI-generated hence have kinda inconsistent naming */
@Test
void getEtagShouldReturnEtagIfPresent() {
CrawledDocument crawledDocument = CrawledDocument.builder()
.etagMaybe("12345")
.build();
// Etag is present, method should return it.
String etag = crawledDocument.getEtag();
assertEquals("12345", etag);
}
@Test
void getEtagShouldReturnNullIfEtagIsAbsentAndHeadersAreNull() {
CrawledDocument crawledDocument = CrawledDocument.builder()
.etagMaybe(null)
.headers(null)
.build();
// Etag and headers are absent, method should return null.
String etag = crawledDocument.getEtag();
assertNull(etag);
}
@Test
void getEtagShouldReturnNullIfEtagIsAbsentAndHeadersDoNotContainEtag() {
CrawledDocument crawledDocument = CrawledDocument.builder()
.etagMaybe(null)
.headers("Some irrelevant headers")
.build();
// Headers do not contain an ETag, method should return null.
String etag = crawledDocument.getEtag();
assertNull(etag);
}
@Test
void getEtagShouldReturnEtagFromHeadersIfPresent() {
CrawledDocument crawledDocument = CrawledDocument.builder()
.etagMaybe(null)
.headers("ETag: 67890")
.build();
// Headers contain an ETag, method should return it.
String etag = crawledDocument.getEtag();
assertEquals("67890", etag);
}
@Test
public void testGetLastModified_withLastModifiedDateInHeaders() {
// Arrange
String lastModifiedDate = "Wed, 21 Oct 2015 07:28:00 GMT";
CrawledDocument crawledDocument = CrawledDocument.builder()
.headers("Last-Modified: " + lastModifiedDate)
.build();
// Act
String actualLastModifiedDate = crawledDocument.getLastModified();
// Assert
assertEquals(lastModifiedDate, actualLastModifiedDate);
}
@Test
public void testGetLastModified_withoutLastModifiedDateInHeaders() {
// Arrange
CrawledDocument crawledDocument = CrawledDocument.builder()
.headers("Some-Other-Header: Some value")
.build();
// Act
String actualLastModifiedDate = crawledDocument.getLastModified();
// Assert
assertNull(actualLastModifiedDate);
}
@Test
public void testGetLastModified_withLastModifiedDateInField() {
// Arrange
String lastModifiedDate = "Wed, 21 Oct 2015 07:28:00 GMT";
CrawledDocument crawledDocument = CrawledDocument.builder()
.lastModifiedMaybe(lastModifiedDate)
.build();
// Act
String actualLastModifiedDate = crawledDocument.getLastModified();
// Assert
assertEquals(lastModifiedDate, actualLastModifiedDate);
}
}

View File

@@ -38,7 +38,8 @@ class CrawledDocumentParquetRecordFileWriterTest {
200,
Instant.now(),
"text/html",
"hello world".getBytes());
"hello world".getBytes(),
null, null);
try (var writer = new CrawledDocumentParquetRecordFileWriter(tempFile)) {
writer.write(original);

View File

@@ -51,7 +51,9 @@ public class SideloaderProcessing {
url,
"",
"SIDELOAD",
-false
+false,
+null,
+null
);
var ret = new ProcessedDocument();

View File

@@ -116,7 +116,9 @@ public class ConvertingIntegrationTest {
"https://memex.marginalia.nu/" + file,
null,
"",
-false
+false,
+null,
+null
);
docs.add(doc);
}

View File

@@ -49,22 +49,11 @@ public record DocumentWithReference(
if (null == doc)
return ContentTags.empty();
-String headers = doc.headers;
-if (headers == null)
+String lastmod = doc.getLastModified();
+String etag = doc.getEtag();
+if (lastmod == null && etag == null) {
return ContentTags.empty();
+}
-String[] headersLines = headers.split("\n");
-String lastmod = null;
-String etag = null;
-for (String line : headersLines) {
-    if (line.toLowerCase().startsWith("etag:")) {
-        etag = line.substring(5).trim();
-    }
-    if (line.toLowerCase().startsWith("last-modified:")) {
-        lastmod = line.substring(14).trim();
-    }
-}
return new ContentTags(etag, lastmod);
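The rewrite above delegates header parsing to the new CrawledDocument accessors, and the resulting ContentTags exist so the next crawl can issue conditional requests. The fetcher side is not part of this diff, so the following sketch only illustrates the intent, using the JDK HttpClient as a stand-in for the crawler's actual request-building code:

// Illustrative only: how an ETag / Last-Modified pair typically becomes a pair of
// conditional request headers. 'etag' and 'lastMod' stand in for ContentTags.etag()
// and ContentTags.lastMod(); the crawler's real fetcher API is not shown in this diff.
import java.net.URI;
import java.net.http.HttpRequest;

class ConditionalRequestSketch {
    static HttpRequest build(URI url, String etag, String lastMod) {
        var builder = HttpRequest.newBuilder(url).GET();
        if (etag != null)
            builder.header("If-None-Match", etag);        // unchanged content -> 304 Not Modified
        if (lastMod != null)
            builder.header("If-Modified-Since", lastMod); // date-based revalidation
        return builder.build();
    }
}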

View File

@@ -0,0 +1,86 @@
package nu.marginalia.crawl.retreival.revisit;
import nu.marginalia.crawl.retreival.CrawlDataReference;
import nu.marginalia.crawl.retreival.fetcher.ContentTags;
import nu.marginalia.crawling.model.CrawledDocument;
import org.junit.jupiter.api.Test;
import static org.junit.jupiter.api.Assertions.*;
public class DocumentWithReferenceTest {
// test case for when doc is null
@Test
public void getContentTags_docIsNull() {
// set up test data
CrawledDocument doc = null;
CrawlDataReference reference = new CrawlDataReference();
DocumentWithReference documentWithReference = new DocumentWithReference(doc, reference);
// execute method under test
ContentTags contentTags = documentWithReference.getContentTags();
// verify that returned content tags is empty
assertTrue(contentTags.isEmpty());
}
// test case for when doc is not null, and lastModified and eTag are null
@Test
public void getContentTags_lastModifiedAndETagIsNull() {
// set up test data
CrawledDocument doc = CrawledDocument.builder().build(); // both lastModified and eTag are null
CrawlDataReference reference = new CrawlDataReference();
DocumentWithReference documentWithReference = new DocumentWithReference(doc, reference);
// execute method under test
ContentTags contentTags = documentWithReference.getContentTags();
// verify that returned content tags is empty
assertTrue(contentTags.isEmpty());
}
// test case for when doc is not null, and lastModified and eTag are not null
@Test
public void getContentTags_lastModifiedAndETagAreNotNull_NewCrawlData() {
// set up test data
CrawledDocument doc = CrawledDocument.builder()
.etagMaybe("12345")
.lastModifiedMaybe("67890")
.build(); // assume lastModified and eTag are not null
CrawlDataReference reference = new CrawlDataReference();
DocumentWithReference documentWithReference = new DocumentWithReference(doc, reference);
// execute method under test
ContentTags contentTags = documentWithReference.getContentTags();
// verify that returned content tags is present
assertFalse(contentTags.isEmpty());
assertEquals("12345", contentTags.etag());
assertEquals("67890", contentTags.lastMod());
}
@Test
public void getContentTags_lastModifiedAndETagAreNotNull_LegacyCrawlData() {
// set up test data
CrawledDocument doc = CrawledDocument.builder()
.headers("""
Etag: 12345
Last-Modified: 67890
""")
.build(); // assume lastModified and eTag are not null
CrawlDataReference reference = new CrawlDataReference();
DocumentWithReference documentWithReference = new DocumentWithReference(doc, reference);
// execute method under test
ContentTags contentTags = documentWithReference.getContentTags();
// verify that returned content tags is present
assertFalse(contentTags.isEmpty());
assertEquals("12345", contentTags.etag());
assertEquals("67890", contentTags.lastMod());
}
}

View File

@@ -234,6 +234,8 @@ class CrawlerRetreiverTest {
}
var stream = CrawledDomainReader.createDataStream(tempFile);
System.out.println("---");
CrawledDomain domain = (CrawledDomain) data.get(CrawledDomain.class).get(0);
domain.doc = data.get(CrawledDocument.class).stream().map(CrawledDocument.class::cast).collect(Collectors.toList());
try (var recorder = new WarcRecorder(tempFile2)) {
@@ -244,8 +246,6 @@ class CrawlerRetreiverTest {
Assertions.fail(ex);
}
-new GZIPInputStream(Files.newInputStream(tempFile2)).transferTo(System.out);
try (var reader = new WarcReader(tempFile2)) {
WarcXResponseReference.register(reader);
@@ -270,7 +270,7 @@ class CrawlerRetreiverTest {
System.out.println(dr.domain + "/" + dr.crawlerStatus);
}
else if (doc instanceof CrawledDocument dc) {
-System.out.println(dc.url + "/" + dc.crawlerStatus + "/" + dc.httpStatus);
+System.out.println(dc.url + "/" + dc.crawlerStatus + "/" + dc.httpStatus + "/" + dc.timestamp);
}
}
} catch (Exception e) {