(converter) Read cookie information

Add an optional new field to CrawledDocument containing information about whether the domain has cookies. This was previously on the CrawledDomain object, but since the WARC format requires us to write a WarcInfo record at the start of a crawl rather than at the end, this information is unobtainable when creating the CrawledDomain object.
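In practice a consumer now has to consult both locations, depending on the vintage of the serialized data. A minimal null-safe sketch of such a lookup, using the field names from the diff below (the helper itself is hypothetical, not part of this commit):

    // Hypothetical helper, not part of this commit: resolves the cookie flag
    // across format versions.
    static boolean hasCookieInformation(CrawledDocument doc, CrawledDomain domain) {
        if (Boolean.TRUE.equals(doc.hasCookies))    // new optional field; may be null in old data
            return true;
        return domain.cookies != null && !domain.cookies.isEmpty();  // legacy location on CrawledDomain
    }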

Also fix a bug in the deduplication logic in the DomainProcessor class that caused a test to break.
Viktor Lofgren 2023-12-15 18:09:53 +01:00
parent fa81e5b8ee
commit cf935a5331
8 changed files with 33 additions and 8 deletions


@@ -101,7 +101,8 @@ public class ParquetSerializableCrawlDataStream implements AutoCloseable, SerializableCrawlDataStream {
                 "",
                 nextRecord.url,
                 null,
-                ""));
+                "",
+                nextRecord.cookies));
     }
 
     public void close() throws IOException {


@@ -69,7 +69,6 @@ public class WarcSerializableCrawlDataStream implements AutoCloseable, SerializableCrawlDataStream {
             redirectDomain = statusReason;
         }
 
-        // TODO: Fix cookies info somehow
         next = new CrawledDomain(domain, redirectDomain, status, statusReason, ip,
                 new ArrayList<>(),
                 new ArrayList<>()
@@ -98,7 +97,9 @@ public class WarcSerializableCrawlDataStream implements AutoCloseable, SerializableCrawlDataStream {
                     response.payloadDigest().map(WarcDigest::base64).orElse(""),
                     "",
                     "",
-                    "");
+                    "",
+                    WarcXCookieInformationHeader.hasCookies(response)
+            );
         } else if (parsedBody instanceof DocumentBodyResult.Ok<String> ok) {
             next = new CrawledDocument(
                     "",
@@ -113,7 +114,8 @@ public class WarcSerializableCrawlDataStream implements AutoCloseable, SerializableCrawlDataStream {
                     response.payloadDigest().map(WarcDigest::base64).orElse(""),
                     "",
                     "",
-                    "");
+                    "",
+                    WarcXCookieInformationHeader.hasCookies(response));
         } else {
             // unreachable
             throw new IllegalStateException("Unknown body type: " + parsedBody);
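WarcXCookieInformationHeader.hasCookies(response) is what recovers the flag when replaying a WARC file. The diff doesn't show its implementation; a plausible sketch, assuming it reads a custom header stamped on the record at crawl time (both the header name and the jwarc-style API usage are assumptions):

    // Sketch only: "X-Has-Cookies" is an assumed header name, not confirmed by this diff.
    static boolean hasCookies(org.netpreserve.jwarc.WarcRecord record) {
        return record.headers().first("X-Has-Cookies")
                .map(Boolean::parseBoolean)
                .orElse(false);
    }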


@@ -30,6 +30,10 @@ public class CrawledDocument implements SerializableCrawlData {
     public String recrawlState;
 
+    /** This is not guaranteed to be set in all versions of the format,
+     * information may come in CrawledDomain instead */
+    public Boolean hasCookies = false;
+
     public static final String SERIAL_IDENTIFIER = "// DOCUMENT";
 
     @Override
     public String getSerialIdentifier() {
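Note the boxed Boolean rather than a primitive boolean: records serialized before this change carry no such field, so readers must treat null as unknown and guard with a null-safe comparison, as the DomainProcessor change further down does:

    // Null-safe: evaluates to false for both null (old data) and Boolean.FALSE.
    if (Boolean.TRUE.equals(doc.hasCookies)) {
        cookies = true;
    }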


@@ -17,6 +17,9 @@ public class CrawledDomain implements SerializableCrawlData {
     public String ip;
     public List<CrawledDocument> doc;
 
+    /** This is not guaranteed to be set in all versions of the format,
+     * information may come in CrawledDocument instead */
     public List<String> cookies;
 
     public int size() {


@@ -97,11 +97,15 @@ public class DomainProcessor {
             }
             else if (data instanceof CrawledDocument doc) {
                 try {
-                    if (doc.url == null || processedUrls.add(doc.url))
+                    if (doc.url == null || !processedUrls.add(doc.url))
                         continue;
 
                     fixBadCanonicalTag(doc);
 
+                    if (Boolean.TRUE.equals(doc.hasCookies)) {
+                        cookies = true;
+                    }
+
                     // This case should never be reachable, as we should have initiated
                     // the externalDomainLinks variable above if we made it past the
                     // doc.url == null check; but we'll leave it here just in case
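The corrected condition leans on Set.add returning true only when the element was not already present; the old version continued past first-seen URLs and processed the duplicates instead. A standalone illustration of the idiom:

    import java.util.HashSet;
    import java.util.Set;

    Set<String> processedUrls = new HashSet<>();
    processedUrls.add("https://example.com/a");   // true  -> first sighting, keep processing
    processedUrls.add("https://example.com/a");   // false -> duplicate, so !add(...) triggers the continue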


@@ -50,7 +50,8 @@ public class SideloaderProcessing {
                 Integer.toHexString(url.hashCode()),
                 url,
                 "",
-                "SIDELOAD"
+                "SIDELOAD",
+                false
         );
 
         var ret = new ProcessedDocument();


@@ -65,6 +65,7 @@ public class ConvertingIntegrationTest {
     @Test
     public void testMemexMarginaliaNu() throws IOException {
         var ret = domainProcessor.process(asSerializableCrawlData(readMarginaliaWorkingSet()));
+        assertNotNull(ret);
 
         assertEquals(ret.state, DomainIndexingState.ACTIVE);
         assertEquals(ret.domain, new EdgeDomain("memex.marginalia.nu"));
@@ -114,7 +115,8 @@ public class ConvertingIntegrationTest {
                     Double.toString(Math.random()),
                     "https://memex.marginalia.nu/" + file,
                     null,
-                    ""
+                    "",
+                    false
             );
             docs.add(doc);
         }


@@ -11,10 +11,13 @@ import nu.marginalia.crawl.retreival.fetcher.HttpFetcher;
 import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl;
 import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
 import nu.marginalia.crawling.io.SerializableCrawlDataStream;
+import nu.marginalia.crawling.io.format.ParquetSerializableCrawlDataStream;
 import nu.marginalia.crawling.io.format.WarcSerializableCrawlDataStream;
 import nu.marginalia.crawling.model.CrawledDocument;
 import nu.marginalia.crawling.model.CrawledDomain;
 import nu.marginalia.crawling.model.SerializableCrawlData;
+import nu.marginalia.crawling.parquet.CrawledDocumentParquetRecord;
+import nu.marginalia.crawling.parquet.CrawledDocumentParquetRecordFileWriter;
 import nu.marginalia.model.crawlspec.CrawlSpecRecord;
 import org.junit.jupiter.api.*;
@@ -31,6 +34,7 @@ public class CrawlingThenConvertingIntegrationTest {
     private HttpFetcher httpFetcher;
 
     private Path fileName;
+    private Path fileName2;
 
     @SneakyThrows
     @BeforeAll
@@ -49,11 +53,13 @@ public class CrawlingThenConvertingIntegrationTest {
         domainProcessor = injector.getInstance(DomainProcessor.class);
         httpFetcher = new HttpFetcherImpl(WmsaHome.getUserAgent().uaString());
         this.fileName = Files.createTempFile("crawling-then-converting", ".warc.gz");
+        this.fileName2 = Files.createTempFile("crawling-then-converting", ".warc.gz");
     }
 
     @AfterEach
     public void tearDown() throws IOException {
         Files.deleteIfExists(fileName);
+        Files.deleteIfExists(fileName2);
     }
 
     @Test
@@ -90,7 +96,9 @@ public class CrawlingThenConvertingIntegrationTest {
             new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder).fetch();
         }
 
-        try (var reader = new WarcSerializableCrawlDataStream(fileName)) {
+        CrawledDocumentParquetRecordFileWriter.convertWarc(specs.domain, fileName, fileName2);
+
+        try (var reader = new ParquetSerializableCrawlDataStream(fileName2)) {
             while (reader.hasNext()) {
                 data.add(reader.next());
             }
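Reading the data back through the parquet format, instead of directly from the WARC stream as before, means the cookie flag now travels through CrawledDocumentParquetRecord and ParquetSerializableCrawlDataStream on its way into the assertions. The same round trip in isolation (the domain string and the parquet file suffix are placeholders):

    Path warc = Files.createTempFile("crawl", ".warc.gz");
    Path parquet = Files.createTempFile("crawl", ".parquet");  // suffix is illustrative

    // Convert the recorded WARC data to the parquet format...
    CrawledDocumentParquetRecordFileWriter.convertWarc("www.example.com", warc, parquet);

    // ...then stream it back as crawl data, cookie flag included.
    try (var reader = new ParquetSerializableCrawlDataStream(parquet)) {
        while (reader.hasNext()) {
            if (reader.next() instanceof CrawledDocument doc) {
                System.out.println(doc.url + " hasCookies=" + doc.hasCookies);
            }
        }
    }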