(converter) Read cookie information
Add an optional new field to CrawledDocument with information about whether the domain has cookies. This field was previously only present on the CrawledDomain object, but since the WARC format requires us to write a WarcInfo record at the start of a crawl rather than at the end, the information is not yet available when the CrawledDomain object is created.

Also fix a bug in the deduplication logic in the DomainProcessor class that caused a test to break.
parent fa81e5b8ee
commit cf935a5331
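In practice this means a consumer can no longer rely on a single location for the cookie signal. A hypothetical helper, not part of this commit, illustrating the fallback a reader of mixed-version crawl data would need:

import nu.marginalia.crawling.model.CrawledDocument;
import nu.marginalia.crawling.model.CrawledDomain;

// Sketch only: older serialized crawls carry a cookie list on CrawledDomain,
// newer WARC/parquet streams carry a per-document hasCookies flag instead.
public class CookieInfoSketch {
    public static boolean domainHasCookies(CrawledDomain domain) {
        if (domain.cookies != null && !domain.cookies.isEmpty())
            return true;  // legacy location of the information
        if (domain.doc == null)
            return false;
        return domain.doc.stream()  // new location: per-document flag
                .anyMatch(doc -> Boolean.TRUE.equals(doc.hasCookies));
    }
}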
@@ -101,7 +101,8 @@ public class ParquetSerializableCrawlDataStream implements AutoCloseable, Serial
                 "",
                 nextRecord.url,
                 null,
-                ""));
+                "",
+                nextRecord.cookies));
     }

     public void close() throws IOException {
@@ -69,7 +69,6 @@ public class WarcSerializableCrawlDataStream implements AutoCloseable, Serializa
             redirectDomain = statusReason;
         }

-        // TODO: Fix cookies info somehow
         next = new CrawledDomain(domain, redirectDomain, status, statusReason, ip,
                 new ArrayList<>(),
                 new ArrayList<>()
@@ -98,7 +97,9 @@ public class WarcSerializableCrawlDataStream implements AutoCloseable, Serializa
                     response.payloadDigest().map(WarcDigest::base64).orElse(""),
                     "",
                     "",
-                    "");
+                    "",
+                    WarcXCookieInformationHeader.hasCookies(response)
+            );
         } else if (parsedBody instanceof DocumentBodyResult.Ok<String> ok) {
             next = new CrawledDocument(
                     "",
@@ -113,7 +114,8 @@ public class WarcSerializableCrawlDataStream implements AutoCloseable, Serializa
                     response.payloadDigest().map(WarcDigest::base64).orElse(""),
                     "",
                     "",
-                    "");
+                    "",
+                    WarcXCookieInformationHeader.hasCookies(response));
         } else {
             // unreachable
             throw new IllegalStateException("Unknown body type: " + parsedBody);
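Both hunks above read the flag back with WarcXCookieInformationHeader.hasCookies(response), which suggests the cookie signal travels as an extension header on the WARC record itself. A minimal sketch of that pattern on top of the jwarc API; the header name "X-Has-Cookies" and the class shape are assumptions, not the actual implementation:

import java.util.Optional;
import org.netpreserve.jwarc.WarcRecord;

// Hypothetical sketch of a WARC extension header carrying cookie info;
// the header name is an assumption, not taken from this commit.
final class XCookieHeaderSketch {
    private static final String HEADER = "X-Has-Cookies";

    // Writer side: tag the record while building it.
    static void tag(WarcRecord.Builder<?, ?> builder, boolean hasCookies) {
        builder.addHeader(HEADER, hasCookies ? "1" : "0");
    }

    // Reader side: recover the flag, defaulting to false when absent.
    static boolean hasCookies(WarcRecord record) {
        Optional<String> value = record.headers().first(HEADER);
        return value.map("1"::equals).orElse(false);
    }
}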
@@ -30,6 +30,10 @@ public class CrawledDocument implements SerializableCrawlData {

     public String recrawlState;

+    /** This is not guaranteed to be set in all versions of the format,
+     *  information may come in CrawledDomain instead */
+    public Boolean hasCookies = false;
+
     public static final String SERIAL_IDENTIFIER = "// DOCUMENT";
     @Override
     public String getSerialIdentifier() {
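Note that hasCookies is a boxed Boolean rather than a primitive, so records deserialized from older formats can still carry null here despite the field initializer; the null-safe read used in the DomainProcessor hunk below avoids an auto-unboxing NullPointerException. A minimal illustration:

public class NullSafeBooleanDemo {
    public static void main(String[] args) {
        Boolean hasCookies = null;  // e.g. a field an older format never set
        // if (hasCookies) { ... }  // auto-unboxing null would throw NullPointerException
        if (Boolean.TRUE.equals(hasCookies)) {  // null-safe: simply false for null
            System.out.println("cookies seen");
        }
        System.out.println("done");  // reached without throwing
    }
}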
@@ -17,6 +17,9 @@ public class CrawledDomain implements SerializableCrawlData {
     public String ip;

     public List<CrawledDocument> doc;
+
+    /** This is not guaranteed to be set in all versions of the format,
+     *  information may come in CrawledDocument instead */
     public List<String> cookies;

     public int size() {
@@ -97,11 +97,15 @@ public class DomainProcessor {
             }
             else if (data instanceof CrawledDocument doc) {
                 try {
-                    if (doc.url == null || processedUrls.add(doc.url))
+                    if (doc.url == null || !processedUrls.add(doc.url))
                         continue;

                     fixBadCanonicalTag(doc);

+                    if (Boolean.TRUE.equals(doc.hasCookies)) {
+                        cookies = true;
+                    }
+
                     // This case should never be reachable, as we should have initiated
                     // the externalDomainLinks variable above if we made it past the
                     // doc.url == null check; but we'll leave it here just in case
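The deduplication fix above hinges on the return value of Set.add: it is true only when the element was newly inserted, so the skip condition needs the negation. Without it, the old code skipped every first-seen URL and processed only duplicates. A minimal demonstration:

import java.util.HashSet;
import java.util.List;
import java.util.Set;

public class DedupDemo {
    public static void main(String[] args) {
        Set<String> processedUrls = new HashSet<>();
        for (String url : List.of("/a", "/b", "/a")) {
            if (!processedUrls.add(url))
                continue;  // add() returned false: already seen, skip
            System.out.println("processing " + url);  // prints /a then /b
        }
    }
}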
@@ -50,7 +50,8 @@ public class SideloaderProcessing {
                 Integer.toHexString(url.hashCode()),
                 url,
                 "",
-                "SIDELOAD"
+                "SIDELOAD",
+                false
         );

         var ret = new ProcessedDocument();
@@ -65,6 +65,7 @@ public class ConvertingIntegrationTest {
     @Test
     public void testMemexMarginaliaNu() throws IOException {
         var ret = domainProcessor.process(asSerializableCrawlData(readMarginaliaWorkingSet()));
+        assertNotNull(ret);
         assertEquals(ret.state, DomainIndexingState.ACTIVE);
         assertEquals(ret.domain, new EdgeDomain("memex.marginalia.nu"));

@@ -114,7 +115,8 @@ public class ConvertingIntegrationTest {
                 Double.toString(Math.random()),
                 "https://memex.marginalia.nu/" + file,
                 null,
-                ""
+                "",
+                false
         );
         docs.add(doc);
     }
@@ -11,10 +11,13 @@ import nu.marginalia.crawl.retreival.fetcher.HttpFetcher;
 import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl;
 import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
 import nu.marginalia.crawling.io.SerializableCrawlDataStream;
+import nu.marginalia.crawling.io.format.ParquetSerializableCrawlDataStream;
 import nu.marginalia.crawling.io.format.WarcSerializableCrawlDataStream;
 import nu.marginalia.crawling.model.CrawledDocument;
 import nu.marginalia.crawling.model.CrawledDomain;
 import nu.marginalia.crawling.model.SerializableCrawlData;
+import nu.marginalia.crawling.parquet.CrawledDocumentParquetRecord;
+import nu.marginalia.crawling.parquet.CrawledDocumentParquetRecordFileWriter;
 import nu.marginalia.model.crawlspec.CrawlSpecRecord;
 import org.junit.jupiter.api.*;

@@ -31,6 +34,7 @@ public class CrawlingThenConvertingIntegrationTest {
     private HttpFetcher httpFetcher;

     private Path fileName;
+    private Path fileName2;

     @SneakyThrows
     @BeforeAll
@@ -49,11 +53,13 @@ public class CrawlingThenConvertingIntegrationTest {
         domainProcessor = injector.getInstance(DomainProcessor.class);
         httpFetcher = new HttpFetcherImpl(WmsaHome.getUserAgent().uaString());
         this.fileName = Files.createTempFile("crawling-then-converting", ".warc.gz");
+        this.fileName2 = Files.createTempFile("crawling-then-converting", ".warc.gz");
     }

     @AfterEach
     public void tearDown() throws IOException {
         Files.deleteIfExists(fileName);
+        Files.deleteIfExists(fileName2);
     }

     @Test
@@ -90,7 +96,9 @@ public class CrawlingThenConvertingIntegrationTest {
             new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder).fetch();
         }

-        try (var reader = new WarcSerializableCrawlDataStream(fileName)) {
+        CrawledDocumentParquetRecordFileWriter.convertWarc(specs.domain, fileName, fileName2);
+
+        try (var reader = new ParquetSerializableCrawlDataStream(fileName2)) {
             while (reader.hasNext()) {
                 data.add(reader.next());
             }
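For reference, a condensed sketch of the round trip the updated test now exercises, using only the types and signatures visible in this diff; the domain name and file names are placeholders, and the crawl that fills the WARC file is elided:

import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
import nu.marginalia.crawling.io.format.ParquetSerializableCrawlDataStream;
import nu.marginalia.crawling.model.SerializableCrawlData;
import nu.marginalia.crawling.parquet.CrawledDocumentParquetRecordFileWriter;

public class RoundTripSketch {
    public static List<SerializableCrawlData> readBack() throws Exception {
        Path warcFile = Files.createTempFile("crawl", ".warc.gz");
        Path parquetFile = Files.createTempFile("crawl", ".parquet");

        // ... run CrawlerRetreiver with a WarcRecorder writing to warcFile ...

        // Convert the raw WARC output into the parquet crawl-data format,
        CrawledDocumentParquetRecordFileWriter.convertWarc("www.example.com", warcFile, parquetFile);

        // then stream the records back for the converter to process.
        List<SerializableCrawlData> data = new ArrayList<>();
        try (var reader = new ParquetSerializableCrawlDataStream(parquetFile)) {
            while (reader.hasNext()) {
                data.add(reader.next());
            }
        }
        return data;
    }
}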