diff --git a/code/features-crawl/content-type/build.gradle b/code/features-crawl/content-type/build.gradle index 17eaea3f..73a155cb 100644 --- a/code/features-crawl/content-type/build.gradle +++ b/code/features-crawl/content-type/build.gradle @@ -21,6 +21,7 @@ dependencies { testImplementation libs.bundles.slf4j.test implementation libs.jsoup + implementation libs.commons.lang3 testImplementation libs.bundles.slf4j.test testImplementation libs.bundles.junit diff --git a/code/features-crawl/content-type/src/main/java/nu/marginalia/contenttype/ContentType.java b/code/features-crawl/content-type/src/main/java/nu/marginalia/contenttype/ContentType.java index 374788b4..095497c8 100644 --- a/code/features-crawl/content-type/src/main/java/nu/marginalia/contenttype/ContentType.java +++ b/code/features-crawl/content-type/src/main/java/nu/marginalia/contenttype/ContentType.java @@ -1,9 +1,29 @@ package nu.marginalia.contenttype; +import org.apache.commons.lang3.StringUtils; + /** Content type and charset of a document * @param contentType The content type, e.g. "text/html" * @param charset The charset, e.g. "UTF-8" */ public record ContentType(String contentType, String charset) { + public static ContentType parse(String contentTypeHeader) { + String[] parts = StringUtils.split(contentTypeHeader, ";", 2); + String contentType = parts[0].trim(); + String charset = parts.length > 1 ? StringUtils.removeStartIgnoreCase(parts[1].trim(), "charset=").trim() : "UTF-8"; + return new ContentType(contentType, charset); + } + + public boolean is(String contentType) { + return this.contentType.equalsIgnoreCase(contentType); + } + + @Override + public String toString() { + if (charset == null || charset.isBlank()) + return contentType; + + return STR."\{contentType}; charset=\{charset}"; + } } diff --git a/code/libraries/geo-ip/src/main/java/nu/marginalia/geoip/GeoIpDictionary.java b/code/libraries/geo-ip/src/main/java/nu/marginalia/geoip/GeoIpDictionary.java index 13b982f5..67dd6366 100644 --- a/code/libraries/geo-ip/src/main/java/nu/marginalia/geoip/GeoIpDictionary.java +++ b/code/libraries/geo-ip/src/main/java/nu/marginalia/geoip/GeoIpDictionary.java @@ -37,7 +37,9 @@ public class GeoIpDictionary { throw new RuntimeException(e); } finally { - this.notifyAll(); + synchronized (this) { + this.notifyAll(); + } } }); } diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/body/ContentTypeLogic.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/body/ContentTypeLogic.java index d7dfa6d1..d884dbe5 100644 --- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/body/ContentTypeLogic.java +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/body/ContentTypeLogic.java @@ -1,5 +1,6 @@ package nu.marginalia.crawling.body; +import nu.marginalia.contenttype.ContentType; import nu.marginalia.model.EdgeUrl; import java.util.List; @@ -37,6 +38,9 @@ public class ContentTypeLogic { return probableBinaryPattern.test(pathLowerCase); } + public boolean isAllowableContentType(ContentType contentType) { + return isAllowableContentType(contentType.contentType()); + } public boolean isAllowableContentType(String contentType) { if (allowAllContentTypes) return true;
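To make the new helper's behavior concrete, a quick usage sketch (not part of the change set; header values are invented):

```java
// Hypothetical round-trip through ContentType.parse:
ContentType ct = ContentType.parse("text/html; charset=iso-8859-1");

ct.contentType();   // "text/html"
ct.charset();       // "iso-8859-1" -- the "charset=" prefix is stripped off
ct.is("TEXT/HTML"); // true, the comparison is case-insensitive
ct.toString();      // "text/html; charset=iso-8859-1", round-tripping the header

// When no parameter is present, the charset defaults to UTF-8:
ContentType.parse("application/json").charset(); // "UTF-8"
```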
diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/body/DocumentBodyExtractor.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/body/DocumentBodyExtractor.java index 00ceac86..a485e5bc 100644 --- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/body/DocumentBodyExtractor.java +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/body/DocumentBodyExtractor.java @@ -1,5 +1,6 @@ package nu.marginalia.crawling.body; +import nu.marginalia.contenttype.ContentType; import nu.marginalia.contenttype.ContentTypeParser; import nu.marginalia.contenttype.DocumentBodyToString; import nu.marginalia.crawling.model.CrawlerDocumentStatus; @@ -7,7 +8,6 @@ import org.apache.commons.io.input.BOMInputStream; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.io.IOException; import java.util.zip.GZIPInputStream; public class DocumentBodyExtractor { @@ -15,28 +15,38 @@ public class DocumentBodyExtractor { private static final Logger logger = LoggerFactory.getLogger(DocumentBodyExtractor.class); - public static DocumentBodyResult<String> asString(HttpFetchResult result) { - if (result instanceof HttpFetchResult.ResultOk ok) { - return asString(ok); - } - else if (result instanceof HttpFetchResult.ResultRetained retained) { - return new DocumentBodyResult.Ok<>(retained.contentType(), retained.body()); - } - - return new DocumentBodyResult.Error<>(CrawlerDocumentStatus.ERROR, "Fetch Result Not Ok"); - } - + /** Extract the body from a fetch result as a byte array. */ public static DocumentBodyResult<byte[]> asBytes(HttpFetchResult result) { if (result instanceof HttpFetchResult.ResultOk fetchOk) { return asBytes(fetchOk); } - else if (result instanceof HttpFetchResult.ResultRetained retained) { + else if (result instanceof HttpFetchResult.Result304ReplacedWithReference retained) { return new DocumentBodyResult.Ok<>(retained.contentType(), retained.body().getBytes()); } return new DocumentBodyResult.Error<>(CrawlerDocumentStatus.ERROR, "Fetch Result Not Ok"); } + /** Extract the body from a fetch result as a string. This function checks + * the content type first, to ensure that converting the body to a string + * makes sense, and returns a BAD_CONTENT_TYPE error otherwise. + * + * @see ContentTypeLogic#isAllowableContentType(String) + * */ + public static DocumentBodyResult<String> asString(HttpFetchResult result) { + return asBytes(result).flatMap(DocumentBodyExtractor::toStringResult); + } + + private static DocumentBodyResult<String> toStringResult(ContentType contentType, byte[] bytes) { + if (contentTypeLogic.isAllowableContentType(contentType)) { + return new DocumentBodyResult.Ok<>(contentType, DocumentBodyToString.getStringData(contentType, bytes)); + } + else { + return new DocumentBodyResult.Error<>(CrawlerDocumentStatus.BAD_CONTENT_TYPE, ""); + } + }
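A sketch of what this refactoring buys a call site: the checked decoding happens once inside flatMap, and consumers only see the happy path. (Hypothetical usage; `result` is assumed to be an HttpFetchResult already in scope.)

```java
// Hypothetical consumer of the new asString pipeline.
DocumentBodyResult<String> bodyResult = DocumentBodyExtractor.asString(result);

// Errors (fetch failures, BAD_CONTENT_TYPE) short-circuit; the lambda only
// runs for the Ok case, so no instanceof ceremony is needed here. Note that
// ifPresent propagates checked exceptions, so the enclosing method must
// declare `throws Exception`.
bodyResult.ifPresent((contentType, body) ->
        System.out.println(STR."\{contentType}: \{body.length()} chars"));
```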
+ /** Extract the body from a fetch result as a byte array. */ public static DocumentBodyResult<byte[]> asBytes(HttpFetchResult.ResultOk rsp) { try { var byteStream = rsp.getInputStream(); @@ -51,44 +61,11 @@ public class DocumentBodyExtractor { byte[] data = byteStream.readAllBytes(); // size is limited by WarcRecorder var contentType = ContentTypeParser.parseContentType(contentTypeHeader, data); - return new DocumentBodyResult.Ok<>(contentType.contentType(), data); + return new DocumentBodyResult.Ok<>(contentType, data); } catch (Exception ex) { logger.error("Failed to extract body", ex); return new DocumentBodyResult.Error<>(CrawlerDocumentStatus.ERROR, ""); } } - public static DocumentBodyResult<String> asString(HttpFetchResult.ResultOk rsp) { - try { - var byteStream = rsp.getInputStream(); - - if ("gzip".equals(rsp.header("Content-Encoding"))) { - byteStream = new GZIPInputStream(byteStream); - } - byteStream = new BOMInputStream(byteStream); - - var contentTypeHeader = rsp.header("Content-Type"); - if (contentTypeHeader != null && !contentTypeLogic.isAllowableContentType(contentTypeHeader)) { - return new DocumentBodyResult.Error<>(CrawlerDocumentStatus.BAD_CONTENT_TYPE, ""); - } - - byte[] data = byteStream.readAllBytes(); // size is limited by WarcRecorder - - var contentType = ContentTypeParser.parseContentType(contentTypeHeader, data); - if (!contentTypeLogic.isAllowableContentType(contentType.contentType())) { - return new DocumentBodyResult.Error<>(CrawlerDocumentStatus.BAD_CONTENT_TYPE, ""); - } - - if ("Shift_JIS".equalsIgnoreCase(contentType.charset())) { - return new DocumentBodyResult.Error<>(CrawlerDocumentStatus.BAD_CHARSET, ""); - } - - return new DocumentBodyResult.Ok<>(contentType.contentType(), DocumentBodyToString.getStringData(contentType, data)); - } - catch (IOException ex) { - logger.error("Failed to extract body", ex); - return new DocumentBodyResult.Error<>(CrawlerDocumentStatus.ERROR, ""); - } - } - } diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/body/DocumentBodyResult.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/body/DocumentBodyResult.java index 0f30dc1f..04e3fedb 100644 --- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/body/DocumentBodyResult.java +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/body/DocumentBodyResult.java @@ -1,17 +1,27 @@ package nu.marginalia.crawling.body; +import nu.marginalia.contenttype.ContentType; import nu.marginalia.crawling.model.CrawlerDocumentStatus; import java.util.Optional; import java.util.function.BiFunction; public sealed interface DocumentBodyResult<T> { - record Ok<T>(String contentType, T body) implements DocumentBodyResult<T> { + record Ok<T>(ContentType contentType, T body) implements DocumentBodyResult<T> { @Override - public <T2> Optional<T2> mapOpt(BiFunction<String, T, T2> mapper) { + public <T2> Optional<T2> mapOpt(BiFunction<ContentType, T, T2> mapper) { return Optional.of(mapper.apply(contentType, body)); } + @Override + public <T2> Optional<T2> flatMapOpt(BiFunction<ContentType, T, Optional<T2>> mapper) { + return mapper.apply(contentType, body); + } + + @Override + public <T2> DocumentBodyResult<T2> flatMap(BiFunction<ContentType, T, DocumentBodyResult<T2>> mapper) { + return mapper.apply(contentType, body); + } @Override public void ifPresent(ExConsumer<T, Exception> consumer) throws Exception { @@ -20,20 +30,29 @@ public sealed interface DocumentBodyResult<T> { }
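Worth spelling out why the Error branch below gets away with an unchecked cast: an Error carries only a status and a message, never a body of type T, so re-typing it from DocumentBodyResult&lt;T&gt; to DocumentBodyResult&lt;T2&gt; is safe. A hedged sketch of how the two cases compose (assuming a class next to DocumentBodyResult; the mapper stands in for DocumentBodyExtractor::toStringResult):

```java
import java.util.function.BiFunction;
import nu.marginalia.contenttype.ContentType;
import nu.marginalia.crawling.model.CrawlerDocumentStatus;

class ResultCompositionSketch {
    // Illustrative decoder; a stand-in for DocumentBodyExtractor::toStringResult.
    static final BiFunction<ContentType, byte[], DocumentBodyResult<String>> DECODE =
            (ct, bytes) -> new DocumentBodyResult.Ok<>(ct, new String(bytes));

    static void demo() {
        DocumentBodyResult<byte[]> ok =
                new DocumentBodyResult.Ok<>(ContentType.parse("text/html"), "hi".getBytes());
        DocumentBodyResult<byte[]> err =
                new DocumentBodyResult.Error<>(CrawlerDocumentStatus.ERROR, "fetch failed");

        var a = ok.flatMap(DECODE);  // DECODE runs; yields Ok("hi")
        var b = err.flatMap(DECODE); // DECODE never runs; the Error is re-typed and passed through
    }
}
```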
record Error<T>(CrawlerDocumentStatus status, String why) implements DocumentBodyResult<T> { @Override - public <T2> Optional<T2> mapOpt(BiFunction<String, T, T2> mapper) { + public <T2> Optional<T2> mapOpt(BiFunction<ContentType, T, T2> mapper) { return Optional.empty(); } + public <T2> Optional<T2> flatMapOpt(BiFunction<ContentType, T, Optional<T2>> mapper) { return Optional.empty(); } + + @Override + @SuppressWarnings("unchecked") + public <T2> DocumentBodyResult<T2> flatMap(BiFunction<ContentType, T, DocumentBodyResult<T2>> mapper) { + return (DocumentBodyResult<T2>) this; + } @Override public void ifPresent(ExConsumer<T, Exception> consumer) throws Exception { } } - <T2> Optional<T2> mapOpt(BiFunction<String, T, T2> mapper); + <T2> Optional<T2> mapOpt(BiFunction<ContentType, T, T2> mapper); + <T2> Optional<T2> flatMapOpt(BiFunction<ContentType, T, Optional<T2>> mapper); + <T2> DocumentBodyResult<T2> flatMap(BiFunction<ContentType, T, DocumentBodyResult<T2>> mapper); void ifPresent(ExConsumer<T, Exception> consumer) throws Exception; interface ExConsumer<T, E extends Exception> { - void accept(String contentType, T t) throws E; + void accept(ContentType contentType, T t) throws E; } } diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/body/HttpFetchResult.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/body/HttpFetchResult.java index 40db21a5..f0db28e8 100644 --- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/body/HttpFetchResult.java +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/body/HttpFetchResult.java @@ -1,5 +1,6 @@ package nu.marginalia.crawling.body; +import nu.marginalia.contenttype.ContentType; import okhttp3.Headers; import org.jsoup.Jsoup; import org.netpreserve.jwarc.MessageHeaders; @@ -11,13 +12,15 @@ import java.io.IOException; import java.io.InputStream; import java.net.InetAddress; import java.net.URI; -import java.util.List; import java.util.Optional; +/* FIXME: This interface has a very unfortunate name that is not very descriptive. + */ public sealed interface HttpFetchResult { boolean isOk(); + /** Convert a WarcResponse to an HttpFetchResult */ static HttpFetchResult importWarc(WarcResponse response) { try { var http = response.http(); @@ -47,6 +50,10 @@ public sealed interface HttpFetchResult { } + /** Corresponds to a successful retrieval of a document + * from the remote server. Note that the byte[] is only borrowed, + * and subsequent calls may overwrite the contents of this buffer. + */ record ResultOk(URI uri, int statusCode, Headers headers, @@ -85,23 +92,29 @@ public sealed interface HttpFetchResult { } public Optional<Document> parseDocument() throws IOException { - return switch(DocumentBodyExtractor.asString(this)) { - case DocumentBodyResult.Ok<String> ok when "text/html".equalsIgnoreCase(ok.contentType()) - -> Optional.of(Jsoup.parse(ok.body())); - default -> Optional.empty(); - }; + return DocumentBodyExtractor.asString(this).flatMapOpt((contentType, body) -> { + if (contentType.is("text/html")) { + return Optional.of(Jsoup.parse(body)); + } + else { + return Optional.empty(); + } + }); } public String header(String name) { return headers.get(name); } - public List<String> allHeaders(String name) { - return headers.values(name); - } - }; - record ResultRetained(String url, String contentType, String body) implements HttpFetchResult {
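Because HttpFetchResult is sealed and all its implementations are the nested records in this file, call sites can dispatch exhaustively with a pattern switch, and the compiler will flag any variant they forget. A sketch of such a dispatcher (hypothetical, not part of the change; the messages are invented):

```java
class FetchResultSketch {
    // Hypothetical exhaustive dispatch over the sealed hierarchy; no default
    // branch is needed because the interface is sealed.
    static String describe(HttpFetchResult result) {
        return switch (result) {
            case HttpFetchResult.ResultOk ok ->
                    STR."fetched \{ok.uri()} with status \{ok.statusCode()}";
            case HttpFetchResult.Result304Raw raw ->
                    "not modified; not yet replaced with the reference copy";
            case HttpFetchResult.Result304ReplacedWithReference ref ->
                    STR."not modified; body restored from reference for \{ref.url()}";
            case HttpFetchResult.ResultException ex ->
                    STR."fetch failed: \{ex.ex().getMessage()}";
            case HttpFetchResult.ResultNone none ->
                    "no result";
        };
    }
}
```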
+ + /** This is a special case where the document was not fetched, because a copy + * of it was already present in the database. In this case, the body is taken + * from the reference copy. + * + * @see Result304Raw for the case where the document has not yet been replaced with the reference data. + */ + record Result304ReplacedWithReference(String url, ContentType contentType, String body) implements HttpFetchResult { public boolean isOk() { return true; @@ -116,16 +129,29 @@ } } }; + + /** Fetching resulted in an exception */ record ResultException(Exception ex) implements HttpFetchResult { public boolean isOk() { return false; } }; - record ResultSame() implements HttpFetchResult { + + /** Fetching resulted in an HTTP 304: the remote content is identical to + * our reference copy. This will be replaced with a Result304ReplacedWithReference + * at a later stage. + * + * @see Result304ReplacedWithReference + */ + record Result304Raw() implements HttpFetchResult { public boolean isOk() { return false; } }; + + /** No result. This is typically injected at a later stage + * of processing, e.g. after filtering out irrelevant responses. + */ record ResultNone() implements HttpFetchResult { public boolean isOk() { return false; diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainReader.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainReader.java index 0da0b790..eb7ffd75 100644 --- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainReader.java +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainReader.java @@ -1,8 +1,9 @@ package nu.marginalia.crawling.io; import com.google.gson.Gson; -import nu.marginalia.crawling.io.format.LegacyFileReadingSerializableCrawlDataStream; -import nu.marginalia.crawling.io.format.WarcReadingSerializableCrawlDataStream; +import nu.marginalia.crawling.io.format.LegacySerializableCrawlDataStream; +import nu.marginalia.crawling.io.format.ParquetSerializableCrawlDataStream; +import nu.marginalia.crawling.io.format.WarcSerializableCrawlDataStream; import nu.marginalia.model.gson.GsonFactory; import java.io.*; @@ -19,10 +20,13 @@ public class CrawledDomainReader { public static SerializableCrawlDataStream createDataStream(Path fullPath) throws IOException { String fileName = fullPath.getFileName().toString(); if (fileName.endsWith(".zstd")) { - return new LegacyFileReadingSerializableCrawlDataStream(gson, fullPath.toFile()); + return new LegacySerializableCrawlDataStream(gson, fullPath.toFile()); } else if (fileName.endsWith(".warc") || fileName.endsWith(".warc.gz")) { - return new WarcReadingSerializableCrawlDataStream(fullPath); + return new WarcSerializableCrawlDataStream(fullPath); + } + else if (fileName.endsWith(".parquet")) { + return new ParquetSerializableCrawlDataStream(fullPath); } else { throw new IllegalArgumentException("Unknown file type: " + fullPath); @@ -31,8 +35,12 @@ public class CrawledDomainReader { /** An iterator-like access to domain data. This must be closed, otherwise it will leak off-heap memory!
*/ public static SerializableCrawlDataStream createDataStream(Path basePath, String domain, String id) throws IOException { + Path parquetPath = CrawlerOutputFile.getParquetPath(basePath, id, domain); Path warcPath = CrawlerOutputFile.getWarcPath(basePath, id, domain, CrawlerOutputFile.WarcFileVersion.FINAL); + if (Files.exists(parquetPath)) { + return createDataStream(parquetPath); + } if (Files.exists(warcPath)) { return createDataStream(warcPath); } diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawlerOutputFile.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawlerOutputFile.java index 2a0029b4..ad6b4358 100644 --- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawlerOutputFile.java +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawlerOutputFile.java @@ -74,6 +74,15 @@ public class CrawlerOutputFile { } return destDir.resolve(STR."\{id}-\{filesystemSafeName(domain)}.parquet"); } + public static Path getParquetPath(Path basePath, String id, String domain) { + id = padId(id); + + String first = id.substring(0, 2); + String second = id.substring(2, 4); + + Path destDir = basePath.resolve(first).resolve(second); + return destDir.resolve(STR."\{id}-\{filesystemSafeName(domain)}.parquet"); + } public static Path getWarcPath(Path basePath, String id, String domain, WarcFileVersion version) { id = padId(id); diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/LegacyFileReadingSerializableCrawlDataStream.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/LegacySerializableCrawlDataStream.java similarity index 86% rename from code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/LegacyFileReadingSerializableCrawlDataStream.java rename to code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/LegacySerializableCrawlDataStream.java index efff17f3..bfd52b78 100644 --- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/LegacyFileReadingSerializableCrawlDataStream.java +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/LegacySerializableCrawlDataStream.java @@ -11,13 +11,16 @@ import nu.marginalia.crawling.model.SerializableCrawlData; import java.io.*; import java.nio.file.Path; -public class LegacyFileReadingSerializableCrawlDataStream implements AutoCloseable, SerializableCrawlDataStream { +/** This class is used to read the old format of crawl data, which was zstd-compressed JSON + * with type delimiters between records. 
+ */ +public class LegacySerializableCrawlDataStream implements AutoCloseable, SerializableCrawlDataStream { private final Gson gson; private final BufferedReader bufferedReader; private SerializableCrawlData next = null; private final Path path; - public LegacyFileReadingSerializableCrawlDataStream(Gson gson, File file) throws IOException { + public LegacySerializableCrawlDataStream(Gson gson, File file) throws IOException { this.gson = gson; bufferedReader = new BufferedReader(new InputStreamReader(new ZstdInputStream(new FileInputStream(file), RecyclingBufferPool.INSTANCE))); path = file.toPath(); diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/ParquetSerializableCrawlDataStream.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/ParquetSerializableCrawlDataStream.java new file mode 100644 index 00000000..0b852e01 --- /dev/null +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/ParquetSerializableCrawlDataStream.java @@ -0,0 +1,118 @@ +package nu.marginalia.crawling.io.format; + +import lombok.SneakyThrows; +import nu.marginalia.contenttype.ContentType; +import nu.marginalia.contenttype.DocumentBodyToString; +import nu.marginalia.crawling.io.SerializableCrawlDataStream; +import nu.marginalia.crawling.model.CrawledDocument; +import nu.marginalia.crawling.model.CrawledDomain; +import nu.marginalia.crawling.model.CrawlerDomainStatus; +import nu.marginalia.crawling.model.SerializableCrawlData; +import nu.marginalia.crawling.parquet.CrawledDocumentParquetRecord; +import nu.marginalia.crawling.parquet.CrawledDocumentParquetRecordFileReader; +import nu.marginalia.model.EdgeUrl; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.net.URISyntaxException; +import java.nio.file.Path; +import java.util.*; + +public class ParquetSerializableCrawlDataStream implements AutoCloseable, SerializableCrawlDataStream { + private static final Logger logger = LoggerFactory.getLogger(ParquetSerializableCrawlDataStream.class); + + private final Iterator<CrawledDocumentParquetRecord> backingIterator; + private final Deque<SerializableCrawlData> nextQ = new ArrayDeque<>(); + private boolean wroteDomainRecord = false; + private final Path path; + + public ParquetSerializableCrawlDataStream(Path file) throws IOException { + path = file; + + backingIterator = CrawledDocumentParquetRecordFileReader.stream(file).iterator(); + } + + @Override + public Path path() { + return path; + } + + @Override + @SneakyThrows + public boolean hasNext() { + while (backingIterator.hasNext() && nextQ.isEmpty()) { + var nextRecord = backingIterator.next(); + if (!wroteDomainRecord) { + createDomainRecord(nextRecord); + wroteDomainRecord = true; + } + createDocumentRecord(nextRecord); + } + return !nextQ.isEmpty(); + }
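The hasNext method above does the interesting work: it pulls parquet records and expands each into zero or more crawl-data records buffered in a queue, prepending a synthesized CrawledDomain before the first document. The buffering idiom in isolation, as a hedged sketch with made-up type parameters:

```java
import java.util.ArrayDeque;
import java.util.Deque;
import java.util.Iterator;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.function.Function;

/** Hypothetical minimal form of the look-ahead pattern used above: each
 *  backing element may expand to several outputs, so hasNext() refills a
 *  queue that next() then drains one element at a time. */
class ExpandingIterator<I, O> implements Iterator<O> {
    private final Iterator<I> backing;
    private final Function<I, List<O>> expand;
    private final Deque<O> queue = new ArrayDeque<>();

    ExpandingIterator(Iterator<I> backing, Function<I, List<O>> expand) {
        this.backing = backing;
        this.expand = expand;
    }

    @Override
    public boolean hasNext() {
        while (queue.isEmpty() && backing.hasNext()) {
            queue.addAll(expand.apply(backing.next()));
        }
        return !queue.isEmpty();
    }

    @Override
    public O next() {
        if (!hasNext()) throw new NoSuchElementException();
        return queue.poll();
    }
}
```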
+ + private void createDomainRecord(CrawledDocumentParquetRecord parquetRecord) throws URISyntaxException { + + CrawlerDomainStatus status = CrawlerDomainStatus.OK; + String statusReason = ""; + + String redirectDomain = null; + if (parquetRecord.contentType.equals("x-marginalia/advisory;state=redirect")) { + EdgeUrl crawledUrl = new EdgeUrl(parquetRecord.url); + redirectDomain = crawledUrl.getDomain().toString(); + status = CrawlerDomainStatus.REDIRECT; + } + else if (parquetRecord.contentType.equals("x-marginalia/advisory;state=blocked")) { + status = CrawlerDomainStatus.BLOCKED; // FIXME we don't write this yet + } + else if (parquetRecord.contentType.equals("x-marginalia/advisory;state=error")) { + status = CrawlerDomainStatus.ERROR; + statusReason = new String(parquetRecord.body); + } + + // FIXME -- cookies + nextQ.add(new CrawledDomain( + parquetRecord.domain, + redirectDomain, + status.toString(), + statusReason, + parquetRecord.ip, + new ArrayList<>(), + new ArrayList<>() + )); + } + + private void createDocumentRecord(CrawledDocumentParquetRecord nextRecord) { + String bodyString = DocumentBodyToString.getStringData( + ContentType.parse(nextRecord.contentType), + nextRecord.body); + + // FIXME -- a lot of these fields are not set properly! + nextQ.add(new CrawledDocument("", + nextRecord.url, + nextRecord.contentType, + "", + nextRecord.httpStatus, + "OK", + "", + "", + bodyString, + "", + nextRecord.url, + null, + "")); + } + + public void close() throws IOException { + } + + @Override + public SerializableCrawlData next() throws IOException { + if (!hasNext()) + throw new NoSuchElementException(); + + return nextQ.poll(); + } + +} diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/WarcReadingSerializableCrawlDataStream.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/WarcSerializableCrawlDataStream.java similarity index 87% rename from code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/WarcReadingSerializableCrawlDataStream.java rename to code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/WarcSerializableCrawlDataStream.java index 9c81f0ca..a766a58d 100644 --- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/WarcReadingSerializableCrawlDataStream.java +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/WarcSerializableCrawlDataStream.java @@ -14,20 +14,17 @@ import org.slf4j.LoggerFactory; import java.io.IOException; import java.nio.file.Path; -import java.util.Iterator; -import java.util.List; -import java.util.NoSuchElementException; -import java.util.StringJoiner; +import java.util.*; -public class WarcReadingSerializableCrawlDataStream implements AutoCloseable, SerializableCrawlDataStream { - private static final Logger logger = LoggerFactory.getLogger(WarcReadingSerializableCrawlDataStream.class); +public class WarcSerializableCrawlDataStream implements AutoCloseable, SerializableCrawlDataStream { + private static final Logger logger = LoggerFactory.getLogger(WarcSerializableCrawlDataStream.class); private final WarcReader reader; private final Iterator<WarcRecord> backingIterator; private SerializableCrawlData next = null; private final Path path; - public WarcReadingSerializableCrawlDataStream(Path file) throws IOException { + public WarcSerializableCrawlDataStream(Path file) throws IOException { path = file; reader = new WarcReader(file); WarcXResponseReference.register(reader); @@ -51,17 +48,10 @@ public class WarcSerializableCrawlDataStream { else if (nextRecord instanceof Warcinfo warcinfo) { convertWarcinfo(warcinfo); } - else if (nextRecord instanceof WarcMetadata metadata) { - convertMetadata(metadata); - } } return next != null; } - private void convertMetadata(WarcMetadata metadata) { - // Nothing to do here for now - } - private void convertWarcinfo(Warcinfo warcinfo) throws IOException { var headers = warcinfo.fields(); String probeStatus = headers.first("X-WARC-Probe-Status").orElse(""); @@ -79,7 +69,10 @@ } // TODO: Fix cookies info somehow - next =
new CrawledDomain(domain, redirectDomain, status, statusReason, ip, List.of(), List.of()); + next = new CrawledDomain(domain, redirectDomain, status, statusReason, ip, + new ArrayList<>(), + new ArrayList<>() + ); } private void convertResponse(WarcResponse response) throws IOException { @@ -109,7 +102,7 @@ public class WarcReadingSerializableCrawlDataStream implements AutoCloseable, Se next = new CrawledDocument( "", response.targetURI().toString(), - ok.contentType(), + ok.contentType().toString(), response.date().toString(), http.status(), "OK", diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriter.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriter.java index d9fea865..60e0178e 100644 --- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriter.java +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriter.java @@ -4,10 +4,7 @@ import blue.strategic.parquet.ParquetWriter; import nu.marginalia.crawling.body.DocumentBodyExtractor; import nu.marginalia.crawling.body.DocumentBodyResult; import nu.marginalia.crawling.body.HttpFetchResult; -import org.netpreserve.jwarc.WarcReader; -import org.netpreserve.jwarc.WarcRecord; -import org.netpreserve.jwarc.WarcResponse; -import org.netpreserve.jwarc.WarcXResponseReference; +import org.netpreserve.jwarc.*; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -25,7 +22,16 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable { WarcXResponseReference.register(warcReader); for (var record : warcReader) { - parquetWriter.write(domain, record); + if (record instanceof WarcResponse response) { + parquetWriter.write(domain, response); + } + else if (record instanceof Warcinfo warcinfo) { + parquetWriter.write(domain, warcinfo); + } + else { + logger.warn("Skipping record of type {}", record.type()); + } + } } catch (Exception ex) { @@ -33,6 +39,34 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable { } } + private void write(String domain, Warcinfo warcinfo) throws IOException { + String selfDomain = warcinfo.fields().first("domain").orElse(""); + String ip = warcinfo.fields().first("ip").orElse(""); + String probeStatus = warcinfo.fields().first("X-WARC-Probe-Status").orElse(""); + + if (probeStatus.startsWith("REDIRECT")) { + String redirectDomain = probeStatus.substring("REDIRECT;".length()); + write(new CrawledDocumentParquetRecord(selfDomain, + STR."https://\{redirectDomain}/", + ip, + false, + 0, + "x-marginalia/advisory;state=redirect", + new byte[0] + )); + } + else if (!"OK".equals(probeStatus)) { + write(new CrawledDocumentParquetRecord(selfDomain, + STR."https://\{domain}/", + ip, + false, + 0, + "x-marginalia/advisory;state=error", + probeStatus.getBytes() + )); + } + } + public CrawledDocumentParquetRecordFileWriter(Path file) throws IOException { writer = ParquetWriter.writeFile(CrawledDocumentParquetRecord.schema, file.toFile(), CrawledDocumentParquetRecord.newDehydrator()); @@ -42,12 +76,9 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable { writer.write(domainData); } - public void write(String domain, WarcRecord record) throws IOException { - if (!(record instanceof WarcResponse ref)) { - return; - } + public void write(String domain, WarcResponse response) throws 
IOException { - HttpFetchResult result = HttpFetchResult.importWarc(ref); + HttpFetchResult result = HttpFetchResult.importWarc(response); if (!(result instanceof HttpFetchResult.ResultOk fetchOk)) { return; } @@ -59,7 +90,7 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable { if (body instanceof DocumentBodyResult.Ok bodyOk) { bodyBytes = bodyOk.body(); - contentType = bodyOk.contentType(); + contentType = bodyOk.contentType().toString(); } else { bodyBytes = new byte[0]; @@ -68,7 +99,7 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable { write(new CrawledDocumentParquetRecord( domain, - ref.target(), + response.target(), fetchOk.ipAddress(), false, // FIXME fetchOk.statusCode(), diff --git a/code/process-models/crawling-model/src/test/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriterTest.java b/code/process-models/crawling-model/src/test/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriterTest.java index f8661355..15338de1 100644 --- a/code/process-models/crawling-model/src/test/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriterTest.java +++ b/code/process-models/crawling-model/src/test/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriterTest.java @@ -1,5 +1,9 @@ package nu.marginalia.crawling.parquet; +import nu.marginalia.crawling.io.format.ParquetSerializableCrawlDataStream; +import nu.marginalia.crawling.model.CrawledDocument; +import nu.marginalia.crawling.model.CrawledDomain; +import nu.marginalia.crawling.model.SerializableCrawlData; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; @@ -8,6 +12,7 @@ import org.netpreserve.jwarc.net.WarcRecorder; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; +import java.util.ArrayList; import static org.junit.jupiter.api.Assertions.*; @@ -25,7 +30,7 @@ class CrawledDocumentParquetRecordFileWriterTest { } @Test - void write() throws IOException { + void testWriteRead() throws IOException { var original = new CrawledDocumentParquetRecord("www.marginalia.nu", "https://www.marginalia.nu/", "127.0.0.1", @@ -38,10 +43,36 @@ class CrawledDocumentParquetRecordFileWriterTest { writer.write(original); } - try (var stream = CrawledDocumentParquetRecordFileReader.stream(tempFile)) { - var actual = stream.findFirst().orElseThrow(); - assertEquals(original, actual); + var items = new ArrayList(); + + try (var stream = new ParquetSerializableCrawlDataStream(tempFile)) { + while (stream.hasNext()) { + items.add(stream.next()); + } } + + assertEquals(2, items.size()); + + var firstItem = items.get(0); + assertInstanceOf(CrawledDomain.class, firstItem); + var domain = (CrawledDomain) firstItem; + assertEquals("www.marginalia.nu", domain.domain); + assertNull(domain.redirectDomain); + assertEquals("OK", domain.crawlerStatus); + assertEquals("", domain.crawlerStatusDesc); + assertEquals(new ArrayList<>(), domain.doc); + assertEquals(new ArrayList<>(), domain.cookies); + + var secondItem = items.get(1); + assertInstanceOf(CrawledDocument.class, secondItem); + + var document = (CrawledDocument) secondItem; + assertEquals("https://www.marginalia.nu/", document.url); + assertEquals("text/html", document.contentType); + assertEquals("hello world", document.documentBody); + assertEquals(200, document.httpStatus); + assertEquals("https://www.marginalia.nu/", document.canonicalUrl); } + } \ No newline at end of file diff 
--git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java index fea8f69a..2f0fc690 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java @@ -18,6 +18,7 @@ import nu.marginalia.model.EdgeUrl; import nu.marginalia.converting.processor.logic.links.TopKeywords; import nu.marginalia.converting.processor.logic.LshDocumentDeduplicator; import nu.marginalia.model.crawl.HtmlFeature; +import org.jetbrains.annotations.Nullable; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -53,9 +54,15 @@ public class DomainProcessor { } @SneakyThrows + @Nullable public ProcessedDomain process(SerializableCrawlDataStream dataStream) { + if (!dataStream.hasNext()) { + return null; + } + var ret = new ProcessedDomain(); List<ProcessedDocument> docs = new ArrayList<>(); + Set<String> processedUrls = new HashSet<>(); boolean cookies = false; String ip = ""; @@ -90,7 +97,7 @@ public class DomainProcessor { } else if (data instanceof CrawledDocument doc) { try { - if (doc.url == null) + if (doc.url == null || !processedUrls.add(doc.url)) continue; fixBadCanonicalTag(doc); diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/writer/ConverterWriter.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/writer/ConverterWriter.java index 1ca66ed6..3069c5ed 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/writer/ConverterWriter.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/writer/ConverterWriter.java @@ -3,6 +3,7 @@ package nu.marginalia.converting.writer; import lombok.SneakyThrows; import nu.marginalia.converting.model.ProcessedDomain; import nu.marginalia.worklog.BatchingWorkLog; +import org.jetbrains.annotations.Nullable; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -41,7 +42,10 @@ public class ConverterWriter implements AutoCloseable { } @SneakyThrows - public void accept(ProcessedDomain domain) { + public void accept(@Nullable ProcessedDomain domain) { + if (null == domain) + return; + domainData.put(domain); }
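Note the negation in the deduplication guard above: Set.add returns true when the element was newly inserted, so the "skip this document" condition has to be !processedUrls.add(doc.url). The idiom in isolation (a toy sketch, values invented):

```java
import java.util.HashSet;
import java.util.List;
import java.util.Set;

class DedupSketch {
    public static void main(String[] args) {
        Set<String> seen = new HashSet<>();
        for (String url : List.of("/a", "/b", "/a")) {
            if (!seen.add(url))
                continue;            // second "/a": add() returned false, skip it
            System.out.println(url); // prints /a then /b, once each
        }
    }
}
```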
diff --git a/code/processes/converting-process/src/test/java/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java b/code/processes/converting-process/src/test/java/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java index 67b4f7b6..844062bb 100644 --- a/code/processes/converting-process/src/test/java/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java +++ b/code/processes/converting-process/src/test/java/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java @@ -11,16 +11,16 @@ import nu.marginalia.crawl.retreival.fetcher.HttpFetcher; import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl; import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder; import nu.marginalia.crawling.io.SerializableCrawlDataStream; +import nu.marginalia.crawling.io.format.WarcSerializableCrawlDataStream; import nu.marginalia.crawling.model.CrawledDocument; import nu.marginalia.crawling.model.CrawledDomain; import nu.marginalia.crawling.model.SerializableCrawlData; import nu.marginalia.model.crawlspec.CrawlSpecRecord; -import org.junit.jupiter.api.BeforeAll; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Tag; -import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.*; import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; import java.util.ArrayList; import java.util.List; @@ -30,6 +30,8 @@ public class CrawlingThenConvertingIntegrationTest { private DomainProcessor domainProcessor; private HttpFetcher httpFetcher; + private Path fileName; + @SneakyThrows @BeforeAll public static void setUpAll() { @@ -46,6 +48,12 @@ public class CrawlingThenConvertingIntegrationTest { domainProcessor = injector.getInstance(DomainProcessor.class); httpFetcher = new HttpFetcherImpl(WmsaHome.getUserAgent().uaString()); + this.fileName = Files.createTempFile("crawling-then-converting", ".warc.gz"); + } + + @AfterEach + public void tearDown() throws IOException { + Files.deleteIfExists(fileName); } @Test @@ -78,10 +86,16 @@ public class CrawlingThenConvertingIntegrationTest { private CrawledDomain crawl(CrawlSpecRecord specs) throws IOException { List<SerializableCrawlData> data = new ArrayList<>(); - try (var recorder = new WarcRecorder()) { + try (var recorder = new WarcRecorder(fileName)) { new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder).fetch(); } + try (var reader = new WarcSerializableCrawlDataStream(fileName)) { + while (reader.hasNext()) { + data.add(reader.next()); + } + } + CrawledDomain domain = data.stream().filter(CrawledDomain.class::isInstance).map(CrawledDomain.class::cast).findFirst().get(); data.stream().filter(CrawledDocument.class::isInstance).map(CrawledDocument.class::cast).forEach(domain.doc::add); return domain; diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java index 5c6241f7..658acfbe 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java @@ -244,12 +244,10 @@ public class CrawlerMain { // (mostly a case when migrating from legacy->warc) reference.delete(); - Files.move(newWarcFile, finalWarcFile, StandardCopyOption.REPLACE_EXISTING); - CrawledDocumentParquetRecordFileWriter - .convertWarc(domain, finalWarcFile, parquetFile); + CrawledDocumentParquetRecordFileWriter + .convertWarc(domain, newWarcFile, parquetFile); - workLog.setJobToFinished(domain, finalWarcFile.toString(), size); + workLog.setJobToFinished(domain, parquetFile.toString(), size); heartbeat.setProgress(tasksDone.incrementAndGet() / (double) totalTasks); logger.info("Fetched {}", domain); diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java index 80d6853b..668f597a 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java @@ -5,6 +5,7 @@ import com.google.common.hash.Hashing; import crawlercommons.robots.SimpleRobotRules; import lombok.SneakyThrows; import nu.marginalia.atags.model.DomainLinks; +import nu.marginalia.contenttype.ContentType; import nu.marginalia.crawl.retreival.fetcher.ContentTags; import nu.marginalia.crawl.retreival.fetcher.HttpFetcher; import nu.marginalia.crawling.body.HttpFetchResult; @@ -247,11 +248,13 @@ public class CrawlerRetreiver implements AutoCloseable { var contentTags = reference.getContentTags();
var fetchedDoc = tryDownload(top, timer, contentTags); - if (fetchedDoc instanceof HttpFetchResult.ResultSame) { + if (fetchedDoc instanceof HttpFetchResult.Result304Raw) { var doc = reference.doc(); if (doc != null) { warcRecorder.writeReferenceCopy(top, doc.contentType, doc.httpStatus, doc.documentBody); - fetchedDoc = new HttpFetchResult.ResultRetained(doc.url, doc.contentType, doc.documentBody); + fetchedDoc = new HttpFetchResult.Result304ReplacedWithReference(doc.url, + new ContentType(doc.contentType, "UTF-8"), + doc.documentBody); } } @@ -265,7 +268,7 @@ public class CrawlerRetreiver implements AutoCloseable { crawlFrontier.addVisited(new EdgeUrl(ok.uri())); } } - else if (fetchedDoc instanceof HttpFetchResult.ResultRetained retained) { + else if (fetchedDoc instanceof HttpFetchResult.Result304ReplacedWithReference retained) { var docOpt = retained.parseDocument(); if (docOpt.isPresent()) { var doc = docOpt.get(); diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java index f8f11b13..cc4a195d 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java @@ -12,7 +12,6 @@ import nu.marginalia.crawl.retreival.fetcher.socket.FastTerminatingSocketFactory import nu.marginalia.crawl.retreival.fetcher.socket.IpInterceptingNetworkInterceptor; import nu.marginalia.crawl.retreival.fetcher.socket.NoSecuritySSL; import nu.marginalia.crawling.body.DocumentBodyExtractor; -import nu.marginalia.crawling.body.DocumentBodyResult; import nu.marginalia.crawling.body.HttpFetchResult; import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder; import nu.marginalia.crawling.body.ContentTypeLogic; @@ -183,7 +182,7 @@ public class HttpFetcherImpl implements HttpFetcher { throw new RateLimitException(retryAfter); } if (ok.statusCode() == 304) { - return new HttpFetchResult.ResultSame(); + return new HttpFetchResult.Result304Raw(); } if (ok.statusCode() == 200) { return ok; @@ -268,7 +267,7 @@ public class HttpFetcherImpl implements HttpFetcher { return DocumentBodyExtractor.asBytes(result).mapOpt((contentType, body) -> robotsParser.parseContent(url.toString(), body, - contentType, + contentType.toString(), userAgent) ); diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcRecorder.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcRecorder.java index bce1b890..a1335eb8 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcRecorder.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcRecorder.java @@ -69,8 +69,12 @@ public class WarcRecorder implements AutoCloseable { temporaryFile = true; } - public HttpFetchResult fetch(OkHttpClient client, Request request) throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException { - URI uri = request.url().uri(); + public HttpFetchResult fetch(OkHttpClient client, Request request) throws NoSuchAlgorithmException, + IOException, + URISyntaxException, + InterruptedException + { + URI requestUri = request.url().uri(); WarcDigestBuilder responseDigestBuilder = new WarcDigestBuilder(); WarcDigestBuilder 
payloadDigestBuilder = new WarcDigestBuilder(); @@ -133,7 +137,11 @@ public class WarcRecorder implements AutoCloseable { } } - WarcResponse.Builder responseBuilder = new WarcResponse.Builder(uri) + // It looks like this might be the same as requestUri, but it's not; + // it's the URI after resolving redirects. + final URI responseUri = response.request().url().uri(); + + WarcResponse.Builder responseBuilder = new WarcResponse.Builder(responseUri) .blockDigest(responseDigestBuilder.build()) .date(date) .body(MediaType.HTTP_RESPONSE, responseDataBuffer.copyBytes()); @@ -155,11 +163,11 @@ public class WarcRecorder implements AutoCloseable { WarcDigestBuilder requestDigestBuilder = new WarcDigestBuilder(); - String httpRequestString = WarcProtocolReconstructor.getHttpRequestString(response.request(), uri); + String httpRequestString = WarcProtocolReconstructor.getHttpRequestString(response.request(), requestUri); requestDigestBuilder.update(httpRequestString); - WarcRequest warcRequest = new WarcRequest.Builder(uri) + WarcRequest warcRequest = new WarcRequest.Builder(requestUri) .blockDigest(requestDigestBuilder.build()) .date(date) .body(MediaType.HTTP_REQUEST, httpRequestString.getBytes()) @@ -168,7 +176,7 @@ public class WarcRecorder implements AutoCloseable { warcRequest.http(); // force HTTP header to be parsed before body is consumed so that caller can use it writer.write(warcRequest); - return new HttpFetchResult.ResultOk(uri, + return new HttpFetchResult.ResultOk(responseUri, response.code(), response.headers(), ip, @@ -177,7 +185,7 @@ public class WarcRecorder implements AutoCloseable { responseDataBuffer.length() - dataStart); } catch (Exception ex) { - logger.warn("Failed to fetch URL {}", uri, ex); + logger.warn("Failed to fetch URL {}", requestUri, ex); return new HttpFetchResult.ResultException(ex); } } diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/revisit/DocumentWithReference.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/revisit/DocumentWithReference.java index 31df5e0e..a0559aec 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/revisit/DocumentWithReference.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/revisit/DocumentWithReference.java @@ -23,9 +23,9 @@ public record DocumentWithReference( * or if the result was retained via HTTP 304. 
*/ public boolean isSame(HttpFetchResult result) { - if (result instanceof HttpFetchResult.ResultSame) + if (result instanceof HttpFetchResult.Result304Raw) return true; - if (result instanceof HttpFetchResult.ResultRetained) + if (result instanceof HttpFetchResult.Result304ReplacedWithReference) return true; if (!(result instanceof HttpFetchResult.ResultOk resultOk)) diff --git a/code/processes/crawling-process/src/test/java/nu/marginalia/crawl/retreival/fetcher/WarcRecorderTest.java b/code/processes/crawling-process/src/test/java/nu/marginalia/crawl/retreival/fetcher/WarcRecorderTest.java index e8ba9437..0375f5cb 100644 --- a/code/processes/crawling-process/src/test/java/nu/marginalia/crawl/retreival/fetcher/WarcRecorderTest.java +++ b/code/processes/crawling-process/src/test/java/nu/marginalia/crawl/retreival/fetcher/WarcRecorderTest.java @@ -53,8 +53,6 @@ class WarcRecorderTest { .addHeader("Accept-Encoding", "gzip") .get().build()); - new GZIPInputStream(Files.newInputStream(fileNameWarc)).transferTo(System.out); - Map sampleData = new HashMap<>(); try (var warcReader = new WarcReader(fileNameWarc)) { warcReader.forEach(record -> { @@ -91,8 +89,6 @@ class WarcRecorderTest { } } } - - new GZIPInputStream(Files.newInputStream(fileNameWarc)).transferTo(System.out); } @Test
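One detail from the WarcRecorder change above that is easy to miss: with OkHttp, response.request() is the request that ultimately produced the response, so after transparently followed redirects its URL differs from the one originally issued. A hedged sketch of the distinction (URLs invented):

```java
import java.net.URI;
import okhttp3.OkHttpClient;
import okhttp3.Request;
import okhttp3.Response;

class RedirectUriSketch {
    // Illustration only; the example.com URLs are made up.
    static void show() throws Exception {
        OkHttpClient client = new OkHttpClient();
        Request request = new Request.Builder().url("http://example.com/old").build();
        try (Response response = client.newCall(request).execute()) {
            URI requestUri  = request.url().uri();            // what we asked for
            URI responseUri = response.request().url().uri(); // what finally answered,
                                                              // after any redirects OkHttp followed
            System.out.println(requestUri + " -> " + responseUri);
        }
    }
}
```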