(warc) Further tidying
This commit mostly covers exception handling, error propagation, a few bug fixes, and minor changes to log formatting. The CrawlDelayTimer, HTTP 429 responses, and IOExceptions are now handled more accurately. A non-standard WarcXEntityRefused WARC record has also been introduced; it essentially acts as a rejected 'response' with different semantics. Beyond that, several existing features have been refined, such as URL encoding, crawl depth incrementing, and the use of Content-Length headers.
This commit is contained in:
parent 0889b6d247
commit 9fea22b90d
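To orient the reader, here is a rough, hypothetical sketch of the HTTP 429 handling this commit moves toward. The names (HttpFetcher.fetchContent, WarcRecorder, ContentTags, HttpFetchResult, RateLimitException, CrawlDelayTimer.waitRetryDelay, HTTP_429_RETRY_LIMIT) are taken from the CrawlerRetreiver and CrawlDelayTimer hunks below; the helper method itself is illustrative only, not the production code:

// Illustrative outline only; the types below are the ones used in the hunks further down.
static HttpFetchResult fetchWithRetry(HttpFetcher fetcher,
                                      WarcRecorder warcRecorder,
                                      CrawlDelayTimer timer,
                                      EdgeUrl top,
                                      ContentTags contentTags) throws InterruptedException {
    final int HTTP_429_RETRY_LIMIT = 1; // retry a 429 once, as in CrawlerRetreiver

    HttpFetchResult fetchedDoc = new HttpFetchResult.ResultNone();
    for (int i = 0; i <= HTTP_429_RETRY_LIMIT; i++) {
        try {
            fetchedDoc = fetcher.fetchContent(top, warcRecorder, contentTags);
            break;
        }
        catch (RateLimitException ex) {
            // Waits out the server's Retry-After hint (bounded to a sane range) and
            // flags the timer so the regular crawl delay is slowed down as well.
            timer.waitRetryDelay(ex);
        }
        catch (Exception ex) {
            // Any other failure is propagated as a result object rather than an exception.
            fetchedDoc = new HttpFetchResult.ResultException(ex);
        }
    }
    return fetchedDoc;
}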
@@ -28,6 +28,7 @@ public class WarcSerializableCrawlDataStream implements AutoCloseable, Serializa
        path = file;
        reader = new WarcReader(file);
        WarcXResponseReference.register(reader);
        WarcXEntityRefused.register(reader);

        backingIterator = reader.iterator();
    }
@@ -9,29 +9,34 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.net.URI;
import java.nio.file.Path;

public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable {
    private final ParquetWriter<CrawledDocumentParquetRecord> writer;
    private static final Logger logger = LoggerFactory.getLogger(CrawledDocumentParquetRecordFileWriter.class);

    public static void convertWarc(String domain, Path warcInputFile, Path parquetOutputFile) throws IOException {
    public static void convertWarc(String domain, Path warcInputFile, Path parquetOutputFile) {
        try (var warcReader = new WarcReader(warcInputFile);
             var parquetWriter = new CrawledDocumentParquetRecordFileWriter(parquetOutputFile)
        ) {
            WarcXResponseReference.register(warcReader);
            WarcXEntityRefused.register(warcReader);

            for (var record : warcReader) {
                if (record instanceof WarcResponse response) {
                    // this also captures WarcXResponseReference, which inherits from WarcResponse
                    // and is used to store old responses from previous crawls; in this part of the logic
                    // we treat them the same as a normal response

                    parquetWriter.write(domain, response);
                }
                else if (record instanceof WarcXEntityRefused refused) {
                    parquetWriter.write(domain, refused);
                }
                else if (record instanceof Warcinfo warcinfo) {
                    parquetWriter.write(domain, warcinfo);
                    parquetWriter.write(warcinfo);
                }
                else {
                    logger.warn("Skipping record of type {}", record.type());
                }

            }
        }
        catch (Exception ex) {
@@ -39,31 +44,40 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable {
        }
    }

    private void write(String domain, Warcinfo warcinfo) throws IOException {
    private void write(String domain, WarcXEntityRefused refused) throws IOException {
        URI profile = refused.profile();

        String meta;
        if (profile.equals(WarcXEntityRefused.documentRobotsTxtSkippedURN)) {
            meta = "x-marginalia/advisory;state=robots-txt-skipped";
        }
        else if (profile.equals(WarcXEntityRefused.documentBadContentTypeURN)) {
            meta = "x-marginalia/advisory;state=content-type-failed-probe";
        }
        else if (profile.equals(WarcXEntityRefused.documentProbeTimeout)) {
            meta = "x-marginalia/advisory;state=timeout-probe";
        }
        else if (profile.equals(WarcXEntityRefused.documentUnspecifiedError)) {
            meta = "x-marginalia/advisory;state=doc-error";
        }
        else {
            meta = "x-marginalia/advisory;state=unknown";
        }

        write(forDocError(domain, refused.target(), meta));
    }

    private void write(Warcinfo warcinfo) throws IOException {
        String selfDomain = warcinfo.fields().first("domain").orElse("");
        String ip = warcinfo.fields().first("ip").orElse("");
        String probeStatus = warcinfo.fields().first("X-WARC-Probe-Status").orElse("");

        if (probeStatus.startsWith("REDIRECT")) {
            String redirectDomain = probeStatus.substring("REDIRECT;".length());
            write(new CrawledDocumentParquetRecord(selfDomain,
                    STR."https://\{redirectDomain}/",
                    ip,
                    false,
                    0,
                    "x-marginalia/advisory;state=redirect",
                    new byte[0]
            ));
            write(forDomainRedirect(selfDomain, redirectDomain));
        }
        else if (!"OK".equals(probeStatus)) {
            write(new CrawledDocumentParquetRecord(selfDomain,
                    STR."https://\{domain}/",
                    ip,
                    false,
                    0,
                    "x-marginalia/advisory;state=error",
                    probeStatus.getBytes()
            ));
            write(forDomainError(selfDomain, ip, probeStatus));
        }
    }

@@ -83,6 +97,15 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable {
            return;
        }

        // We don't want to store robots.txt files, as they are not
        // interesting for the analysis we want to do. This is important
        // since txt-files in general are interesting, and we don't want to
        // exclude them as a class.

        if (fetchOk.uri().getPath().equals("/robots.txt")) {
            return;
        }

        byte[] bodyBytes;
        String contentType;

@@ -112,4 +135,36 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable {
    public void close() throws IOException {
        writer.close();
    }

    private CrawledDocumentParquetRecord forDomainRedirect(String domain, String redirectDomain) {
        return new CrawledDocumentParquetRecord(domain,
                STR."https://\{redirectDomain}/",
                "",
                false,
                0,
                "x-marginalia/advisory;state=redirect",
                new byte[0]
        );
    }
    private CrawledDocumentParquetRecord forDomainError(String domain, String ip, String errorStatus) {
        return new CrawledDocumentParquetRecord(domain,
                STR."https://\{domain}/",
                ip,
                false,
                0,
                "x-marginalia/advisory;state=error",
                errorStatus.getBytes()
        );
    }

    private CrawledDocumentParquetRecord forDocError(String domain, String url, String errorStatus) {
        return new CrawledDocumentParquetRecord(domain,
                url,
                "",
                false,
                0,
                "x-marginalia/advisory;state=error",
                errorStatus.getBytes()
        );
    }
}
@@ -0,0 +1,45 @@
package org.netpreserve.jwarc;

import java.io.IOException;
import java.net.URI;

/** This defines a non-standard extension to WARC for storing old HTTP responses,
 * essentially a 'response' with different semantics
 */
public class WarcXEntityRefused extends WarcRevisit {
    private static final String TYPE_NAME = "x-entity-refused";

    public static final URI documentRobotsTxtSkippedURN = URI.create("urn:marginalia/meta/doc/robots-txt-skipped");
    public static final URI documentBadContentTypeURN = URI.create("urn:marginalia/meta/doc/content-type-failed-probe");
    public static final URI documentProbeTimeout = URI.create("urn:marginalia/meta/doc/timeout-probe");
    public static final URI documentUnspecifiedError = URI.create("urn:marginalia/meta/doc/error");

    WarcXEntityRefused(MessageVersion version, MessageHeaders headers, MessageBody body) {
        super(version, headers, body);
    }

    public static void register(WarcReader reader) {
        reader.registerType(TYPE_NAME, WarcXEntityRefused::new);
    }

    public static class Builder extends AbstractBuilder<WarcXEntityRefused, Builder> {
        public Builder(URI targetURI, URI profile) {
            this(targetURI.toString(), profile.toString());
        }

        public Builder(String targetURI, String profileURI) {
            super(TYPE_NAME);
            setHeader("WARC-Target-URI", targetURI);
            setHeader("WARC-Profile", profileURI);
        }

        public Builder body(HttpResponse httpResponse) throws IOException {
            return body(MediaType.HTTP_RESPONSE, httpResponse);
        }

        @Override
        public WarcXEntityRefused build() {
            return build(WarcXEntityRefused::new);
        }
    }
}
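As a usage illustration (hypothetical caller code, assuming the builder and URN constants defined above together with jwarc's WarcWriter and WarcReader), a refusal record might be written and later read back roughly like this:

import org.netpreserve.jwarc.*;

import java.io.IOException;
import java.net.URI;
import java.nio.file.Path;
import java.time.Instant;

class WarcXEntityRefusedExample {
    // Write a refusal record for a URL that was skipped because of robots.txt.
    static void writeRefusal(WarcWriter writer, URI url) throws IOException {
        WarcXEntityRefused refusal =
                new WarcXEntityRefused.Builder(url, WarcXEntityRefused.documentRobotsTxtSkippedURN)
                        .date(Instant.now())
                        .build();
        writer.write(refusal);
    }

    // Read the WARC back; without register(), the reader would not know the x-entity-refused type.
    static void readBack(Path warcFile) throws IOException {
        try (WarcReader reader = new WarcReader(warcFile)) {
            WarcXEntityRefused.register(reader);
            for (WarcRecord record : reader) {
                if (record instanceof WarcXEntityRefused refused) {
                    System.out.println("refused " + refused.target() + " profile=" + refused.profile());
                }
            }
        }
    }
}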
@@ -4,9 +4,7 @@ import java.io.IOException;
import java.net.URI;

/** This defines a non-standard extension to WARC for storing old HTTP responses,
 * essentially a 'revisit' with a full body, which is not something that is
 * expected by the jwarc parser, and goes against the semantics of the revisit
 * records a fair bit.
 * essentially a 'response' with different semantics..
 * <p>
 * An x-response-reference record is a response record with a full body, where
 * the data is a reconstructed HTTP response from a previous crawl.
@@ -5,6 +5,8 @@ import com.google.common.hash.Hashing;
import nu.marginalia.crawling.io.SerializableCrawlDataStream;
import nu.marginalia.crawling.model.CrawledDocument;
import nu.marginalia.lsh.EasyLSH;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import javax.annotation.Nullable;
import java.io.IOException;
@@ -15,6 +17,7 @@ import java.nio.file.Path;
public class CrawlDataReference implements AutoCloseable {

    private final SerializableCrawlDataStream data;
    private static final Logger logger = LoggerFactory.getLogger(CrawlDataReference.class);

    public CrawlDataReference(SerializableCrawlDataStream data) {
        this.data = data;
@@ -43,8 +46,9 @@
            }
        }
        catch (IOException ex) {
            ex.printStackTrace();
            logger.error("Failed to read next document", ex);
        }

        return null;
    }

@@ -20,8 +20,18 @@ public class CrawlDelayTimer {
        this.delayTime = delayTime;
    }

    /** Call when we've gotten an HTTP 429 response. This will wait a moment, and then
     * set a flag that slows down the main crawl delay as well. */
    public void waitRetryDelay(RateLimitException ex) throws InterruptedException {
        slowDown = true;

        int delay = ex.retryAfter();

        Thread.sleep(Math.clamp(delay, 100, 5000));
    }

    @SneakyThrows
    public void delay(long spentTime) {
    public void waitFetchDelay(long spentTime) {
        long sleepTime = delayTime;

        if (sleepTime >= 1) {
@@ -30,10 +40,6 @@

            Thread.sleep(min(sleepTime - spentTime, 5000));
        }
        else if (slowDown) {
            // Additional delay when the server is signalling it wants slower requests
            Thread.sleep( DEFAULT_CRAWL_DELAY_MIN_MS);
        }
        else {
            // When no crawl delay is specified, lean toward twice the fetch+process time,
            // within sane limits. This means slower servers get slower crawling, and faster
@@ -48,10 +54,10 @@

            Thread.sleep(sleepTime - spentTime);
        }
    }

    /** Increase the delay between requests if the server is signalling it wants slower requests with HTTP 429 */
    public void slowDown() {
        slowDown = true;
        if (slowDown) {
            // Additional delay when the server is signalling it wants slower requests
            Thread.sleep( DEFAULT_CRAWL_DELAY_MIN_MS);
        }
    }
}
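A side note on the clamping in waitRetryDelay: whatever retryAfter() reports, the actual sleep passed to Thread.sleep is pinned to the 100-5000 ms range. A tiny illustration with hypothetical values (Math.clamp is a Java 21+ API):

// Quick illustration with hypothetical Retry-After hint values.
class RetryDelayClampDemo {
    public static void main(String[] args) {
        System.out.println(Math.clamp(30L, 100, 5_000));     // 100  -- never spin in a tight retry loop
        System.out.println(Math.clamp(90_000L, 100, 5_000)); // 5000 -- never stall the crawl for minutes
        System.out.println(Math.clamp(2_500L, 100, 5_000));  // 2500 -- an in-range hint is honoured as-is
    }
}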
@@ -3,7 +3,6 @@ package nu.marginalia.crawl.retreival;
import com.google.common.hash.HashFunction;
import com.google.common.hash.Hashing;
import crawlercommons.robots.SimpleRobotRules;
import lombok.SneakyThrows;
import nu.marginalia.atags.model.DomainLinks;
import nu.marginalia.contenttype.ContentType;
import nu.marginalia.crawl.retreival.fetcher.ContentTags;
@@ -19,6 +18,7 @@ import nu.marginalia.ip_blocklist.UrlBlocklist;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawlspec.CrawlSpecRecord;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -32,6 +32,7 @@ import java.util.*;
public class CrawlerRetreiver implements AutoCloseable {

    private static final int MAX_ERRORS = 20;
    private static final int HTTP_429_RETRY_LIMIT = 1; // Retry 429s once

    private final HttpFetcher fetcher;

@@ -40,7 +41,6 @@
    private static final LinkParser linkParser = new LinkParser();
    private static final Logger logger = LoggerFactory.getLogger(CrawlerRetreiver.class);

    private static final HashFunction hashMethod = Hashing.murmur3_128(0);
    private static final UrlBlocklist urlBlocklist = new UrlBlocklist();
    private static final LinkFilterSelector linkFilterSelector = new LinkFilterSelector();

@@ -104,7 +104,7 @@
        resync.run(warcFile);
    }

    private int crawlDomain(CrawlDataReference oldCrawlData, DomainProber.ProbeResult probeResult, DomainLinks domainLinks) throws IOException {
    private int crawlDomain(CrawlDataReference oldCrawlData, DomainProber.ProbeResult probeResult, DomainLinks domainLinks) throws IOException, InterruptedException {
        String ip = findIp(domain);

        EdgeUrl rootUrl;
@@ -124,7 +124,7 @@
        final SimpleRobotRules robotsRules = fetcher.fetchRobotRules(crawlFrontier.peek().domain, warcRecorder);
        final CrawlDelayTimer delayTimer = new CrawlDelayTimer(robotsRules.getCrawlDelay());

        sniffRootDocument(delayTimer, rootUrl);
        sniffRootDocument(rootUrl);

        // Play back the old crawl data (if present) and fetch the documents comparing etags and last-modified
        int recrawled = recrawl(oldCrawlData, robotsRules, delayTimer);
@@ -181,8 +181,14 @@
                continue;


            if (fetchWriteAndSleep(top, delayTimer, DocumentWithReference.empty()).isOk()) {
                fetchedCount++;
            try {
                if (fetchWriteAndSleep(top, delayTimer, DocumentWithReference.empty()).isOk()) {
                    fetchedCount++;
                }
            }
            catch (InterruptedException ex) {
                Thread.currentThread().interrupt();
                break;
            }
        }

@@ -192,17 +198,17 @@
    }

    /** Using the old crawl data, fetch the documents comparing etags and last-modified */
    private int recrawl(CrawlDataReference oldCrawlData, SimpleRobotRules robotsRules, CrawlDelayTimer delayTimer) {
    private int recrawl(CrawlDataReference oldCrawlData, SimpleRobotRules robotsRules, CrawlDelayTimer delayTimer) throws InterruptedException {
        return crawlerRevisitor.recrawl(oldCrawlData, robotsRules, delayTimer);
    }

    private void sniffRootDocument(CrawlDelayTimer delayTimer, EdgeUrl rootUrl) {
    private void sniffRootDocument(EdgeUrl rootUrl) {
        try {
            logger.debug("Configuring link filter");

            var url = rootUrl.withPathAndParam("/", null);

            var result = tryDownload(url, delayTimer, ContentTags.empty());
            var result = fetcher.fetchContent(url, warcRecorder, ContentTags.empty());
            if (!(result instanceof HttpFetchResult.ResultOk ok))
                return;

@@ -239,22 +245,28 @@
    }

    public HttpFetchResult fetchWriteAndSleep(EdgeUrl top,
                                              CrawlDelayTimer timer,
                                              DocumentWithReference reference) {
                                              CrawlDelayTimer timer,
                                              DocumentWithReference reference) throws InterruptedException
    {
        logger.debug("Fetching {}", top);

        HttpFetchResult fetchedDoc = new HttpFetchResult.ResultNone();

        long startTime = System.currentTimeMillis();

        var contentTags = reference.getContentTags();
        var fetchedDoc = tryDownload(top, timer, contentTags);

        if (fetchedDoc instanceof HttpFetchResult.Result304Raw) {
            var doc = reference.doc();
            if (doc != null) {
                warcRecorder.writeReferenceCopy(top, doc.contentType, doc.httpStatus, doc.documentBody);
                fetchedDoc = new HttpFetchResult.Result304ReplacedWithReference(doc.url,
                        new ContentType(doc.contentType, "UTF-8"),
                        doc.documentBody);
        // Fetch the document, retrying if we get a rate limit exception
        for (int i = 0; i <= HTTP_429_RETRY_LIMIT; i++) {
            try {
                fetchedDoc = fetcher.fetchContent(top, warcRecorder, contentTags);
                break;
            }
            catch (RateLimitException ex) {
                timer.waitRetryDelay(ex);
            }
            catch (Exception ex) {
                logger.warn("Failed to fetch {}", top, ex);
                fetchedDoc = new HttpFetchResult.ResultException(ex);
            }
        }

@@ -268,14 +280,19 @@
            crawlFrontier.addVisited(new EdgeUrl(ok.uri()));
            }
        }
        else if (fetchedDoc instanceof HttpFetchResult.Result304ReplacedWithReference retained) {
            var docOpt = retained.parseDocument();
            if (docOpt.isPresent()) {
                var doc = docOpt.get();
        else if (fetchedDoc instanceof HttpFetchResult.Result304Raw && reference.doc() != null) {
            var doc = reference.doc();

                crawlFrontier.enqueueLinksFromDocument(top, doc);
                EdgeUrl.parse(retained.url()).ifPresent(crawlFrontier::addVisited);
            }
            warcRecorder.writeReferenceCopy(top, doc.contentType, doc.httpStatus, doc.documentBody);

            fetchedDoc = new HttpFetchResult.Result304ReplacedWithReference(doc.url,
                    new ContentType(doc.contentType, "UTF-8"),
                    doc.documentBody);

            var parsed = Jsoup.parse(doc.documentBody);

            crawlFrontier.enqueueLinksFromDocument(top, parsed);
            crawlFrontier.addVisited(top);
        }
        else if (fetchedDoc instanceof HttpFetchResult.ResultException ex) {
            errorCount ++;
@@ -285,7 +302,7 @@
            logger.error("Error parsing document {}", top, ex);
        }

        timer.delay(System.currentTimeMillis() - startTime);
        timer.waitFetchDelay(System.currentTimeMillis() - startTime);

        return fetchedDoc;
    }
@@ -295,33 +312,6 @@
            || proto.equalsIgnoreCase("https");
    }

    @SneakyThrows
    private HttpFetchResult tryDownload(EdgeUrl top, CrawlDelayTimer timer, ContentTags tags) {
        for (int i = 0; i < 2; i++) {
            try {
                return fetcher.fetchContent(top, warcRecorder, tags);
            }
            catch (RateLimitException ex) {
                timer.slowDown();

                int delay = ex.retryAfter();
                if (delay > 0 && delay < 5000) {
                    Thread.sleep(delay);
                }
            }
            catch (Exception ex) {
                logger.warn("Failed to fetch {}", top, ex);
                return new HttpFetchResult.ResultException(ex);
            }
        }

        return new HttpFetchResult.ResultNone();
    }

    private String createHash(String documentBodyHash) {
        return hashMethod.hashUnencodedChars(documentBodyHash).toString();
    }

    // FIXME this does not belong in the crawler
    private Optional<EdgeUrl> findCanonicalUrl(EdgeUrl baseUrl, Document parsed) {
        baseUrl = baseUrl.domain.toRootUrl();
@@ -34,6 +34,7 @@ public class CrawlerWarcResynchronizer {
        // First pass, enqueue links
        try (var reader = new WarcReader(tempFile)) {
            WarcXResponseReference.register(reader);
            WarcXEntityRefused.register(reader);

            for (var item : reader) {
                accept(item);
@@ -58,13 +59,26 @@
                response(rsp);
            } else if (item instanceof WarcRequest req) {
                request(req);
            } else if (item instanceof WarcXEntityRefused refused) {
                refused(refused);
            }

        }
        catch (Exception ex) {
            logger.info(STR."Failed to process warc record \{item}", ex);
        }
    }

    private void refused(WarcXEntityRefused refused) {
        // In general, we don't want to re-crawl urls that were refused,
        // but to permit circumstances to change over time, we'll
        // allow for a small chance of re-probing these entries

        if (Math.random() > 0.1) {
            crawlFrontier.addVisited(new EdgeUrl(refused.targetURI()));
        }
    }

    private void request(WarcRequest request) {
        EdgeUrl.parse(request.target()).ifPresent(crawlFrontier::addVisited);
    }
@@ -50,9 +50,14 @@ public class DomainCrawlFrontier {
        }
    }

    /** Increase the depth of the crawl by a factor. If the current depth is smaller
     * than the number of already visited documents, the base depth will be adjusted
     * to the visited count first.
     */
    public void increaseDepth(double depthIncreaseFactor) {
        depth = (int)(depth * depthIncreaseFactor);
        depth = (int)(Math.max(visited.size(), depth) * depthIncreaseFactor);
    }

    public void setLinkFilter(Predicate<EdgeUrl> linkFilter) {
        this.linkFilter = linkFilter;
    }
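To make the increaseDepth change concrete, a small worked example with hypothetical numbers comparing the old and the new formula:

// Worked example with hypothetical numbers: configured depth 100, 150 documents already
// visited, growth factor 1.5.
class DepthIncreaseDemo {
    public static void main(String[] args) {
        int depth = 100;
        int visitedSize = 150;   // stands in for visited.size()
        double factor = 1.5;

        int oldDepth = (int) (depth * factor);                         // 150: barely past what was already crawled
        int newDepth = (int) (Math.max(visitedSize, depth) * factor);  // 225: real headroom beyond the visited set

        System.out.println(oldDepth + " -> " + newDepth);
    }
}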
@@ -20,7 +20,10 @@ public class WarcProtocolReconstructor {

    static String getHttpRequestString(Request request, URI uri) {
        StringBuilder requestStringBuilder = new StringBuilder();
        requestStringBuilder.append(request.method()).append(" ").append(URLEncoder.encode(uri.getPath(), StandardCharsets.UTF_8));

        final String encodedURL = encodeURLKeepSlashes(uri.getPath());

        requestStringBuilder.append(request.method()).append(" ").append(encodedURL);

        if (uri.getQuery() != null) {
            requestStringBuilder.append("?").append(uri.getQuery());
@@ -37,6 +40,19 @@
        return requestStringBuilder.toString();
    }

    /** Java's URLEncoder will URLEncode slashes, which is not desirable
     * when sanitizing a URL for HTTP protocol purposes
     */
    private static String encodeURLKeepSlashes(String URL) {
        String[] parts = StringUtils.split(URL,"/");
        StringJoiner joiner = new StringJoiner("/");
        for (String part : parts) {
            joiner.add(URLEncoder.encode(part, StandardCharsets.UTF_8));
        }
        return joiner.toString();
    }

    static String getResponseHeader(String headersAsString, int code) {
        String version = "1.1";

@@ -131,6 +147,11 @@
        if (headerCapitalized.startsWith("X-Marginalia"))
            return;

        // Omit Transfer-Encoding header, as we'll be using Content-Length
        // instead in the warc file, despite what the server says
        if (headerCapitalized.startsWith("Transfer-Encoding"))
            return;

        for (var value : values) {
            joiner.add(headerCapitalized + ": " + value);
        }
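To illustrate why the request path is encoded per segment rather than as a whole, a small hedged demo (hypothetical path; plain String.split is used here in place of the StringUtils.split call in the diff):

import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;
import java.util.StringJoiner;

class KeepSlashesDemo {
    public static void main(String[] args) {
        String path = "/docs/some page.html";   // hypothetical path with a space in one segment

        // Encoding the whole path also encodes the separators:
        System.out.println(URLEncoder.encode(path, StandardCharsets.UTF_8));
        // -> %2Fdocs%2Fsome+page.html  (the slashes become %2F, which breaks the request line)

        // Encoding segment by segment keeps the path structure intact:
        StringJoiner joiner = new StringJoiner("/");
        for (String part : path.split("/")) {
            joiner.add(URLEncoder.encode(part, StandardCharsets.UTF_8));
        }
        System.out.println(joiner);
        // -> /docs/some+page.html  (the diff uses StringUtils.split rather than String.split)
    }
}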
@@ -29,11 +29,6 @@ import java.util.*;
 * be reconstructed.
 */
public class WarcRecorder implements AutoCloseable {
    public static final URI documentRobotsTxtSkippedURN = URI.create("urn:marginalia/meta/doc/robots-txt-skipped");
    public static final URI documentBadContentTypeURN = URI.create("urn:marginalia/meta/doc/content-type-failed-probe");
    public static final URI documentProbeTimeout = URI.create("urn:marginalia/meta/doc/timeout-probe");
    public static final URI documentUnspecifiedError = URI.create("urn:marginalia/meta/doc/error");

    private static final int MAX_TIME = 30_000;
    private static final int MAX_SIZE = 1024 * 1024 * 10;
    private final WarcWriter writer;
@@ -91,6 +86,8 @@

        ResponseDataBuffer responseDataBuffer = new ResponseDataBuffer();

        boolean hasCookies = !client.cookieJar().loadForRequest(request.url()).isEmpty();

        try (var response = call.execute()) {
            var body = response.body();
            InputStream inputStream;
@@ -143,6 +140,7 @@

            WarcResponse.Builder responseBuilder = new WarcResponse.Builder(responseUri)
                    .blockDigest(responseDigestBuilder.build())
                    .addHeader("X-Has-Cookies", hasCookies ? "1" : "0")
                    .date(date)
                    .body(MediaType.HTTP_RESPONSE, responseDataBuffer.copyBytes());

@@ -280,11 +278,11 @@

    public void flagAsRobotsTxtError(EdgeUrl top) {
        try {
            WarcRevisit revisit = new WarcRevisit.Builder(top.asURI(), documentRobotsTxtSkippedURN)
            WarcXEntityRefused refusal = new WarcXEntityRefused.Builder(top.asURI(), WarcXEntityRefused.documentRobotsTxtSkippedURN)
                    .date(Instant.now())
                    .build();

            writer.write(revisit);
            writer.write(refusal);
        } catch (URISyntaxException | IOException e) {
            throw new RuntimeException(e);
        }
@@ -292,13 +290,13 @@

    public void flagAsFailedContentTypeProbe(EdgeUrl url, String contentType, int status) {
        try {
            WarcRevisit revisit = new WarcRevisit.Builder(url.asURI(), documentBadContentTypeURN)
            WarcXEntityRefused refusal = new WarcXEntityRefused.Builder(url.asURI(), WarcXEntityRefused.documentBadContentTypeURN)
                    .date(Instant.now())
                    .addHeader("Rejected-Content-Type", contentType)
                    .addHeader("Http-Status", Integer.toString(status))
                    .build();

            writer.write(revisit);
            writer.write(refusal);
        } catch (URISyntaxException | IOException e) {
            throw new RuntimeException(e);
        }
@@ -306,13 +304,13 @@

    public void flagAsError(EdgeUrl url, Exception ex) {
        try {
            WarcRevisit revisit = new WarcRevisit.Builder(url.asURI(), documentUnspecifiedError)
            WarcXEntityRefused refusal = new WarcXEntityRefused.Builder(url.asURI(), WarcXEntityRefused.documentUnspecifiedError)
                    .date(Instant.now())
                    .addHeader("Exception", ex.getClass().getSimpleName())
                    .addHeader("ErrorMessage", Objects.requireNonNullElse(ex.getMessage(), ""))
                    .build();

            writer.write(revisit);
            writer.write(refusal);
        } catch (URISyntaxException | IOException e) {
            throw new RuntimeException(e);
        }
@@ -320,11 +318,11 @@

    public void flagAsTimeout(EdgeUrl url) {
        try {
            WarcRevisit revisit = new WarcRevisit.Builder(url.asURI(), documentProbeTimeout)
            WarcXEntityRefused refusal = new WarcXEntityRefused.Builder(url.asURI(), WarcXEntityRefused.documentProbeTimeout)
                    .date(Instant.now())
                    .build();

            writer.write(revisit);
            writer.write(refusal);
        } catch (URISyntaxException | IOException e) {
            throw new RuntimeException(e);
        }
@@ -15,13 +15,6 @@ import org.jsoup.Jsoup;
 * E-Tag and Last-Modified headers.
 */
public class CrawlerRevisitor {
    /** recrawlState tag for documents that had a HTTP status 304 */
    public static final String documentWasRetainedTag = "RETAINED/304";

    /** recrawlState tag for documents that had a 200 status but were identical to a previous version */
    public static final String documentWasSameTag = "SAME-BY-COMPARISON";


    private final DomainCrawlFrontier crawlFrontier;
    private final CrawlerRetreiver crawlerRetreiver;
    private final WarcRecorder warcRecorder;
@@ -37,7 +30,8 @@
    /** Performs a re-crawl of old documents, comparing etags and last-modified */
    public int recrawl(CrawlDataReference oldCrawlData,
                       SimpleRobotRules robotsRules,
                       CrawlDelayTimer delayTimer) {
                       CrawlDelayTimer delayTimer)
            throws InterruptedException {
        int recrawled = 0;
        int retained = 0;
