(crawler) Move content type/charset sniffing to a separate microlibrary
This functionality needs to be accessed by the WarcSideloader, which is in the converter. The resultant microlibrary is tiny, but I think in this case it's justifiable.
parent 2d5d11645d
commit 064265b0b9
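
For orientation, here is a minimal sketch of how a consumer such as the WarcSideloader might wire the two entry points of the new microlibrary together. Only the two static calls are taken from the diff below; the class name, header string, and body bytes are hypothetical placeholders:

    import nu.marginalia.contenttype.ContentType;
    import nu.marginalia.contenttype.ContentTypeParser;
    import nu.marginalia.contenttype.DocumentBodyToString;

    import java.nio.charset.StandardCharsets;

    class ContentTypeUsageSketch {
        public static void main(String[] args) {
            // Hypothetical inputs, e.g. from a WARC record or an HTTP response
            String contentTypeHeader = "text/html; charset=UTF-8";
            byte[] body = "<!DOCTYPE html><html><body>Hi</body></html>"
                    .getBytes(StandardCharsets.UTF_8);

            // Best-effort sniffing: the header is consulted first, then the body
            ContentType type = ContentTypeParser.parseContentType(contentTypeHeader, body);

            // Decode the body using the sniffed charset, with fallbacks for bogus names
            String text = DocumentBodyToString.getStringData(type, body);

            System.out.println(type.contentType() + " / " + type.charset()); // text/html / UTF-8
            System.out.println(text);
        }
    }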
code/features-crawl/content-type/build.gradle (new file, 28 lines)
@@ -0,0 +1,28 @@
+plugins {
+    id 'java'
+
+
+    id 'jvm-test-suite'
+}
+
+java {
+    toolchain {
+        languageVersion.set(JavaLanguageVersion.of(21))
+    }
+}
+
+dependencies {
+    implementation project(':code:common:model')
+    implementation libs.crawlercommons
+    implementation libs.notnull
+
+    implementation libs.bundles.gson
+    implementation libs.bundles.slf4j
+    testImplementation libs.bundles.slf4j.test
+
+    implementation libs.jsoup
+
+    testImplementation libs.bundles.slf4j.test
+    testImplementation libs.bundles.junit
+    testImplementation libs.mockito
+}
ContentType.java (new file in nu.marginalia.contenttype)
@@ -0,0 +1,9 @@
+package nu.marginalia.contenttype;
+
+/** Content type and charset of a document
+ * @param contentType The content type, e.g. "text/html"
+ * @param charset The charset, e.g. "UTF-8"
+ */
+public record ContentType(String contentType, String charset) {
+
+}
ContentTypeParser.java (moved from nu.marginalia.crawl.retreival.logic to nu.marginalia.contenttype)
@@ -1,7 +1,8 @@
-package nu.marginalia.crawl.retreival.logic;
+package nu.marginalia.contenttype;
 
 import crawlercommons.mimetypes.MimeTypeDetector;
-import nu.marginalia.crawling.model.ContentType;
+import org.jetbrains.annotations.NotNull;
+import org.jetbrains.annotations.Nullable;
 import org.jsoup.Jsoup;
 
 import java.util.Arrays;
@@ -11,28 +12,40 @@ public class ContentTypeParser {
 
     static final MimeTypeDetector mimeTypeDetector = new MimeTypeDetector();
 
-    public static ContentType parse(String contentType, byte[] data) {
-        return getContentTypeFromContentTypeString(contentType)
-                .or(() -> getContentTypeStringFromTag(data))
+    /** Parse the content type and charset from a content type header and/or the body of a document,
+     * best effort
+     */
+    public static ContentType parseContentType(
+            @Nullable String contentTypeHeader,
+            @NotNull byte[] body)
+    {
+        return getContentTypeFromContentTypeString(contentTypeHeader)
+                .or(() -> getContentTypeStringFromTag(body))
                 .orElseGet(() -> {
-                    Optional<String> charset = getCharsetFromTag(data);
+                    Optional<String> charset = getCharsetFromTag(body);
                     return new ContentType(
-                            Optional.ofNullable(contentType)
-                                    .or(() -> Optional.ofNullable(mimeTypeDetector.detect(data)))
-                                    .orElseGet(() -> ContentTypeParser.shittyMimeSniffer(data)), charset.orElse("ISO_8859_1"));
+                            Optional.ofNullable(contentTypeHeader)
+                                    .or(() -> Optional.ofNullable(mimeTypeDetector.detect(body)))
+                                    .orElseGet(() -> ContentTypeParser.shittyMimeSniffer(body)), charset.orElse("ISO_8859_1"));
                 });
     }
 
-    private static Optional<ContentType> getContentTypeFromContentTypeString(String contentType) {
-        if (contentType != null && contentType.contains(";")) {
-            var parts = contentType.split(";");
-            var content = parts[0].trim();
-            var extra = parts[1].trim();
-            if (extra.startsWith("charset=")) {
-                return Optional.of(new ContentType(content, extra.substring("charset=".length())));
-            }
-        }
-        return Optional.empty();
+    /** Parse the content type and charset from a content type string. */
+    private static Optional<ContentType> getContentTypeFromContentTypeString(@Nullable String contentType) {
+        if (contentType == null)
+            return Optional.empty();
+
+        if (!contentType.contains(";"))
+            return Optional.empty();
+
+        var parts = contentType.split(";");
+        var content = parts[0].trim();
+        var extra = parts[1].trim();
+
+        if (!extra.startsWith("charset="))
+            return Optional.empty();
+
+        return Optional.of(new ContentType(content, extra.substring("charset=".length())));
     }
 
     private static String shittyMimeSniffer(byte[] data) {
@@ -45,6 +58,7 @@ public class ContentTypeParser {
 
         String startStr = new String(Arrays.copyOf(data, Math.min(128, data.length))).trim().toLowerCase();
         if (startStr.contains("<!doctype html") || startStr.contains("<html")) {
+            // note we use contains here, since xhtml may be served with a <?xml-style header first
             return "text/html";
         }
         else {
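
To make the fallback chain in parseContentType concrete: a header that carries a charset parameter is taken at face value and the body is never consulted, while a header without one keeps its content type but defers charset detection to the body, bottoming out at ISO_8859_1. A small sketch, assuming (as the tests below confirm) that the tag-based sniffers find nothing in a plain-text body:

    byte[] html = "<!DOCTYPE html><html></html>".getBytes();

    // Charset present in the header: parsed directly from the string
    ContentTypeParser.parseContentType("text/html; charset=UTF-8", html);
    // -> ContentType[contentType=text/html, charset=UTF-8]

    // No charset parameter: the content type is kept, the charset defaults
    ContentTypeParser.parseContentType("text/plain", "just text".getBytes());
    // -> ContentType[contentType=text/plain, charset=ISO_8859_1]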
DocumentBodyToString.java (new file in nu.marginalia.contenttype)
@@ -0,0 +1,27 @@
+package nu.marginalia.contenttype;
+
+import java.nio.charset.*;
+
+public class DocumentBodyToString {
+
+    /** Get the string data from a document body, given the content type and charset */
+    public static String getStringData(ContentType type, byte[] data) {
+        Charset charset;
+        try {
+            charset = Charset.forName(type.charset());
+        }
+        catch (IllegalCharsetNameException ex) {
+            // Fall back to UTF-8 if we don't understand what this is. It's *probably* fine? Maybe?
+            charset = StandardCharsets.UTF_8;
+        }
+        catch (UnsupportedCharsetException ex) {
+            // This is usually like Macintosh Latin
+            // (https://en.wikipedia.org/wiki/Macintosh_Latin_encoding)
+            //
+            // It's close enough to 8859-1 to serve
+            charset = StandardCharsets.ISO_8859_1;
+        }
+
+        return new String(data, charset);
+    }
+}
ContentTypeParserTest.java (new file in nu.marginalia.contenttype)
@@ -0,0 +1,50 @@
+package nu.marginalia.contenttype;
+
+import org.junit.jupiter.api.Test;
+
+import java.nio.charset.StandardCharsets;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+
+public class ContentTypeParserTest {
+
+    @Test
+    public void testParseContentTypeWithHeader() {
+        byte[] body = "<!DOCTYPE html><html><head><title>Title</title></head><body></body></html>".getBytes(StandardCharsets.UTF_8);
+        String contentTypeHeader = "text/html; charset=UTF-8";
+        ContentType result = ContentTypeParser.parseContentType(contentTypeHeader, body);
+        assertNotNull(result);
+        assertEquals("text/html", result.contentType());
+        assertEquals("UTF-8", result.charset());
+    }
+
+    @Test
+    public void testParseContentTypeWithMetaCharset() {
+        byte[] body = "<!DOCTYPE html><html><head><meta charset=\"UTF-8\"><title>Title</title></head><body></body></html>".getBytes(StandardCharsets.UTF_8);
+        ContentType result = ContentTypeParser.parseContentType(null, body);
+        assertNotNull(result);
+        assertEquals("text/html", result.contentType());
+        assertEquals("UTF-8", result.charset());
+    }
+
+    @Test
+    public void testParseContentTypeWithHeaderValueAbsent() {
+        byte[] body = "Some random text.".getBytes(StandardCharsets.UTF_8);
+        String contentTypeHeader = "text/plain";
+        ContentType result = ContentTypeParser.parseContentType(contentTypeHeader, body);
+        assertNotNull(result);
+        assertEquals("text/plain", result.contentType());
+        assertEquals("ISO_8859_1", result.charset());
+    }
+
+    @Test
+    public void testParseContentTypeWithBinaryData() {
+        byte[] body = new byte[128];
+        body[0] = 31; // ascii value less than 32
+        ContentType result = ContentTypeParser.parseContentType(null, body);
+        assertNotNull(result);
+        assertEquals("application/binary", result.contentType());
+        assertEquals("ISO_8859_1", result.charset());
+    }
+}
DocumentBodyToStringTest.java (new file in nu.marginalia.contenttype)
@@ -0,0 +1,48 @@
+package nu.marginalia.contenttype;
+
+import org.junit.jupiter.api.Test;
+import static org.junit.jupiter.api.Assertions.*;
+
+import java.nio.charset.StandardCharsets;
+
+public class DocumentBodyToStringTest {
+    @Test
+    public void testGetStringData_onUTF8(){
+
+        ContentType type = new ContentType("text/html", "UTF-8");
+
+        String expected = "Hello, World!";
+        byte[] data = expected.getBytes(StandardCharsets.UTF_8);
+
+        String result = DocumentBodyToString.getStringData(type, data);
+
+        assertEquals(expected, result, "Result should match the expected string");
+    }
+
+    @Test
+    public void testGetStringData_onIllegalCharsetName(){
+
+        ContentType type = new ContentType("text/html", "unsupportedname");
+
+        String expected = "Hello, World!";
+        byte[] data = expected.getBytes(StandardCharsets.UTF_8);
+
+        String result = DocumentBodyToString.getStringData(type, data);
+
+        assertEquals(expected, result, "Result should match the expected string if charset is illegal name");
+    }
+
+    @Test
+    public void testGetStringData_onUnsupportedCharset(){
+
+        ContentType type = new ContentType("text/html", "Macintosh");
+
+        String expected = "Hello, World!";
+        byte[] data = expected.getBytes(StandardCharsets.UTF_8);
+
+        String result = DocumentBodyToString.getStringData(type, data);
+
+        assertEquals(expected, result, "Result should fall back to UTF-8 parsing if charset is unsupported");
+    }
+
+}
ContentType.java (deleted from nu.marginalia.crawling.model)
@@ -1,5 +0,0 @@
-package nu.marginalia.crawling.model;
-
-
-public record ContentType(String contentType, String charset) {
-}
build.gradle
@@ -41,6 +41,7 @@ dependencies {
     implementation project(':code:features-convert:anchor-keywords')
     implementation project(':code:features-crawl:crawl-blocklist')
     implementation project(':code:features-crawl:link-parser')
+    implementation project(':code:features-crawl:content-type')
 
     implementation libs.bundles.slf4j
 
HttpFetcherImpl.java
@@ -5,17 +5,17 @@ import com.google.inject.name.Named;
 import crawlercommons.robots.SimpleRobotRules;
 import crawlercommons.robots.SimpleRobotRulesParser;
 import lombok.SneakyThrows;
+import nu.marginalia.contenttype.DocumentBodyToString;
 import nu.marginalia.crawl.retreival.Cookies;
 import nu.marginalia.crawl.retreival.RateLimitException;
 import nu.marginalia.crawling.model.CrawledDocument;
 import nu.marginalia.crawling.model.CrawlerDocumentStatus;
-import nu.marginalia.crawling.model.ContentType;
+import nu.marginalia.contenttype.ContentType;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.crawl.retreival.logic.ContentTypeLogic;
-import nu.marginalia.crawl.retreival.logic.ContentTypeParser;
+import nu.marginalia.contenttype.ContentTypeParser;
 import okhttp3.*;
 import org.apache.commons.collections4.queue.PredicatedQueue;
 import org.apache.commons.io.input.BOMInputStream;
 import org.apache.commons.lang3.StringUtils;
 import org.slf4j.Logger;
@@ -281,7 +281,7 @@ public class HttpFetcherImpl implements HttpFetcher {
 
             byte[] data = byteStream.readNBytes(maxFetchSize);
 
-            var contentType = ContentTypeParser.parse(contentTypeHeader, data);
+            var contentType = ContentTypeParser.parseContentType(contentTypeHeader, data);
             if (!contentTypeLogic.isAllowableContentType(contentType.contentType())) {
                 return createErrorResponse(url, rsp, CrawlerDocumentStatus.BAD_CONTENT_TYPE, "");
             }
@@ -301,7 +301,8 @@ public class HttpFetcherImpl implements HttpFetcher {
                     .build();
             }
 
-            var strData = getStringData(data, contentType);
+            var strData = DocumentBodyToString.getStringData(contentType, data);
+
             var canonical = rsp.header("rel=canonical", "");
 
             return CrawledDocument.builder()
@@ -363,24 +364,6 @@ public class HttpFetcherImpl implements HttpFetcher {
         return isPermittedGeneral;
     }
 
-    private String getStringData(byte[] data, ContentType contentType) {
-        Charset charset;
-        try {
-            charset = Charset.forName(contentType.charset());
-        }
-        catch (IllegalCharsetNameException ex) {
-            charset = StandardCharsets.UTF_8;
-        }
-        catch (UnsupportedCharsetException ex) {
-            // This is usually like Macintosh Latin
-            // (https://en.wikipedia.org/wiki/Macintosh_Latin_encoding)
-            //
-            // It's close enough to 8859-1 to serve
-            charset = StandardCharsets.ISO_8859_1;
-        }
-        return new String(data, charset);
-    }
-
     private CrawledDocument createRedirectResponse(EdgeUrl url, Response rsp, EdgeUrl responseUrl) {
 
         return CrawledDocument.builder()
settings.gradle
@@ -40,6 +40,7 @@ include 'code:features-convert:topic-detection'
 
 include 'code:features-crawl:crawl-blocklist'
 include 'code:features-crawl:link-parser'
+include 'code:features-crawl:content-type'
 
 include 'code:features-index:index-journal'
 include 'code:features-index:index-query'