(crawler) Move content type/charset sniffing to a separate microlibrary

This functionality needs to be accessed by the WarcSideloader, which lives in the converter. The resulting microlibrary is tiny, but I think the split is justifiable in this case.
Viktor Lofgren 2023-12-07 15:16:37 +01:00
parent 2d5d11645d
commit 064265b0b9
10 changed files with 203 additions and 47 deletions
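For reference, this is roughly how the converter side (e.g. the WarcSideloader) is expected to call into the new library. The wrapper class and method below are only illustrative; the nu.marginalia.contenttype types and calls are the ones introduced in this commit:

    import nu.marginalia.contenttype.ContentType;
    import nu.marginalia.contenttype.ContentTypeParser;
    import nu.marginalia.contenttype.DocumentBodyToString;

    class WarcDocumentDecoder {
        /** Best-effort decode of a fetched document body into a String. */
        String decode(String contentTypeHeader, byte[] body) {
            // Sniff content type and charset from the header and/or the document body
            ContentType contentType = ContentTypeParser.parseContentType(contentTypeHeader, body);

            // Decode the raw bytes using the detected charset; unknown or unsupported
            // charset names fall back to UTF-8 or ISO-8859-1 inside the library
            return DocumentBodyToString.getStringData(contentType, body);
        }
    }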


@@ -0,0 +1,28 @@
plugins {
    id 'java'
    id 'jvm-test-suite'
}

java {
    toolchain {
        languageVersion.set(JavaLanguageVersion.of(21))
    }
}

dependencies {
    implementation project(':code:common:model')

    implementation libs.crawlercommons
    implementation libs.notnull

    implementation libs.bundles.gson

    implementation libs.bundles.slf4j
    testImplementation libs.bundles.slf4j.test

    implementation libs.jsoup

    testImplementation libs.bundles.slf4j.test
    testImplementation libs.bundles.junit
    testImplementation libs.mockito
}


@@ -0,0 +1,9 @@
package nu.marginalia.contenttype;

/** Content type and charset of a document
 * @param contentType The content type, e.g. "text/html"
 * @param charset The charset, e.g. "UTF-8"
 */
public record ContentType(String contentType, String charset) {
}


@@ -1,7 +1,8 @@
-package nu.marginalia.crawl.retreival.logic;
+package nu.marginalia.contenttype;

 import crawlercommons.mimetypes.MimeTypeDetector;
-import nu.marginalia.crawling.model.ContentType;
+import org.jetbrains.annotations.NotNull;
+import org.jetbrains.annotations.Nullable;
 import org.jsoup.Jsoup;

 import java.util.Arrays;
@@ -11,28 +12,40 @@ public class ContentTypeParser {

     static final MimeTypeDetector mimeTypeDetector = new MimeTypeDetector();

-    public static ContentType parse(String contentType, byte[] data) {
-        return getContentTypeFromContentTypeString(contentType)
-                .or(() -> getContentTypeStringFromTag(data))
+    /** Parse the content type and charset from a content type header and/or the body of a document,
+     * best effort
+     */
+    public static ContentType parseContentType(
+            @Nullable String contentTypeHeader,
+            @NotNull byte[] body)
+    {
+        return getContentTypeFromContentTypeString(contentTypeHeader)
+                .or(() -> getContentTypeStringFromTag(body))
                 .orElseGet(() -> {
-                    Optional<String> charset = getCharsetFromTag(data);
+                    Optional<String> charset = getCharsetFromTag(body);
                     return new ContentType(
-                            Optional.ofNullable(contentType)
-                                    .or(() -> Optional.ofNullable(mimeTypeDetector.detect(data)))
-                                    .orElseGet(() -> ContentTypeParser.shittyMimeSniffer(data)), charset.orElse("ISO_8859_1"));
+                            Optional.ofNullable(contentTypeHeader)
+                                    .or(() -> Optional.ofNullable(mimeTypeDetector.detect(body)))
+                                    .orElseGet(() -> ContentTypeParser.shittyMimeSniffer(body)), charset.orElse("ISO_8859_1"));
                 });
     }

-    private static Optional<ContentType> getContentTypeFromContentTypeString(String contentType) {
-        if (contentType != null && contentType.contains(";")) {
-            var parts = contentType.split(";");
-            var content = parts[0].trim();
-            var extra = parts[1].trim();
-            if (extra.startsWith("charset=")) {
-                return Optional.of(new ContentType(content, extra.substring("charset=".length())));
-            }
-        }
-        return Optional.empty();
+    /** Parse the charset from a content type string. */
+    private static Optional<ContentType> getContentTypeFromContentTypeString(@Nullable String contentType) {
+        if (contentType == null)
+            return Optional.empty();
+        if (!contentType.contains(";"))
+            return Optional.empty();
+        var parts = contentType.split(";");
+        var content = parts[0].trim();
+        var extra = parts[1].trim();
+        if (!extra.startsWith("charset="))
+            return Optional.empty();
+        return Optional.of(new ContentType(content, extra.substring("charset=".length())));
     }

     private static String shittyMimeSniffer(byte[] data) {
@@ -45,6 +58,7 @@ public class ContentTypeParser {
         String startStr = new String(Arrays.copyOf(data, Math.min(128, data.length))).trim().toLowerCase();
         if (startStr.contains("<!doctype html") || startStr.contains("<html")) {
+            // note we use contains here, since xhtml may be served with a <?xml-style header first
             return "text/html";
         }
         else {


@@ -0,0 +1,27 @@
package nu.marginalia.contenttype;

import java.nio.charset.*;

public class DocumentBodyToString {

    /** Get the string data from a document body, given the content type and charset */
    public static String getStringData(ContentType type, byte[] data) {
        Charset charset;
        try {
            charset = Charset.forName(type.charset());
        }
        catch (IllegalCharsetNameException ex) {
            // Fall back to UTF-8 if we don't understand what this is. It's *probably* fine? Maybe?
            charset = StandardCharsets.UTF_8;
        }
        catch (UnsupportedCharsetException ex) {
            // This is usually like Macintosh Latin
            // (https://en.wikipedia.org/wiki/Macintosh_Latin_encoding)
            //
            // It's close enough to 8859-1 to serve
            charset = StandardCharsets.ISO_8859_1;
        }

        return new String(data, charset);
    }
}
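The charset argument matters for any non-ASCII content. A small illustration (not part of the commit) of how the same bytes decode differently depending on the detected charset:

    import nu.marginalia.contenttype.ContentType;
    import nu.marginalia.contenttype.DocumentBodyToString;

    class CharsetIllustration {
        public static void main(String[] args) {
            // 0xE9 is 'é' in ISO-8859-1, but an invalid byte sequence in UTF-8
            byte[] data = { 'c', 'a', 'f', (byte) 0xE9 };

            String latin1 = DocumentBodyToString.getStringData(
                    new ContentType("text/html", "ISO-8859-1"), data);  // "café"
            String utf8 = DocumentBodyToString.getStringData(
                    new ContentType("text/html", "UTF-8"), data);       // "caf\uFFFD" (replacement character)

            System.out.println(latin1 + " / " + utf8);
        }
    }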


@@ -0,0 +1,50 @@
package nu.marginalia.contenttype;

import org.junit.jupiter.api.Test;

import java.nio.charset.StandardCharsets;

import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNotNull;

public class ContentTypeParserTest {

    @Test
    public void testParseContentTypeWithHeader() {
        byte[] body = "<!DOCTYPE html><html><head><title>Title</title></head><body></body></html>".getBytes(StandardCharsets.UTF_8);
        String contentTypeHeader = "text/html; charset=UTF-8";
        ContentType result = ContentTypeParser.parseContentType(contentTypeHeader, body);
        assertNotNull(result);
        assertEquals("text/html", result.contentType());
        assertEquals("UTF-8", result.charset());
    }

    @Test
    public void testParseContentTypeWithMetaCharset() {
        byte[] body = "<!DOCTYPE html><html><head><meta charset=\"UTF-8\"><title>Title</title></head><body></body></html>".getBytes(StandardCharsets.UTF_8);
        ContentType result = ContentTypeParser.parseContentType(null, body);
        assertNotNull(result);
        assertEquals("text/html", result.contentType());
        assertEquals("UTF-8", result.charset());
    }

    @Test
    public void testParseContentTypeWithHeaderValueAbsent() {
        byte[] body = "Some random text.".getBytes(StandardCharsets.UTF_8);
        String contentTypeHeader = "text/plain";
        ContentType result = ContentTypeParser.parseContentType(contentTypeHeader, body);
        assertNotNull(result);
        assertEquals("text/plain", result.contentType());
        assertEquals("ISO_8859_1", result.charset());
    }

    @Test
    public void testParseContentTypeWithBinaryData() {
        byte[] body = new byte[128];
        body[0] = 31; // ascii value less than 32
        ContentType result = ContentTypeParser.parseContentType(null, body);
        assertNotNull(result);
        assertEquals("application/binary", result.contentType());
        assertEquals("ISO_8859_1", result.charset());
    }
}


@@ -0,0 +1,48 @@
package nu.marginalia.contenttype;

import org.junit.jupiter.api.Test;

import static org.junit.jupiter.api.Assertions.*;

import java.nio.charset.StandardCharsets;

public class DocumentBodyToStringTest {

    @Test
    public void testGetStringData_onUTF8() {
        ContentType type = new ContentType("text/html", "UTF-8");
        String expected = "Hello, World!";
        byte[] data = expected.getBytes(StandardCharsets.UTF_8);

        String result = DocumentBodyToString.getStringData(type, data);

        assertEquals(expected, result, "Result should match the expected string");
    }

    @Test
    public void testGetStringData_onIllegalCharsetName() {
        ContentType type = new ContentType("text/html", "unsupportedname");
        String expected = "Hello, World!";
        byte[] data = expected.getBytes(StandardCharsets.UTF_8);

        String result = DocumentBodyToString.getStringData(type, data);

        assertEquals(expected, result, "Result should match the expected string if charset is illegal name");
    }

    @Test
    public void testGetStringData_onUnsupportedCharset() {
        ContentType type = new ContentType("text/html", "Macintosh");
        String expected = "Hello, World!";
        byte[] data = expected.getBytes(StandardCharsets.UTF_8);

        String result = DocumentBodyToString.getStringData(type, data);

        assertEquals(expected, result, "Result should fall back to UTF-8 parsing if charset is unsupported");
    }
}


@@ -1,5 +0,0 @@
-package nu.marginalia.crawling.model;
-
-public record ContentType(String contentType, String charset) {
-}


@@ -41,6 +41,7 @@ dependencies {
     implementation project(':code:features-convert:anchor-keywords')
     implementation project(':code:features-crawl:crawl-blocklist')
     implementation project(':code:features-crawl:link-parser')
+    implementation project(':code:features-crawl:content-type')

     implementation libs.bundles.slf4j


@@ -5,17 +5,17 @@ import com.google.inject.name.Named;
 import crawlercommons.robots.SimpleRobotRules;
 import crawlercommons.robots.SimpleRobotRulesParser;
 import lombok.SneakyThrows;
+import nu.marginalia.contenttype.DocumentBodyToString;
 import nu.marginalia.crawl.retreival.Cookies;
 import nu.marginalia.crawl.retreival.RateLimitException;
 import nu.marginalia.crawling.model.CrawledDocument;
 import nu.marginalia.crawling.model.CrawlerDocumentStatus;
-import nu.marginalia.crawling.model.ContentType;
+import nu.marginalia.contenttype.ContentType;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.crawl.retreival.logic.ContentTypeLogic;
-import nu.marginalia.crawl.retreival.logic.ContentTypeParser;
+import nu.marginalia.contenttype.ContentTypeParser;
 import okhttp3.*;
 import org.apache.commons.collections4.queue.PredicatedQueue;
 import org.apache.commons.io.input.BOMInputStream;
 import org.apache.commons.lang3.StringUtils;
 import org.slf4j.Logger;
@@ -281,7 +281,7 @@ public class HttpFetcherImpl implements HttpFetcher {
         byte[] data = byteStream.readNBytes(maxFetchSize);

-        var contentType = ContentTypeParser.parse(contentTypeHeader, data);
+        var contentType = ContentTypeParser.parseContentType(contentTypeHeader, data);
         if (!contentTypeLogic.isAllowableContentType(contentType.contentType())) {
             return createErrorResponse(url, rsp, CrawlerDocumentStatus.BAD_CONTENT_TYPE, "");
         }
@@ -301,7 +301,8 @@ public class HttpFetcherImpl implements HttpFetcher {
                     .build();
         }

-        var strData = getStringData(data, contentType);
+        var strData = DocumentBodyToString.getStringData(contentType, data);

         var canonical = rsp.header("rel=canonical", "");

         return CrawledDocument.builder()
@@ -363,24 +364,6 @@ public class HttpFetcherImpl implements HttpFetcher {
         return isPermittedGeneral;
     }

-    private String getStringData(byte[] data, ContentType contentType) {
-        Charset charset;
-        try {
-            charset = Charset.forName(contentType.charset());
-        }
-        catch (IllegalCharsetNameException ex) {
-            charset = StandardCharsets.UTF_8;
-        }
-        catch (UnsupportedCharsetException ex) {
-            // This is usually like Macintosh Latin
-            // (https://en.wikipedia.org/wiki/Macintosh_Latin_encoding)
-            //
-            // It's close enough to 8859-1 to serve
-            charset = StandardCharsets.ISO_8859_1;
-        }
-
-        return new String(data, charset);
-    }
-
     private CrawledDocument createRedirectResponse(EdgeUrl url, Response rsp, EdgeUrl responseUrl) {
         return CrawledDocument.builder()


@@ -40,6 +40,7 @@ include 'code:features-convert:topic-detection'
 include 'code:features-crawl:crawl-blocklist'
 include 'code:features-crawl:link-parser'
+include 'code:features-crawl:content-type'
 include 'code:features-index:index-journal'
 include 'code:features-index:index-query'