(crawler) Move content type/charset sniffing to a separate microlibrary
This functionality needs to be accessed by the WarcSideloader, which is in the converter. The resultant microlibrary is tiny, but I think in this case it's justifiable.
parent 2d5d11645d
commit 064265b0b9
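
For orientation, here is a minimal sketch of how a consumer such as the WarcSideloader might wire the two entry points of the new microlibrary together. Only the two static calls are taken from the diff below; the class name, header string, and body bytes are hypothetical placeholders:

    import nu.marginalia.contenttype.ContentType;
    import nu.marginalia.contenttype.ContentTypeParser;
    import nu.marginalia.contenttype.DocumentBodyToString;

    import java.nio.charset.StandardCharsets;

    class ContentTypeUsageSketch {
        public static void main(String[] args) {
            // Hypothetical inputs, e.g. from a WARC record or an HTTP response
            String contentTypeHeader = "text/html; charset=UTF-8";
            byte[] body = "<!DOCTYPE html><html><body>Hi</body></html>"
                    .getBytes(StandardCharsets.UTF_8);

            // Best-effort sniffing: the header is consulted first, then the body
            ContentType type = ContentTypeParser.parseContentType(contentTypeHeader, body);

            // Decode the body using the sniffed charset, with fallbacks for bogus names
            String text = DocumentBodyToString.getStringData(type, body);

            System.out.println(type.contentType() + " / " + type.charset()); // text/html / UTF-8
            System.out.println(text);
        }
    }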
code/features-crawl/content-type/build.gradle (new file, 28 lines)
@@ -0,0 +1,28 @@
+plugins {
+    id 'java'
+
+
+    id 'jvm-test-suite'
+}
+
+java {
+    toolchain {
+        languageVersion.set(JavaLanguageVersion.of(21))
+    }
+}
+
+dependencies {
+    implementation project(':code:common:model')
+    implementation libs.crawlercommons
+    implementation libs.notnull
+
+    implementation libs.bundles.gson
+    implementation libs.bundles.slf4j
+    testImplementation libs.bundles.slf4j.test
+
+    implementation libs.jsoup
+
+    testImplementation libs.bundles.slf4j.test
+    testImplementation libs.bundles.junit
+    testImplementation libs.mockito
+}
ContentType.java (new file in nu.marginalia.contenttype)
@@ -0,0 +1,9 @@
+package nu.marginalia.contenttype;
+
+/** Content type and charset of a document
+ * @param contentType The content type, e.g. "text/html"
+ * @param charset The charset, e.g. "UTF-8"
+ */
+public record ContentType(String contentType, String charset) {
+
+}
ContentTypeParser.java (moved from nu.marginalia.crawl.retreival.logic to nu.marginalia.contenttype)
@@ -1,7 +1,8 @@
-package nu.marginalia.crawl.retreival.logic;
+package nu.marginalia.contenttype;
 
 import crawlercommons.mimetypes.MimeTypeDetector;
-import nu.marginalia.crawling.model.ContentType;
+import org.jetbrains.annotations.NotNull;
+import org.jetbrains.annotations.Nullable;
 import org.jsoup.Jsoup;
 
 import java.util.Arrays;
@@ -11,28 +12,40 @@ public class ContentTypeParser {
 
     static final MimeTypeDetector mimeTypeDetector = new MimeTypeDetector();
 
-    public static ContentType parse(String contentType, byte[] data) {
-        return getContentTypeFromContentTypeString(contentType)
-                .or(() -> getContentTypeStringFromTag(data))
+    /** Parse the content type and charset from a content type header and/or the body of a document,
+     * best effort
+     */
+    public static ContentType parseContentType(
+            @Nullable String contentTypeHeader,
+            @NotNull byte[] body)
+    {
+        return getContentTypeFromContentTypeString(contentTypeHeader)
+                .or(() -> getContentTypeStringFromTag(body))
                 .orElseGet(() -> {
-                    Optional<String> charset = getCharsetFromTag(data);
+                    Optional<String> charset = getCharsetFromTag(body);
                     return new ContentType(
-                            Optional.ofNullable(contentType)
-                                    .or(() -> Optional.ofNullable(mimeTypeDetector.detect(data)))
-                                    .orElseGet(() -> ContentTypeParser.shittyMimeSniffer(data)), charset.orElse("ISO_8859_1"));
+                            Optional.ofNullable(contentTypeHeader)
+                                    .or(() -> Optional.ofNullable(mimeTypeDetector.detect(body)))
+                                    .orElseGet(() -> ContentTypeParser.shittyMimeSniffer(body)), charset.orElse("ISO_8859_1"));
                 });
     }
 
-    private static Optional<ContentType> getContentTypeFromContentTypeString(String contentType) {
-        if (contentType != null && contentType.contains(";")) {
-            var parts = contentType.split(";");
-            var content = parts[0].trim();
-            var extra = parts[1].trim();
-            if (extra.startsWith("charset=")) {
-                return Optional.of(new ContentType(content, extra.substring("charset=".length())));
-            }
-        }
-        return Optional.empty();
+    /** Parse the content type and charset from a content type string. */
+    private static Optional<ContentType> getContentTypeFromContentTypeString(@Nullable String contentType) {
+        if (contentType == null)
+            return Optional.empty();
+
+        if (!contentType.contains(";"))
+            return Optional.empty();
+
+        var parts = contentType.split(";");
+        var content = parts[0].trim();
+        var extra = parts[1].trim();
+
+        if (!extra.startsWith("charset="))
+            return Optional.empty();
+
+        return Optional.of(new ContentType(content, extra.substring("charset=".length())));
     }
 
     private static String shittyMimeSniffer(byte[] data) {
@@ -45,6 +58,7 @@ public class ContentTypeParser {
 
         String startStr = new String(Arrays.copyOf(data, Math.min(128, data.length))).trim().toLowerCase();
         if (startStr.contains("<!doctype html") || startStr.contains("<html")) {
+            // note we use contains here, since xhtml may be served with a <?xml-style header first
             return "text/html";
         }
         else {
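
To make the fallback chain in parseContentType concrete: a header that carries a charset parameter is taken at face value and the body is never consulted, while a header without one keeps its content type but defers charset detection to the body, bottoming out at ISO_8859_1. A small sketch, assuming (as the tests below confirm) that the tag-based sniffers find nothing in a plain-text body:

    byte[] html = "<!DOCTYPE html><html></html>".getBytes();

    // Charset present in the header: parsed directly from the string
    ContentTypeParser.parseContentType("text/html; charset=UTF-8", html);
    // -> ContentType[contentType=text/html, charset=UTF-8]

    // No charset parameter: the content type is kept, the charset defaults
    ContentTypeParser.parseContentType("text/plain", "just text".getBytes());
    // -> ContentType[contentType=text/plain, charset=ISO_8859_1]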
DocumentBodyToString.java (new file in nu.marginalia.contenttype)
@@ -0,0 +1,27 @@
+package nu.marginalia.contenttype;
+
+import java.nio.charset.*;
+
+public class DocumentBodyToString {
+
+    /** Get the string data from a document body, given the content type and charset */
+    public static String getStringData(ContentType type, byte[] data) {
+        Charset charset;
+        try {
+            charset = Charset.forName(type.charset());
+        }
+        catch (IllegalCharsetNameException ex) {
+            // Fall back to UTF-8 if we don't understand what this is. It's *probably* fine? Maybe?
+            charset = StandardCharsets.UTF_8;
+        }
+        catch (UnsupportedCharsetException ex) {
+            // This is usually like Macintosh Latin
+            // (https://en.wikipedia.org/wiki/Macintosh_Latin_encoding)
+            //
+            // It's close enough to 8859-1 to serve
+            charset = StandardCharsets.ISO_8859_1;
+        }
+
+        return new String(data, charset);
+    }
+}
ContentTypeParserTest.java (new file in nu.marginalia.contenttype)
@@ -0,0 +1,50 @@
+package nu.marginalia.contenttype;
+
+import org.junit.jupiter.api.Test;
+
+import java.nio.charset.StandardCharsets;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+
+public class ContentTypeParserTest {
+
+    @Test
+    public void testParseContentTypeWithHeader() {
+        byte[] body = "<!DOCTYPE html><html><head><title>Title</title></head><body></body></html>".getBytes(StandardCharsets.UTF_8);
+        String contentTypeHeader = "text/html; charset=UTF-8";
+        ContentType result = ContentTypeParser.parseContentType(contentTypeHeader, body);
+        assertNotNull(result);
+        assertEquals("text/html", result.contentType());
+        assertEquals("UTF-8", result.charset());
+    }
+
+    @Test
+    public void testParseContentTypeWithMetaCharset() {
+        byte[] body = "<!DOCTYPE html><html><head><meta charset=\"UTF-8\"><title>Title</title></head><body></body></html>".getBytes(StandardCharsets.UTF_8);
+        ContentType result = ContentTypeParser.parseContentType(null, body);
+        assertNotNull(result);
+        assertEquals("text/html", result.contentType());
+        assertEquals("UTF-8", result.charset());
+    }
+
+    @Test
+    public void testParseContentTypeWithHeaderValueAbsent() {
+        byte[] body = "Some random text.".getBytes(StandardCharsets.UTF_8);
+        String contentTypeHeader = "text/plain";
+        ContentType result = ContentTypeParser.parseContentType(contentTypeHeader, body);
+        assertNotNull(result);
+        assertEquals("text/plain", result.contentType());
+        assertEquals("ISO_8859_1", result.charset());
+    }
+
+    @Test
+    public void testParseContentTypeWithBinaryData() {
+        byte[] body = new byte[128];
+        body[0] = 31; // ascii value less than 32
+        ContentType result = ContentTypeParser.parseContentType(null, body);
+        assertNotNull(result);
+        assertEquals("application/binary", result.contentType());
+        assertEquals("ISO_8859_1", result.charset());
+    }
+}
DocumentBodyToStringTest.java (new file in nu.marginalia.contenttype)
@@ -0,0 +1,48 @@
+package nu.marginalia.contenttype;
+
+import org.junit.jupiter.api.Test;
+import static org.junit.jupiter.api.Assertions.*;
+
+import java.nio.charset.StandardCharsets;
+
+public class DocumentBodyToStringTest {
+    @Test
+    public void testGetStringData_onUTF8(){
+
+        ContentType type = new ContentType("text/html", "UTF-8");
+
+        String expected = "Hello, World!";
+        byte[] data = expected.getBytes(StandardCharsets.UTF_8);
+
+        String result = DocumentBodyToString.getStringData(type, data);
+
+        assertEquals(expected, result, "Result should match the expected string");
+    }
+
+    @Test
+    public void testGetStringData_onIllegalCharsetName(){
+
+        ContentType type = new ContentType("text/html", "unsupportedname");
+
+        String expected = "Hello, World!";
+        byte[] data = expected.getBytes(StandardCharsets.UTF_8);
+
+        String result = DocumentBodyToString.getStringData(type, data);
+
+        assertEquals(expected, result, "Result should match the expected string if charset is illegal name");
+    }
+
+    @Test
+    public void testGetStringData_onUnsupportedCharset(){
+
+        ContentType type = new ContentType("text/html", "Macintosh");
+
+        String expected = "Hello, World!";
+        byte[] data = expected.getBytes(StandardCharsets.UTF_8);
+
+        String result = DocumentBodyToString.getStringData(type, data);
+
+        assertEquals(expected, result, "Result should fall back to UTF-8 parsing if charset is unsupported");
+    }
+
+}
ContentType.java (deleted from nu.marginalia.crawling.model)
@@ -1,5 +0,0 @@
-package nu.marginalia.crawling.model;
-
-
-public record ContentType(String contentType, String charset) {
-}
build.gradle
@@ -41,6 +41,7 @@ dependencies {
     implementation project(':code:features-convert:anchor-keywords')
     implementation project(':code:features-crawl:crawl-blocklist')
     implementation project(':code:features-crawl:link-parser')
+    implementation project(':code:features-crawl:content-type')
 
     implementation libs.bundles.slf4j
 
HttpFetcherImpl.java
@@ -5,17 +5,17 @@ import com.google.inject.name.Named;
 import crawlercommons.robots.SimpleRobotRules;
 import crawlercommons.robots.SimpleRobotRulesParser;
 import lombok.SneakyThrows;
+import nu.marginalia.contenttype.DocumentBodyToString;
 import nu.marginalia.crawl.retreival.Cookies;
 import nu.marginalia.crawl.retreival.RateLimitException;
 import nu.marginalia.crawling.model.CrawledDocument;
 import nu.marginalia.crawling.model.CrawlerDocumentStatus;
-import nu.marginalia.crawling.model.ContentType;
+import nu.marginalia.contenttype.ContentType;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.crawl.retreival.logic.ContentTypeLogic;
-import nu.marginalia.crawl.retreival.logic.ContentTypeParser;
+import nu.marginalia.contenttype.ContentTypeParser;
 import okhttp3.*;
 import org.apache.commons.collections4.queue.PredicatedQueue;
 import org.apache.commons.io.input.BOMInputStream;
 import org.apache.commons.lang3.StringUtils;
 import org.slf4j.Logger;
@@ -281,7 +281,7 @@ public class HttpFetcherImpl implements HttpFetcher {
 
             byte[] data = byteStream.readNBytes(maxFetchSize);
 
-            var contentType = ContentTypeParser.parse(contentTypeHeader, data);
+            var contentType = ContentTypeParser.parseContentType(contentTypeHeader, data);
             if (!contentTypeLogic.isAllowableContentType(contentType.contentType())) {
                 return createErrorResponse(url, rsp, CrawlerDocumentStatus.BAD_CONTENT_TYPE, "");
             }
@@ -301,7 +301,8 @@ public class HttpFetcherImpl implements HttpFetcher {
                     .build();
             }
 
-            var strData = getStringData(data, contentType);
+            var strData = DocumentBodyToString.getStringData(contentType, data);
+
             var canonical = rsp.header("rel=canonical", "");
 
             return CrawledDocument.builder()
@@ -363,24 +364,6 @@ public class HttpFetcherImpl implements HttpFetcher {
         return isPermittedGeneral;
     }
 
-    private String getStringData(byte[] data, ContentType contentType) {
-        Charset charset;
-        try {
-            charset = Charset.forName(contentType.charset());
-        }
-        catch (IllegalCharsetNameException ex) {
-            charset = StandardCharsets.UTF_8;
-        }
-        catch (UnsupportedCharsetException ex) {
-            // This is usually like Macintosh Latin
-            // (https://en.wikipedia.org/wiki/Macintosh_Latin_encoding)
-            //
-            // It's close enough to 8859-1 to serve
-            charset = StandardCharsets.ISO_8859_1;
-        }
-        return new String(data, charset);
-    }
-
     private CrawledDocument createRedirectResponse(EdgeUrl url, Response rsp, EdgeUrl responseUrl) {
 
         return CrawledDocument.builder()
settings.gradle
@@ -40,6 +40,7 @@ include 'code:features-convert:topic-detection'
 
 include 'code:features-crawl:crawl-blocklist'
 include 'code:features-crawl:link-parser'
+include 'code:features-crawl:content-type'
 
 include 'code:features-index:index-journal'
 include 'code:features-index:index-query'