(converter) Fix NPEs in converter due to the new data format

This commit is contained in:
Viktor Lofgren 2023-12-28 19:52:26 +01:00
parent c488599879
commit 407915a86e
3 changed files with 12 additions and 2 deletions

View File

@ -8,6 +8,9 @@ import org.apache.commons.lang3.StringUtils;
*/
public record ContentType(String contentType, String charset) {
public static ContentType parse(String contentTypeHeader) {
if (contentTypeHeader == null || contentTypeHeader.isBlank())
return new ContentType(null, null);
String[] parts = StringUtils.split(contentTypeHeader, ";", 2);
String contentType = parts[0].trim();
String charset = parts.length > 1 ? parts[1].trim() : "UTF-8";

View File

@ -8,8 +8,12 @@ public class DocumentBodyToString {
public static String getStringData(ContentType type, byte[] data) {
Charset charset;
try {
if (type.charset() == null || type.charset().isBlank())
charset = StandardCharsets.UTF_8;
else {
charset = Charset.forName(type.charset());
}
}
catch (IllegalCharsetNameException ex) {
// Fall back to UTF-8 if we don't understand what this is. It's *probably* fine? Maybe?
charset = StandardCharsets.UTF_8;

View File

@ -100,7 +100,7 @@ public class ParquetSerializableCrawlDataStream implements AutoCloseable, Serial
else if (nextRecord.contentType.startsWith("x-marginalia/advisory")) { // other advisory stuff we don't want
return;
}
else {
else if (nextRecord.body != null) {
try {
bodyString = DocumentBodyToString.getStringData(
ContentType.parse(nextRecord.contentType),
@ -110,6 +110,9 @@ public class ParquetSerializableCrawlDataStream implements AutoCloseable, Serial
status = CrawlerDocumentStatus.BAD_CHARSET;
}
}
else {
status = CrawlerDocumentStatus.ERROR;
}
nextQ.add(new CrawledDocument("",
nextRecord.url,