(converter) Fix NullPointerExceptions caused by the new data format
This commit is contained in:
parent
c488599879
commit
407915a86e
@ -8,6 +8,9 @@ import org.apache.commons.lang3.StringUtils;
|
||||
*/
|
||||
public record ContentType(String contentType, String charset) {
|
||||
public static ContentType parse(String contentTypeHeader) {
|
||||
if (contentTypeHeader == null || contentTypeHeader.isBlank())
|
||||
return new ContentType(null, null);
|
||||
|
||||
String[] parts = StringUtils.split(contentTypeHeader, ";", 2);
|
||||
String contentType = parts[0].trim();
|
||||
String charset = parts.length > 1 ? parts[1].trim() : "UTF-8";
|
||||
|
@ -8,7 +8,11 @@ public class DocumentBodyToString {
|
||||
public static String getStringData(ContentType type, byte[] data) {
|
||||
Charset charset;
|
||||
try {
|
||||
charset = Charset.forName(type.charset());
|
||||
if (type.charset() == null || type.charset().isBlank())
|
||||
charset = StandardCharsets.UTF_8;
|
||||
else {
|
||||
charset = Charset.forName(type.charset());
|
||||
}
|
||||
}
|
||||
catch (IllegalCharsetNameException ex) {
|
||||
// Fall back to UTF-8 if the charset name is not understood. This is a best-effort guess; it is probably fine, but not guaranteed.
|
||||
|
@ -100,7 +100,7 @@ public class ParquetSerializableCrawlDataStream implements AutoCloseable, Serial
|
||||
else if (nextRecord.contentType.startsWith("x-marginalia/advisory")) { // other advisory stuff we don't want
|
||||
return;
|
||||
}
|
||||
else {
|
||||
else if (nextRecord.body != null) {
|
||||
try {
|
||||
bodyString = DocumentBodyToString.getStringData(
|
||||
ContentType.parse(nextRecord.contentType),
|
||||
@ -110,6 +110,9 @@ public class ParquetSerializableCrawlDataStream implements AutoCloseable, Serial
|
||||
status = CrawlerDocumentStatus.BAD_CHARSET;
|
||||
}
|
||||
}
|
||||
else {
|
||||
status = CrawlerDocumentStatus.ERROR;
|
||||
}
|
||||
|
||||
nextQ.add(new CrawledDocument("",
|
||||
nextRecord.url,
|
||||
|
Loading…
Reference in New Issue
Block a user