(converter) Fix NPEs in converter due to the new data format
This commit is contained in:
parent
c488599879
commit
407915a86e
@ -8,6 +8,9 @@ import org.apache.commons.lang3.StringUtils;
|
|||||||
*/
|
*/
|
||||||
public record ContentType(String contentType, String charset) {
|
public record ContentType(String contentType, String charset) {
|
||||||
public static ContentType parse(String contentTypeHeader) {
|
public static ContentType parse(String contentTypeHeader) {
|
||||||
|
if (contentTypeHeader == null || contentTypeHeader.isBlank())
|
||||||
|
return new ContentType(null, null);
|
||||||
|
|
||||||
String[] parts = StringUtils.split(contentTypeHeader, ";", 2);
|
String[] parts = StringUtils.split(contentTypeHeader, ";", 2);
|
||||||
String contentType = parts[0].trim();
|
String contentType = parts[0].trim();
|
||||||
String charset = parts.length > 1 ? parts[1].trim() : "UTF-8";
|
String charset = parts.length > 1 ? parts[1].trim() : "UTF-8";
|
||||||
|
@ -8,8 +8,12 @@ public class DocumentBodyToString {
|
|||||||
public static String getStringData(ContentType type, byte[] data) {
|
public static String getStringData(ContentType type, byte[] data) {
|
||||||
Charset charset;
|
Charset charset;
|
||||||
try {
|
try {
|
||||||
|
if (type.charset() == null || type.charset().isBlank())
|
||||||
|
charset = StandardCharsets.UTF_8;
|
||||||
|
else {
|
||||||
charset = Charset.forName(type.charset());
|
charset = Charset.forName(type.charset());
|
||||||
}
|
}
|
||||||
|
}
|
||||||
catch (IllegalCharsetNameException ex) {
|
catch (IllegalCharsetNameException ex) {
|
||||||
// Fall back to UTF-8 if we don't understand what this is. It's *probably* fine? Maybe?
|
// Fall back to UTF-8 if we don't understand what this is. It's *probably* fine? Maybe?
|
||||||
charset = StandardCharsets.UTF_8;
|
charset = StandardCharsets.UTF_8;
|
||||||
|
@ -100,7 +100,7 @@ public class ParquetSerializableCrawlDataStream implements AutoCloseable, Serial
|
|||||||
else if (nextRecord.contentType.startsWith("x-marginalia/advisory")) { // other advisory stuff we don't want
|
else if (nextRecord.contentType.startsWith("x-marginalia/advisory")) { // other advisory stuff we don't want
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
else {
|
else if (nextRecord.body != null) {
|
||||||
try {
|
try {
|
||||||
bodyString = DocumentBodyToString.getStringData(
|
bodyString = DocumentBodyToString.getStringData(
|
||||||
ContentType.parse(nextRecord.contentType),
|
ContentType.parse(nextRecord.contentType),
|
||||||
@ -110,6 +110,9 @@ public class ParquetSerializableCrawlDataStream implements AutoCloseable, Serial
|
|||||||
status = CrawlerDocumentStatus.BAD_CHARSET;
|
status = CrawlerDocumentStatus.BAD_CHARSET;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
else {
|
||||||
|
status = CrawlerDocumentStatus.ERROR;
|
||||||
|
}
|
||||||
|
|
||||||
nextQ.add(new CrawledDocument("",
|
nextQ.add(new CrawledDocument("",
|
||||||
nextRecord.url,
|
nextRecord.url,
|
||||||
|
Loading…
Reference in New Issue
Block a user