(converter) Fix issue where quality limits were no longer enforced

This commit is contained in:
Viktor Lofgren 2024-01-23 11:42:17 +01:00
parent f15dd06473
commit 3fff7f6878
2 changed files with 15 additions and 14 deletions

View File

@ -122,9 +122,6 @@ public class DocumentValuator {
if (quality + adjustment > 0) { if (quality + adjustment > 0) {
return 0; return 0;
} }
if (quality + adjustment < -15) {
return -15;
}
return quality + adjustment; return quality + adjustment;
} }

View File

@ -63,6 +63,8 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
private final ThreadLocalSentenceExtractorProvider sentenceExtractorProvider; private final ThreadLocalSentenceExtractorProvider sentenceExtractorProvider;
private final HtmlProcessorSpecializations htmlProcessorSpecializations; private final HtmlProcessorSpecializations htmlProcessorSpecializations;
private static final int MAX_DOCUMENT_LENGTH_BYTES = Integer.getInteger("converter.max-body-length",128_000);
@Inject @Inject
public HtmlDocumentProcessorPlugin( public HtmlDocumentProcessorPlugin(
@Named("min-document-quality") Double minDocumentQuality, @Named("min-document-quality") Double minDocumentQuality,
@ -108,8 +110,8 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
throw new DisqualifiedException(DisqualificationReason.LANGUAGE); throw new DisqualifiedException(DisqualificationReason.LANGUAGE);
} }
if (documentBody.length() > 128_000) { // 128kb if (documentBody.length() > MAX_DOCUMENT_LENGTH_BYTES) { // 128kb
documentBody = documentBody.substring(0, 128_000); documentBody = documentBody.substring(0, MAX_DOCUMENT_LENGTH_BYTES);
} }
Document doc = Jsoup.parse(documentBody); Document doc = Jsoup.parse(documentBody);
@ -143,19 +145,18 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
ret.standard = standard; ret.standard = standard;
ret.title = titleExtractor.getTitleAbbreviated(doc, dld, crawledDocument.url); ret.title = titleExtractor.getTitleAbbreviated(doc, dld, crawledDocument.url);
// don't move this up! it uses title and quality
// and is run before the heavy computations below
documentLengthLogic.validateLength(dld, specialization.lengthModifier() * documentClass.lengthLimitModifier()); documentLengthLogic.validateLength(dld, specialization.lengthModifier() * documentClass.lengthLimitModifier());
if (isDisqualified(documentClass, url, ret)) {
throw new DisqualifiedException(DisqualificationReason.QUALITY);
}
final Set<HtmlFeature> features = featureExtractor.getFeatures(url, doc, dld); final Set<HtmlFeature> features = featureExtractor.getFeatures(url, doc, dld);
ret.features = features; ret.features = features;
ret.quality = documentValuator.adjustQuality(quality, features); ret.quality = documentValuator.adjustQuality(quality, features);
ret.hashCode = dld.localitySensitiveHashCode(); ret.hashCode = dld.localitySensitiveHashCode();
if (isDisqualified(documentClass, url, quality, ret.title)) {
throw new DisqualifiedException(DisqualificationReason.QUALITY);
}
PubDate pubDate = pubDateSniffer.getPubDate(crawledDocument.headers, url, doc, standard, true); PubDate pubDate = pubDateSniffer.getPubDate(crawledDocument.headers, url, doc, standard, true);
EnumSet<DocumentFlags> documentFlags = documentFlags(features, generatorParts.type()); EnumSet<DocumentFlags> documentFlags = documentFlags(features, generatorParts.type());
@ -211,16 +212,19 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
private static final GuardedRegex mastodonFeedRegex = GuardedRegexFactory.startsWith("/@", "^/@[^/]+/?$"); private static final GuardedRegex mastodonFeedRegex = GuardedRegexFactory.startsWith("/@", "^/@[^/]+/?$");
private boolean isDisqualified(DocumentClass documentClass, EdgeUrl url, ProcessedDocumentDetails ret) { private boolean isDisqualified(DocumentClass documentClass,
EdgeUrl url,
double quality,
String title) {
if (documentClass.enforceQualityLimits() if (documentClass.enforceQualityLimits()
&& ret.quality < minDocumentQuality) && quality < minDocumentQuality)
{ {
return true; return true;
} }
// These pages shouldn't be publicly accessible // These pages shouldn't be publicly accessible
if ("phpinfo()".equals(ret.title)) { if ("phpinfo()".equals(title)) {
return true; return true;
} }