(converter) Fix issue where quality limits were no longer enforced

This commit is contained in:
Viktor Lofgren 2024-01-23 11:42:17 +01:00
parent f15dd06473
commit 3fff7f6878
2 changed files with 15 additions and 14 deletions

View File

@@ -122,9 +122,6 @@ public class DocumentValuator {
if (quality + adjustment > 0) {
return 0;
}
if (quality + adjustment < -15) {
return -15;
}
return quality + adjustment;
}

View File

@@ -63,6 +63,8 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
private final ThreadLocalSentenceExtractorProvider sentenceExtractorProvider;
private final HtmlProcessorSpecializations htmlProcessorSpecializations;
private static final int MAX_DOCUMENT_LENGTH_BYTES = Integer.getInteger("converter.max-body-length",128_000);
@Inject
public HtmlDocumentProcessorPlugin(
@Named("min-document-quality") Double minDocumentQuality,
@@ -108,8 +110,8 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
throw new DisqualifiedException(DisqualificationReason.LANGUAGE);
}
if (documentBody.length() > 128_000) { // 128kb
documentBody = documentBody.substring(0, 128_000);
if (documentBody.length() > MAX_DOCUMENT_LENGTH_BYTES) { // 128kb
documentBody = documentBody.substring(0, MAX_DOCUMENT_LENGTH_BYTES);
}
Document doc = Jsoup.parse(documentBody);
@@ -143,19 +145,18 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
ret.standard = standard;
ret.title = titleExtractor.getTitleAbbreviated(doc, dld, crawledDocument.url);
// don't move this up! it uses title and quality
// and is run before the heavy computations below
documentLengthLogic.validateLength(dld, specialization.lengthModifier() * documentClass.lengthLimitModifier());
if (isDisqualified(documentClass, url, ret)) {
throw new DisqualifiedException(DisqualificationReason.QUALITY);
}
final Set<HtmlFeature> features = featureExtractor.getFeatures(url, doc, dld);
ret.features = features;
ret.quality = documentValuator.adjustQuality(quality, features);
ret.hashCode = dld.localitySensitiveHashCode();
if (isDisqualified(documentClass, url, quality, ret.title)) {
throw new DisqualifiedException(DisqualificationReason.QUALITY);
}
PubDate pubDate = pubDateSniffer.getPubDate(crawledDocument.headers, url, doc, standard, true);
EnumSet<DocumentFlags> documentFlags = documentFlags(features, generatorParts.type());
@@ -211,16 +212,19 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
private static final GuardedRegex mastodonFeedRegex = GuardedRegexFactory.startsWith("/@", "^/@[^/]+/?$");
private boolean isDisqualified(DocumentClass documentClass, EdgeUrl url, ProcessedDocumentDetails ret) {
private boolean isDisqualified(DocumentClass documentClass,
EdgeUrl url,
double quality,
String title) {
if (documentClass.enforceQualityLimits()
&& ret.quality < minDocumentQuality)
&& quality < minDocumentQuality)
{
return true;
}
// These pages shouldn't be publicly accessible
if ("phpinfo()".equals(ret.title)) {
if ("phpinfo()".equals(title)) {
return true;
}