(converter) Fix issue where quality limits were no longer enforced
This commit is contained in:
parent
f15dd06473
commit
3fff7f6878
@ -122,9 +122,6 @@ public class DocumentValuator {
|
|||||||
if (quality + adjustment > 0) {
|
if (quality + adjustment > 0) {
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
if (quality + adjustment < -15) {
|
|
||||||
return -15;
|
|
||||||
}
|
|
||||||
|
|
||||||
return quality + adjustment;
|
return quality + adjustment;
|
||||||
}
|
}
|
||||||
|
@ -63,6 +63,8 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
|
|||||||
private final ThreadLocalSentenceExtractorProvider sentenceExtractorProvider;
|
private final ThreadLocalSentenceExtractorProvider sentenceExtractorProvider;
|
||||||
private final HtmlProcessorSpecializations htmlProcessorSpecializations;
|
private final HtmlProcessorSpecializations htmlProcessorSpecializations;
|
||||||
|
|
||||||
|
// Upper bound used to truncate the document body before it is handed to the HTML parser.
// Read once, when the class is initialized, from the "converter.max-body-length" system
// property; falls back to 128_000 when the property is unset or is not a valid integer.
// NOTE(review): the limit is compared against documentBody.length() and applied with
// substring(), so it appears to count chars (UTF-16 code units) rather than bytes,
// despite the _BYTES suffix — confirm the intended unit.
private static final int MAX_DOCUMENT_LENGTH_BYTES = Integer.getInteger("converter.max-body-length",128_000);
|
||||||
|
|
||||||
@Inject
|
@Inject
|
||||||
public HtmlDocumentProcessorPlugin(
|
public HtmlDocumentProcessorPlugin(
|
||||||
@Named("min-document-quality") Double minDocumentQuality,
|
@Named("min-document-quality") Double minDocumentQuality,
|
||||||
@ -108,8 +110,8 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
|
|||||||
throw new DisqualifiedException(DisqualificationReason.LANGUAGE);
|
throw new DisqualifiedException(DisqualificationReason.LANGUAGE);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (documentBody.length() > 128_000) { // 128kb
|
if (documentBody.length() > MAX_DOCUMENT_LENGTH_BYTES) { // 128kb
|
||||||
documentBody = documentBody.substring(0, 128_000);
|
documentBody = documentBody.substring(0, MAX_DOCUMENT_LENGTH_BYTES);
|
||||||
}
|
}
|
||||||
|
|
||||||
Document doc = Jsoup.parse(documentBody);
|
Document doc = Jsoup.parse(documentBody);
|
||||||
@ -143,19 +145,18 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
|
|||||||
ret.standard = standard;
|
ret.standard = standard;
|
||||||
ret.title = titleExtractor.getTitleAbbreviated(doc, dld, crawledDocument.url);
|
ret.title = titleExtractor.getTitleAbbreviated(doc, dld, crawledDocument.url);
|
||||||
|
|
||||||
// don't move this up! it uses title and quality
|
|
||||||
// and is run before the heavy computations below
|
|
||||||
documentLengthLogic.validateLength(dld, specialization.lengthModifier() * documentClass.lengthLimitModifier());
|
documentLengthLogic.validateLength(dld, specialization.lengthModifier() * documentClass.lengthLimitModifier());
|
||||||
|
|
||||||
if (isDisqualified(documentClass, url, ret)) {
|
|
||||||
throw new DisqualifiedException(DisqualificationReason.QUALITY);
|
|
||||||
}
|
|
||||||
|
|
||||||
final Set<HtmlFeature> features = featureExtractor.getFeatures(url, doc, dld);
|
final Set<HtmlFeature> features = featureExtractor.getFeatures(url, doc, dld);
|
||||||
|
|
||||||
ret.features = features;
|
ret.features = features;
|
||||||
ret.quality = documentValuator.adjustQuality(quality, features);
|
ret.quality = documentValuator.adjustQuality(quality, features);
|
||||||
ret.hashCode = dld.localitySensitiveHashCode();
|
ret.hashCode = dld.localitySensitiveHashCode();
|
||||||
|
|
||||||
|
if (isDisqualified(documentClass, url, quality, ret.title)) {
|
||||||
|
throw new DisqualifiedException(DisqualificationReason.QUALITY);
|
||||||
|
}
|
||||||
|
|
||||||
PubDate pubDate = pubDateSniffer.getPubDate(crawledDocument.headers, url, doc, standard, true);
|
PubDate pubDate = pubDateSniffer.getPubDate(crawledDocument.headers, url, doc, standard, true);
|
||||||
|
|
||||||
EnumSet<DocumentFlags> documentFlags = documentFlags(features, generatorParts.type());
|
EnumSet<DocumentFlags> documentFlags = documentFlags(features, generatorParts.type());
|
||||||
@ -211,16 +212,19 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
|
|||||||
|
|
||||||
private static final GuardedRegex mastodonFeedRegex = GuardedRegexFactory.startsWith("/@", "^/@[^/]+/?$");
|
private static final GuardedRegex mastodonFeedRegex = GuardedRegexFactory.startsWith("/@", "^/@[^/]+/?$");
|
||||||
|
|
||||||
private boolean isDisqualified(DocumentClass documentClass, EdgeUrl url, ProcessedDocumentDetails ret) {
|
private boolean isDisqualified(DocumentClass documentClass,
|
||||||
|
EdgeUrl url,
|
||||||
|
double quality,
|
||||||
|
String title) {
|
||||||
|
|
||||||
if (documentClass.enforceQualityLimits()
|
if (documentClass.enforceQualityLimits()
|
||||||
&& ret.quality < minDocumentQuality)
|
&& quality < minDocumentQuality)
|
||||||
{
|
{
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
// These pages shouldn't be publicly accessible
|
// These pages shouldn't be publicly accessible
|
||||||
if ("phpinfo()".equals(ret.title)) {
|
if ("phpinfo()".equals(title)) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user