Clean up feature extraction, fix misidentification of 'application/ld+json' as javascript.

This commit is contained in:
vlofgren 2022-08-23 00:48:48 +02:00
parent 6e2fdb7a77
commit 6fc72b3eb8

View File

@ -5,6 +5,8 @@ import com.google.inject.Singleton;
import nu.marginalia.wmsa.edge.converting.processor.logic.topic.AdblockSimulator; import nu.marginalia.wmsa.edge.converting.processor.logic.topic.AdblockSimulator;
import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain; import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain;
import org.jsoup.nodes.Document; import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.util.HashSet; import java.util.HashSet;
import java.util.List; import java.util.List;
@ -41,14 +43,17 @@ public class FeatureExtractor {
} }
public Set<HtmlFeature> getFeatures(CrawledDomain domain, Document doc) { public Set<HtmlFeature> getFeatures(CrawledDomain domain, Document doc) {
Set<HtmlFeature> features = new HashSet<>(); final Set<HtmlFeature> features = new HashSet<>();
var scriptTags = doc.getElementsByTag("script"); final Elements scriptTags = doc.getElementsByTag("script");
if (scriptTags.size() > 0) { for (var scriptTag : scriptTags) {
features.add(HtmlFeature.JS); if (isJavascriptTag(scriptTag)) {
features.add(HtmlFeature.JS);
}
} }
else if(adblockSimulator.hasAds(doc.clone())) { // Only look for ads if there is javascript
if (features.contains(HtmlFeature.JS) && adblockSimulator.hasAds(doc.clone())) {
features.add(HtmlFeature.ADVERTISEMENT); features.add(HtmlFeature.ADVERTISEMENT);
} }
@ -58,20 +63,22 @@ public class FeatureExtractor {
features.add(HtmlFeature.MEDIA); features.add(HtmlFeature.MEDIA);
} }
if (scriptTags.stream() for (var scriptTag : scriptTags) {
.anyMatch(tag -> trackers.stream().anyMatch(tracker -> tag.attr("src").contains(tracker)))) { if (hasTrackingScript(scriptTag)) {
features.add(HtmlFeature.TRACKING); features.add(HtmlFeature.TRACKING);
break;
}
} }
if (scriptTags.html().contains("google-analytics.com")) { if (scriptTags.html().contains("google-analytics.com")) {
features.add(HtmlFeature.TRACKING); features.add(HtmlFeature.TRACKING);
} }
if (doc.getElementsByTag("a").stream().map(e -> e.attr("href")) for (var aTag : doc.getElementsByTag("a")) {
.map(String::toLowerCase) if (isAmazonAffiliateLink(aTag)) {
.anyMatch(href -> features.add(HtmlFeature.AFFILIATE_LINK);
href.contains("amzn.to/") || (href.contains("amazon.com/") & href.contains("tag=")))) { break;
features.add(HtmlFeature.AFFILIATE_LINK); }
} }
if (!domain.cookies.isEmpty()) { if (!domain.cookies.isEmpty()) {
@ -80,4 +87,34 @@ public class FeatureExtractor {
return features; return features;
} }
/**
 * Checks whether the {@code src} attribute of a script tag contains any of
 * the strings held in {@code trackers}.
 *
 * @param scriptTag the script element whose {@code src} attribute is examined
 * @return {@code true} if at least one tracker string occurs in the
 *         {@code src} value, {@code false} otherwise
 */
private boolean hasTrackingScript(Element scriptTag) {
    // Plain substring match; the first hit short-circuits the search.
    return trackers.stream()
            .anyMatch(tracker -> scriptTag.attr("src").contains(tracker));
}
/**
 * Decides whether a {@code <script>} element should count as javascript.
 *
 * <p>Every script tag is treated as javascript unless its {@code type}
 * attribute names the JSON-LD data format ({@code application/ld+json}).
 * The comparison ignores case, surrounding whitespace and any MIME
 * parameters, so values such as {@code " application/ld+json "} or
 * {@code "application/ld+json; charset=utf-8"} are recognised as JSON-LD
 * as well.
 *
 * @param scriptTag the script element to classify
 * @return {@code false} if the tag is declared as JSON-LD, {@code true}
 *         in every other case (including a missing or unknown type)
 */
private boolean isJavascriptTag(Element scriptTag) {
    String type = scriptTag.attr("type");

    // Keep only the MIME essence: drop parameters like "; charset=utf-8".
    final int parametersStart = type.indexOf(';');
    if (parametersStart >= 0) {
        type = type.substring(0, parametersStart);
    }

    // Browsers strip surrounding whitespace from the type before
    // interpreting it, so the raw attribute value must not be compared as-is.
    return !"application/ld+json".equalsIgnoreCase(type.trim());
}
/**
 * Tells whether an anchor tag looks like an Amazon affiliate link.
 *
 * <p>The lower-cased {@code href} matches when it either contains the
 * short-link host {@code amzn.to/}, or contains both {@code amazon.com/}
 * and a {@code tag=} fragment.
 *
 * @param aTag the anchor element whose {@code href} attribute is examined
 * @return {@code true} if the link matches one of the affiliate patterns
 */
boolean isAmazonAffiliateLink(Element aTag) {
    final String link = aTag.attr("href").toLowerCase();

    return link.contains("amzn.to/")
            || (link.contains("amazon.com/") && link.contains("tag="));
}
} }