Clean up feature extraction, fix misidentification of 'application/ld+json' as javascript.
This commit is contained in:
parent
6e2fdb7a77
commit
6fc72b3eb8
@ -5,6 +5,8 @@ import com.google.inject.Singleton;
|
|||||||
import nu.marginalia.wmsa.edge.converting.processor.logic.topic.AdblockSimulator;
|
import nu.marginalia.wmsa.edge.converting.processor.logic.topic.AdblockSimulator;
|
||||||
import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain;
|
import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain;
|
||||||
import org.jsoup.nodes.Document;
|
import org.jsoup.nodes.Document;
|
||||||
|
import org.jsoup.nodes.Element;
|
||||||
|
import org.jsoup.select.Elements;
|
||||||
|
|
||||||
import java.util.HashSet;
|
import java.util.HashSet;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
@ -41,14 +43,17 @@ public class FeatureExtractor {
|
|||||||
}
|
}
|
||||||
|
|
||||||
public Set<HtmlFeature> getFeatures(CrawledDomain domain, Document doc) {
|
public Set<HtmlFeature> getFeatures(CrawledDomain domain, Document doc) {
|
||||||
Set<HtmlFeature> features = new HashSet<>();
|
final Set<HtmlFeature> features = new HashSet<>();
|
||||||
|
|
||||||
var scriptTags = doc.getElementsByTag("script");
|
final Elements scriptTags = doc.getElementsByTag("script");
|
||||||
|
|
||||||
if (scriptTags.size() > 0) {
|
for (var scriptTag : scriptTags) {
|
||||||
features.add(HtmlFeature.JS);
|
if (isJavascriptTag(scriptTag)) {
|
||||||
|
features.add(HtmlFeature.JS);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
else if(adblockSimulator.hasAds(doc.clone())) { // Only look for ads if there is javascript
|
|
||||||
|
if (features.contains(HtmlFeature.JS) && adblockSimulator.hasAds(doc.clone())) {
|
||||||
features.add(HtmlFeature.ADVERTISEMENT);
|
features.add(HtmlFeature.ADVERTISEMENT);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -58,20 +63,22 @@ public class FeatureExtractor {
|
|||||||
features.add(HtmlFeature.MEDIA);
|
features.add(HtmlFeature.MEDIA);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (scriptTags.stream()
|
for (var scriptTag : scriptTags) {
|
||||||
.anyMatch(tag -> trackers.stream().anyMatch(tracker -> tag.attr("src").contains(tracker)))) {
|
if (hasTrackingScript(scriptTag)) {
|
||||||
features.add(HtmlFeature.TRACKING);
|
features.add(HtmlFeature.TRACKING);
|
||||||
|
break;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (scriptTags.html().contains("google-analytics.com")) {
|
if (scriptTags.html().contains("google-analytics.com")) {
|
||||||
features.add(HtmlFeature.TRACKING);
|
features.add(HtmlFeature.TRACKING);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (doc.getElementsByTag("a").stream().map(e -> e.attr("href"))
|
for (var aTag : doc.getElementsByTag("a")) {
|
||||||
.map(String::toLowerCase)
|
if (isAmazonAffiliateLink(aTag)) {
|
||||||
.anyMatch(href ->
|
features.add(HtmlFeature.AFFILIATE_LINK);
|
||||||
href.contains("amzn.to/") || (href.contains("amazon.com/") & href.contains("tag=")))) {
|
break;
|
||||||
features.add(HtmlFeature.AFFILIATE_LINK);
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!domain.cookies.isEmpty()) {
|
if (!domain.cookies.isEmpty()) {
|
||||||
@ -80,4 +87,34 @@ public class FeatureExtractor {
|
|||||||
|
|
||||||
return features;
|
return features;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private boolean hasTrackingScript(Element scriptTag) {
|
||||||
|
for (var tracker : trackers) {
|
||||||
|
if (scriptTag.attr("src").contains(tracker)) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
private boolean isJavascriptTag(Element scriptTag) {
|
||||||
|
final String type = scriptTag.attr("type");
|
||||||
|
|
||||||
|
if ("application/ld+json".equalsIgnoreCase(type)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
boolean isAmazonAffiliateLink(Element aTag) {
|
||||||
|
final String href = aTag.attr("href").toLowerCase();
|
||||||
|
|
||||||
|
if (href.contains("amzn.to/"))
|
||||||
|
return true;
|
||||||
|
if (href.contains("amazon.com/") && href.contains("tag="))
|
||||||
|
return true;
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user