(executor) Refine atag export logic

* Remove obviously uninteresting tags
* Omit the URL scheme (http/https) for more sensible sorting
* Change the column order to put the source domain last
This commit is contained in:
Viktor Lofgren 2023-11-01 13:23:14 +01:00
parent c77a5b7cb6
commit a8b9d21f2d
2 changed files with 74 additions and 11 deletions

View File

@ -57,6 +57,7 @@ dependencies {
implementation libs.zstd implementation libs.zstd
implementation libs.jsoup implementation libs.jsoup
implementation libs.commons.io implementation libs.commons.io
implementation libs.commons.lang3
implementation libs.bundles.mariadb implementation libs.bundles.mariadb
testImplementation libs.bundles.slf4j.test testImplementation libs.bundles.slf4j.test

View File

@ -10,6 +10,7 @@ import nu.marginalia.link_parser.LinkParser;
import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeDomain;
import nu.marginalia.service.control.ServiceHeartbeat; import nu.marginalia.service.control.ServiceHeartbeat;
import nu.marginalia.storage.model.*; import nu.marginalia.storage.model.*;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup; import org.jsoup.Jsoup;
import nu.marginalia.actor.prototype.RecordActorPrototype; import nu.marginalia.actor.prototype.RecordActorPrototype;
import nu.marginalia.actor.state.ActorStep; import nu.marginalia.actor.state.ActorStep;
@ -106,7 +107,7 @@ public class ExportAtagsActor extends RecordActorPrototype {
} }
private boolean exportLinks(ATagCsvWriter exporter, SerializableCrawlDataStream stream) throws IOException, URISyntaxException { private boolean exportLinks(ATagCsvWriter exporter, SerializableCrawlDataStream stream) throws IOException, URISyntaxException {
final TLongHashSet hashes = new TLongHashSet(); ATagLinkFilter linkFilter = new ATagLinkFilter();
while (stream.hasNext()) { while (stream.hasNext()) {
if (!(stream.next() instanceof CrawledDocument doc)) if (!(stream.next() instanceof CrawledDocument doc))
@ -119,13 +120,14 @@ public class ExportAtagsActor extends RecordActorPrototype {
for (var atag : parsed.getElementsByTag("a")) { for (var atag : parsed.getElementsByTag("a")) {
String linkText = atag.text(); String linkText = atag.text();
if (linkText.isBlank())
if (!linkFilter.isLinkTextEligible(linkText)) {
continue; continue;
}
var linkOpt = linkParser.parseLinkPermissive(baseUrl, atag); var linkOpt = linkParser.parseLinkPermissive(baseUrl, atag);
linkOpt linkOpt
.filter(url -> !Objects.equals(url.domain, baseUrl.domain)) .filter(url -> linkFilter.isEligible(url, baseUrl, linkText))
.filter(url -> hashes.add(hash.hashNearlyASCII(linkText) ^ hash.hashNearlyASCII(url.toString())))
.ifPresent(url -> exporter.accept(url, baseUrl.domain, linkText)); .ifPresent(url -> exporter.accept(url, baseUrl.domain, linkText));
} }
} }
@ -133,6 +135,55 @@ public class ExportAtagsActor extends RecordActorPrototype {
return true; return true;
} }
/**
 * Decides which anchor tags are worth exporting and suppresses repeated
 * (link text, URL) pairs.
 * <p>
 * The filter is stateful: hashes of the pairs that have been accepted are kept
 * in {@link #hashes} for as long as the instance lives, so one instance should
 * be used per deduplication scope.
 */
private static class ATagLinkFilter {
    /** Hashes of the (link text, URL) pairs that have already been accepted. */
    private final TLongHashSet hashes = new TLongHashSet();

    /**
     * Cheap pre-check on the anchor text alone, meant to run before the link is parsed.
     *
     * @param linkText the text content of the anchor tag
     * @return {@code false} if the text is blank, starts with "this", is "here" or
     *         "click here" (all ignoring case), or is not printable ASCII
     */
    private boolean isLinkTextEligible(String linkText) {
        // Filter out the most obviously uninteresting anchor texts
        if (linkText.isBlank())
            return false;
        // Ignore case like the checks below, so "This page" is caught as well as "this page"
        if (linkText.regionMatches(true, 0, "this", 0, 4))
            return false;
        if (linkText.equalsIgnoreCase("here"))
            return false;
        if (linkText.equalsIgnoreCase("click here"))
            return false;

        if (!StringUtils.isAsciiPrintable(linkText)) // This also filters out newlines, a good thing!
            return false;

        return true;
    }

    /**
     * Full eligibility check for a parsed link.
     * <p>
     * As a side effect, an accepted (link text, URL) pair is recorded so that the
     * same pair is rejected if it is seen again. Links rejected by the earlier
     * checks are never recorded.
     *
     * @param url      the link target
     * @param baseUrl  the URL of the document the link was found in
     * @param linkText the anchor text; expected to have passed {@link #isLinkTextEligible}
     * @return {@code true} if the link should be exported
     */
    private boolean isEligible(EdgeUrl url, EdgeUrl baseUrl, String linkText) {
        if (!"http".equals(url.proto) && !"https".equals(url.proto))
            return false;

        // This is an artifact of the link parser typically
        if ("example.com".equals(url.domain.domain))
            return false;

        // Anchor texts that merely spell out the target domain carry no extra information
        if (linkText.contains(url.domain.toString()))
            return false;

        // Only links that leave the source domain are of interest
        if (Objects.equals(url.domain, baseUrl.domain))
            return false;

        String urlString = url.toString();
        if (!StringUtils.isAsciiPrintable(urlString)) { // This also filters out newlines, a good thing!
            return false;
        }

        // Deduplicate by hash; we've already checked that the strings are ASCII printable so we don't
        // need to be concerned about using the fast ASCII hash.
        //
        // add() returns true when the value was NOT yet in the set, i.e. on the first
        // occurrence, so the link is rejected only when add() reports false.
        if (!hashes.add(hash.hashLowerBytes(linkText) ^ hash.hashLowerBytes(urlString))) {
            return false;
        }

        return true;
    }
}
private static class ATagCsvWriter { private static class ATagCsvWriter {
private final BufferedWriter writer; private final BufferedWriter writer;
@ -141,17 +192,28 @@ public class ExportAtagsActor extends RecordActorPrototype {
} }
@SneakyThrows @SneakyThrows
public void accept(EdgeUrl url, EdgeDomain domain, String linkText) { public void accept(EdgeUrl url, EdgeDomain sourceDomain, String linkText) {
final String urlString = urlWithNoSchema(url);
writer.write(String.format("\"%s\",\"%s\",\"%s\"\n", writer.write(String.format("\"%s\",\"%s\",\"%s\"\n",
csvify(url), csvify(urlString),
csvify(domain), csvify(linkText),
csvify(linkText))); csvify(sourceDomain)));
}
private static String urlWithNoSchema(EdgeUrl url) {
StringBuilder sb = new StringBuilder();
sb.append(url.domain).append(url.path);
if (url.param != null)
sb.append('?').append(url.param);
return sb.toString();
} }
private static String csvify(Object field) { private static String csvify(Object field) {
return field.toString() return field.toString().replace("\"", "\"\"");
.replace("\"", "\"\"")
.replace("\n", " ");
} }
} }