(executor) Refine atag export logic
* Remove obviously uninteresting tags
* Omit the URL scheme for more sensible sorting
* Change the column order to put the source domain last
This commit is contained in:
parent
c77a5b7cb6
commit
a8b9d21f2d
@ -57,6 +57,7 @@ dependencies {
|
|||||||
implementation libs.zstd
|
implementation libs.zstd
|
||||||
implementation libs.jsoup
|
implementation libs.jsoup
|
||||||
implementation libs.commons.io
|
implementation libs.commons.io
|
||||||
|
implementation libs.commons.lang3
|
||||||
implementation libs.bundles.mariadb
|
implementation libs.bundles.mariadb
|
||||||
|
|
||||||
testImplementation libs.bundles.slf4j.test
|
testImplementation libs.bundles.slf4j.test
|
||||||
|
@ -10,6 +10,7 @@ import nu.marginalia.link_parser.LinkParser;
|
|||||||
import nu.marginalia.model.EdgeDomain;
|
import nu.marginalia.model.EdgeDomain;
|
||||||
import nu.marginalia.service.control.ServiceHeartbeat;
|
import nu.marginalia.service.control.ServiceHeartbeat;
|
||||||
import nu.marginalia.storage.model.*;
|
import nu.marginalia.storage.model.*;
|
||||||
|
import org.apache.commons.lang3.StringUtils;
|
||||||
import org.jsoup.Jsoup;
|
import org.jsoup.Jsoup;
|
||||||
import nu.marginalia.actor.prototype.RecordActorPrototype;
|
import nu.marginalia.actor.prototype.RecordActorPrototype;
|
||||||
import nu.marginalia.actor.state.ActorStep;
|
import nu.marginalia.actor.state.ActorStep;
|
||||||
@ -106,7 +107,7 @@ public class ExportAtagsActor extends RecordActorPrototype {
|
|||||||
}
|
}
|
||||||
|
|
||||||
private boolean exportLinks(ATagCsvWriter exporter, SerializableCrawlDataStream stream) throws IOException, URISyntaxException {
|
private boolean exportLinks(ATagCsvWriter exporter, SerializableCrawlDataStream stream) throws IOException, URISyntaxException {
|
||||||
final TLongHashSet hashes = new TLongHashSet();
|
ATagLinkFilter linkFilter = new ATagLinkFilter();
|
||||||
|
|
||||||
while (stream.hasNext()) {
|
while (stream.hasNext()) {
|
||||||
if (!(stream.next() instanceof CrawledDocument doc))
|
if (!(stream.next() instanceof CrawledDocument doc))
|
||||||
@ -119,13 +120,14 @@ public class ExportAtagsActor extends RecordActorPrototype {
|
|||||||
|
|
||||||
for (var atag : parsed.getElementsByTag("a")) {
|
for (var atag : parsed.getElementsByTag("a")) {
|
||||||
String linkText = atag.text();
|
String linkText = atag.text();
|
||||||
if (linkText.isBlank())
|
|
||||||
|
if (!linkFilter.isLinkTextEligible(linkText)) {
|
||||||
continue;
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
var linkOpt = linkParser.parseLinkPermissive(baseUrl, atag);
|
var linkOpt = linkParser.parseLinkPermissive(baseUrl, atag);
|
||||||
linkOpt
|
linkOpt
|
||||||
.filter(url -> !Objects.equals(url.domain, baseUrl.domain))
|
.filter(url -> linkFilter.isEligible(url, baseUrl, linkText))
|
||||||
.filter(url -> hashes.add(hash.hashNearlyASCII(linkText) ^ hash.hashNearlyASCII(url.toString())))
|
|
||||||
.ifPresent(url -> exporter.accept(url, baseUrl.domain, linkText));
|
.ifPresent(url -> exporter.accept(url, baseUrl.domain, linkText));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -133,6 +135,55 @@ public class ExportAtagsActor extends RecordActorPrototype {
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private static class ATagLinkFilter {
|
||||||
|
private final TLongHashSet hashes = new TLongHashSet();
|
||||||
|
|
||||||
|
private boolean isLinkTextEligible(String linkText) {
|
||||||
|
// Filter out the most obviously uninteresting anchor texts
|
||||||
|
|
||||||
|
if (linkText.isBlank())
|
||||||
|
return false;
|
||||||
|
if (linkText.startsWith("this"))
|
||||||
|
return false;
|
||||||
|
if (linkText.equalsIgnoreCase("here"))
|
||||||
|
return false;
|
||||||
|
if (linkText.equalsIgnoreCase("click here"))
|
||||||
|
return false;
|
||||||
|
|
||||||
|
if (!StringUtils.isAsciiPrintable(linkText)) // This also filters out newlines, a good thing!
|
||||||
|
return false;
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
private boolean isEligible(EdgeUrl url, EdgeUrl baseUrl, String linkText) {
|
||||||
|
if (!"http".equals(url.proto) && !"https".equals(url.proto))
|
||||||
|
return false;
|
||||||
|
|
||||||
|
// This is an artifact of the link parser typically
|
||||||
|
if ("example.com".equals(url.domain.domain))
|
||||||
|
return false;
|
||||||
|
|
||||||
|
if (linkText.contains(url.domain.toString()))
|
||||||
|
return false;
|
||||||
|
if (Objects.equals(url.domain, baseUrl.domain))
|
||||||
|
return false;
|
||||||
|
|
||||||
|
String urlString = url.toString();
|
||||||
|
if (!StringUtils.isAsciiPrintable(urlString)) { // This also filters out newlines, a good thing!
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Deduplicate by hash; we've already checked that the strings are ASCII printable so we don't
|
||||||
|
// need to be concerned about using the fast ASCII hash
|
||||||
|
if (hashes.add(hash.hashLowerBytes(linkText) ^ hash.hashLowerBytes(urlString))) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
private static class ATagCsvWriter {
|
private static class ATagCsvWriter {
|
||||||
private final BufferedWriter writer;
|
private final BufferedWriter writer;
|
||||||
|
|
||||||
@ -141,17 +192,28 @@ public class ExportAtagsActor extends RecordActorPrototype {
|
|||||||
}
|
}
|
||||||
|
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
public void accept(EdgeUrl url, EdgeDomain domain, String linkText) {
|
public void accept(EdgeUrl url, EdgeDomain sourceDomain, String linkText) {
|
||||||
|
final String urlString = urlWithNoSchema(url);
|
||||||
|
|
||||||
writer.write(String.format("\"%s\",\"%s\",\"%s\"\n",
|
writer.write(String.format("\"%s\",\"%s\",\"%s\"\n",
|
||||||
csvify(url),
|
csvify(urlString),
|
||||||
csvify(domain),
|
csvify(linkText),
|
||||||
csvify(linkText)));
|
csvify(sourceDomain)));
|
||||||
|
}
|
||||||
|
|
||||||
|
private static String urlWithNoSchema(EdgeUrl url) {
|
||||||
|
StringBuilder sb = new StringBuilder();
|
||||||
|
|
||||||
|
sb.append(url.domain).append(url.path);
|
||||||
|
|
||||||
|
if (url.param != null)
|
||||||
|
sb.append('?').append(url.param);
|
||||||
|
|
||||||
|
return sb.toString();
|
||||||
}
|
}
|
||||||
|
|
||||||
private static String csvify(Object field) {
|
private static String csvify(Object field) {
|
||||||
return field.toString()
|
return field.toString().replace("\"", "\"\"");
|
||||||
.replace("\"", "\"\"")
|
|
||||||
.replace("\n", " ");
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user