Also grab alt text for images in a-tags in anchor text extractor

This commit is contained in:
vlofgren 2022-06-22 13:12:44 +02:00
parent 48e4aa3ee8
commit 4516b23f90

View File

@ -19,6 +19,8 @@ import nu.marginalia.wmsa.edge.crawling.model.CrawlerDocumentStatus;
import nu.marginalia.wmsa.edge.model.EdgeCrawlPlan;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -101,23 +103,42 @@ public class LinkKeywordExtractorMain {
}
}
private final Pattern anchorTextNoise = Pattern.compile("[\\s\"()“”:]+");
/**
 * Parses a document body as HTML and hands each hyperlink found in it to
 * {@code processAnchor}.
 *
 * <p>Only {@code <a>} elements that carry an {@code href} attribute are
 * considered. For each of them the raw {@code href} value is forwarded together
 * with the text produced by {@code getLinkText}.
 *
 * @param docUrl       the URL of the document, used to build the {@code EdgeUrl}
 *                     passed along with every anchor
 * @param documentBody the document's markup, parsed with {@code Jsoup.parse}
 * @throws URISyntaxException declared for the {@code EdgeUrl} construction;
 *                            presumably raised when {@code docUrl} is malformed
 *                            (confirm against {@code EdgeUrl})
 */
private void processDocument(String docUrl, String documentBody) throws URISyntaxException {
    final Document processed = Jsoup.parse(documentBody);
    final EdgeUrl documentUrl = new EdgeUrl(docUrl);

    for (var link : processed.getElementsByTag("a")) {
        if (link.hasAttr("href")) {
            String href = link.attr("href");
            String text = getLinkText(link);

            processAnchor(documentUrl, href, text);
        }
    }
}
// Matches runs of whitespace, straight and curly double quotes, parentheses and
// colons; getLinkText replaces each such run with a single space.
private final Pattern anchorTextNoise = Pattern.compile("[\\s\"()“”:]+");
/**
 * Produces the normalized anchor text for a link.
 *
 * <p>The element's own text is used when it is not blank; otherwise the value
 * returned by {@link #getLinkTextByImgAltTag(Element)} is used instead. The
 * chosen text is lower-cased, every run of characters matched by
 * {@code anchorTextNoise} is replaced with a single space, and the result is
 * trimmed.
 *
 * @param link the anchor element to read text from
 * @return the normalized text, possibly empty
 */
private String getLinkText(Element link) {
    String text = link.text();

    if (text.isBlank()) {
        text = getLinkTextByImgAltTag(link);
    }

    // Locale.ROOT keeps case folding independent of the JVM's default locale;
    // under e.g. a Turkish default, "I" would otherwise become a dotless "ı".
    final String lowered = text.toLowerCase(java.util.Locale.ROOT);

    return anchorTextNoise.matcher(lowered).replaceAll(" ").trim();
}
/**
 * Looks for substitute link text in the {@code alt} attributes of the
 * {@code <img>} elements found under the given link.
 *
 * <p>Images whose {@code alt} is missing or blank are skipped, so an empty
 * {@code alt=""} on one image does not hide a descriptive {@code alt} on a
 * later one.
 *
 * @param link the anchor element whose images are inspected
 * @return the first non-blank {@code alt} value, unmodified, or an empty
 *         string if there is none
 */
private String getLinkTextByImgAltTag(Element link) {
    for (var img : link.getElementsByTag("img")) {
        // attr() yields "" for a missing attribute, so one blank check
        // covers both the absent and the empty case.
        final String alt = img.attr("alt");
        if (!alt.isBlank()) {
            return alt;
        }
    }
    return "";
}
private void processAnchor(EdgeUrl documentUrl, String href, String text) {
if (!isInterestingAnchorText(text)) {
return;