From 4516b23f90806671dda7256c7891642f5f29f839 Mon Sep 17 00:00:00 2001
From: vlofgren
Date: Wed, 22 Jun 2022 13:12:44 +0200
Subject: [PATCH] Also grab alt text for images in a-tags in anchor text extractor

---
 .../converting/LinkKeywordExtractorMain.java | 31 ++++++++++++++++---
 1 file changed, 26 insertions(+), 5 deletions(-)

diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LinkKeywordExtractorMain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LinkKeywordExtractorMain.java
index f60541e3..570c47b5 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LinkKeywordExtractorMain.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LinkKeywordExtractorMain.java
@@ -19,6 +19,8 @@ import nu.marginalia.wmsa.edge.crawling.model.CrawlerDocumentStatus;
 import nu.marginalia.wmsa.edge.model.EdgeCrawlPlan;
 import nu.marginalia.wmsa.edge.model.EdgeUrl;
 import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -101,23 +103,42 @@ public class LinkKeywordExtractorMain {
         }
     }
 
-    private final Pattern anchorTextNoise = Pattern.compile("[\\s\"()“”:]+");
 
     private void processDocument(String docUrl, String documentBody) throws URISyntaxException {
-        var processed = Jsoup.parse(documentBody);
-
-        EdgeUrl documentUrl = new EdgeUrl(docUrl);
+        final Document processed = Jsoup.parse(documentBody);
+        final EdgeUrl documentUrl = new EdgeUrl(docUrl);
 
         for (var link : processed.getElementsByTag("a")) {
             if (link.hasAttr("href")) {
                 String href = link.attr("href");
-                String text = anchorTextNoise.matcher(link.text().toLowerCase()).replaceAll(" ").trim();
+                String text = getLinkText(link);
 
                 processAnchor(documentUrl, href, text);
             }
         }
     }
 
+    private final Pattern anchorTextNoise = Pattern.compile("[\\s\"()“”:]+");
+
+    private String getLinkText(Element link) {
+        String text = link.text();
+
+        if (link.text().isBlank()) {
+            text = getLinkTextByImgAltTag(link);
+        }
+
+        return anchorTextNoise.matcher(text.toLowerCase()).replaceAll(" ").trim();
+    }
+
+    private String getLinkTextByImgAltTag(Element link) {
+        for (var img: link.getElementsByTag("img")) {
+            if (img.hasAttr("alt")) {
+                return img.attr("alt");
+            }
+        }
+        return "";
+    }
+
     private void processAnchor(EdgeUrl documentUrl, String href, String text) {
         if (!isInterestingAnchorText(text)) {
             return;