diff --git a/code/features-convert/anchor-keywords/src/main/java/nu/marginalia/atags/AnchorTextKeywords.java b/code/features-convert/anchor-keywords/src/main/java/nu/marginalia/atags/AnchorTextKeywords.java index 524caaba..95e37836 100644 --- a/code/features-convert/anchor-keywords/src/main/java/nu/marginalia/atags/AnchorTextKeywords.java +++ b/code/features-convert/anchor-keywords/src/main/java/nu/marginalia/atags/AnchorTextKeywords.java @@ -46,7 +46,7 @@ public class AnchorTextKeywords { return ret; } - public List getAnchorTextKeywords(DomainLinks links, EdgeUrl url) { + public Map getAnchorTextKeywords(DomainLinks links, EdgeUrl url) { var keywordsRaw = links.forUrl(url); // Extract and count keywords from anchor text @@ -62,10 +62,10 @@ public class AnchorTextKeywords { } // Filter out keywords that appear infrequently - final List keywords = new ArrayList<>(wordsWithCount.size()); + final Map keywords = new HashMap<>(wordsWithCount.size()); for (var wordEntry : wordsWithCount.entrySet()) { if (wordEntry.getValue() > 2) { - keywords.add(wordEntry.getKey()); + keywords.put(wordEntry.getKey(), wordEntry.getValue()); } } diff --git a/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword/model/DocumentKeywordsBuilder.java b/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword/model/DocumentKeywordsBuilder.java index aadb893d..49cf3914 100644 --- a/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword/model/DocumentKeywordsBuilder.java +++ b/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword/model/DocumentKeywordsBuilder.java @@ -75,12 +75,20 @@ public class DocumentKeywordsBuilder { newWords.forEach(word -> words.putIfAbsent(word, meta)); } - public void addAnchorTerms(List keywords) { - long meta = WordFlags.Title.asBit() - | WordFlags.ExternalLink.asBit() - | (1L << WordMetadata.POSITIONS_SHIFT); + public void addAnchorTerms(Map keywords) { + long flagA = WordFlags.ExternalLink.asBit(); + long flagB = flagA | WordFlags.Site.asBit(); + long flagC = flagB | WordFlags.SiteAdjacent.asBit(); - keywords.forEach(word -> words.mergeLong(word, meta, (a, b) -> a|b)); + keywords.forEach((word, count) -> { + if (count > 5) { + words.mergeLong(word, flagC, (a, b) -> a|b); + } else if (count > 2) { + words.mergeLong(word, flagB, (a, b) -> a|b); + } else { + words.mergeLong(word, flagA, (a, b) -> a|b); + } + }); } public List getWordsWithAnyFlag(long flags) {