(converter) Adjust which flags are set by anchor text keywords

It's a mistake to let anchor text keywords bleed into the Title flag, as Title is a high-quality signal. We'll co-opt the Site and SiteAdjacent flags instead, to reinforce ExternalLink when the keyword count is high.
This commit is contained in:
Viktor Lofgren 2024-01-23 11:54:00 +01:00
parent 3fff7f6878
commit 0081328aca
2 changed files with 16 additions and 8 deletions

View File

@ -46,7 +46,7 @@ public class AnchorTextKeywords {
return ret;
}
public List<String> getAnchorTextKeywords(DomainLinks links, EdgeUrl url) {
public Map<String, Integer> getAnchorTextKeywords(DomainLinks links, EdgeUrl url) {
var keywordsRaw = links.forUrl(url);
// Extract and count keywords from anchor text
@ -62,10 +62,10 @@ public class AnchorTextKeywords {
}
// Filter out keywords that appear infrequently
final List<String> keywords = new ArrayList<>(wordsWithCount.size());
final Map<String, Integer> keywords = new HashMap<>(wordsWithCount.size());
for (var wordEntry : wordsWithCount.entrySet()) {
if (wordEntry.getValue() > 2) {
keywords.add(wordEntry.getKey());
keywords.put(wordEntry.getKey(), wordEntry.getValue());
}
}

View File

@ -75,12 +75,20 @@ public class DocumentKeywordsBuilder {
newWords.forEach(word -> words.putIfAbsent(word, meta));
}
public void addAnchorTerms(List<String> keywords) {
long meta = WordFlags.Title.asBit()
| WordFlags.ExternalLink.asBit()
| (1L << WordMetadata.POSITIONS_SHIFT);
public void addAnchorTerms(Map<String, Integer> keywords) {
long flagA = WordFlags.ExternalLink.asBit();
long flagB = flagA | WordFlags.Site.asBit();
long flagC = flagB | WordFlags.SiteAdjacent.asBit();
keywords.forEach(word -> words.mergeLong(word, meta, (a, b) -> a|b));
keywords.forEach((word, count) -> {
if (count > 5) {
words.mergeLong(word, flagC, (a, b) -> a|b);
} else if (count > 2) {
words.mergeLong(word, flagB, (a, b) -> a|b);
} else {
words.mergeLong(word, flagA, (a, b) -> a|b);
}
});
}
public List<String> getWordsWithAnyFlag(long flags) {