From 4cec89da91c390ff530e786f65103ecb1190ddc6 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Sat, 11 Mar 2023 14:20:32 +0100 Subject: [PATCH] Fix bug where results would sometimes be presented solely based on the fact that the keyword is important on the site in general, regardless of whether it's important to the document. --- .../results/SearchResultKeywordScore.java | 9 ++--- .../nu/marginalia/model/idx/WordMetadata.java | 6 +++- .../valuation/SearchResultValuator.java | 33 +++++++++++-------- 3 files changed, 30 insertions(+), 18 deletions(-) diff --git a/code/api/index-api/src/main/java/nu/marginalia/index/client/model/results/SearchResultKeywordScore.java b/code/api/index-api/src/main/java/nu/marginalia/index/client/model/results/SearchResultKeywordScore.java index 9e08ba35..ef286613 100644 --- a/code/api/index-api/src/main/java/nu/marginalia/index/client/model/results/SearchResultKeywordScore.java +++ b/code/api/index-api/src/main/java/nu/marginalia/index/client/model/results/SearchResultKeywordScore.java @@ -53,13 +53,16 @@ public final class SearchResultKeywordScore { public double termValue() { double sum = 0; + double tfIdf = WordMetadata.decodeTfidf(encodedWordMetadata); + int positionBits = WordMetadata.decodePositions(encodedWordMetadata); + if (hasTermFlag(WordFlags.Title)) { sum -= 15; } - if (hasTermFlag(WordFlags.Site)) { + if (hasTermFlag(WordFlags.Site) && positionBits != 0) { sum -= 10; - } else if (hasTermFlag(WordFlags.SiteAdjacent)) { + } else if (hasTermFlag(WordFlags.SiteAdjacent) && positionBits != 0) { sum -= 5; } @@ -78,8 +81,6 @@ public final class SearchResultKeywordScore { sum -= 5; } - double tfIdf = WordMetadata.decodeTfidf(encodedWordMetadata); - int positionBits = WordMetadata.decodePositions(encodedWordMetadata); sum -= tfIdf / 10.; sum -= Integer.bitCount(positionBits) / 3.; diff --git a/code/common/model/src/main/java/nu/marginalia/model/idx/WordMetadata.java 
b/code/common/model/src/main/java/nu/marginalia/model/idx/WordMetadata.java index 511563f3..e07cbcbb 100644 --- a/code/common/model/src/main/java/nu/marginalia/model/idx/WordMetadata.java +++ b/code/common/model/src/main/java/nu/marginalia/model/idx/WordMetadata.java @@ -73,7 +73,7 @@ public record WordMetadata(int tfIdf, sb.append('[') .append("tfidf=").append(tfIdf).append(", ") .append("positions=[").append(BrailleBlockPunchCards.printBits(positions, 32)).append(']'); - sb.append(", flags=").append(flags).append(']'); + sb.append(", flags=").append(flagSet()).append(']'); return sb.toString(); } @@ -101,4 +101,8 @@ public record WordMetadata(int tfIdf, public EnumSet flagSet() { return WordFlags.decode(flags); } + + public int positionCount() { + return Integer.bitCount(positions); + } } diff --git a/code/services-core/search-service/src/main/java/nu/marginalia/search/valuation/SearchResultValuator.java b/code/services-core/search-service/src/main/java/nu/marginalia/search/valuation/SearchResultValuator.java index 46fb0cb5..8055dcf8 100644 --- a/code/services-core/search-service/src/main/java/nu/marginalia/search/valuation/SearchResultValuator.java +++ b/code/services-core/search-service/src/main/java/nu/marginalia/search/valuation/SearchResultValuator.java @@ -138,25 +138,29 @@ public class SearchResultValuator { } private double calculateSingleTermBonus(SearchResultsKeywordSet set, double totalFactor) { - var theKeyword = set.iterator().next(); + final var theKeyword = set.iterator().next(); - if (theKeyword.wordMetadata.hasFlag(WordFlags.Title)) { + final var wordMetadata = theKeyword.wordMetadata; + final int posCount = wordMetadata.positionCount(); + + if (wordMetadata.hasFlag(WordFlags.Title)) { return totalFactor * 0.5; } - else if (theKeyword.wordMetadata.hasFlag(WordFlags.Subjects)) { + else if (wordMetadata.hasFlag(WordFlags.Subjects)) { return totalFactor * 0.6; } - else if (theKeyword.wordMetadata.hasFlag(WordFlags.SiteAdjacent)) { + else if 
(wordMetadata.hasFlag(WordFlags.SiteAdjacent) && posCount > 0) { return totalFactor * 0.65; } - else if (theKeyword.wordMetadata.hasFlag(WordFlags.Site)) { + else if (wordMetadata.hasFlag(WordFlags.Site) && posCount > 0) { return totalFactor * 0.7; } - if (theKeyword.wordMetadata.hasFlag(WordFlags.UrlDomain)) { + if (wordMetadata.hasFlag(WordFlags.UrlDomain)) { return totalFactor * 0.8; } - else if (theKeyword.wordMetadata.hasFlag(WordFlags.UrlPath)) { + else if (wordMetadata.hasFlag(WordFlags.UrlPath) && posCount > 2) + { return totalFactor * 0.9; } @@ -213,6 +217,8 @@ public class SearchResultValuator { final double k = keyword.weight() / totalWeight; + int posCount = keyword.wordMetadata.positionCount(); + EnumSet flags = keyword.flags(); final boolean title = flags.contains(WordFlags.Title); @@ -235,11 +241,12 @@ public class SearchResultValuator { } } - if (site) { - f *= Math.pow(0.75, k); - } - else if (siteAdjacent) { - f *= Math.pow(0.8, k); + if (posCount != 0) { + if (site) { + f *= Math.pow(0.75, k); + } else if (siteAdjacent) { + f *= Math.pow(0.8, k); + } } if (subject) { @@ -249,7 +256,7 @@ public class SearchResultValuator { if (urlDomain) { f *= Math.pow(0.8, k); } - else if (urlPath) { + else if (urlPath && posCount > 1) { f *= Math.pow(0.9, k); }