Fix bug where results would sometimes be presented solely based on the fact that the keyword is important on the site in general, regardless of whether it's important to the document.

This commit is contained in:
Viktor Lofgren 2023-03-11 14:20:32 +01:00
parent 2e2916cebe
commit 4cec89da91
3 changed files with 30 additions and 18 deletions

View File

@ -53,13 +53,16 @@ public final class SearchResultKeywordScore {
public double termValue() {
double sum = 0;
double tfIdf = WordMetadata.decodeTfidf(encodedWordMetadata);
int positionBits = WordMetadata.decodePositions(encodedWordMetadata);
if (hasTermFlag(WordFlags.Title)) {
sum -= 15;
}
if (hasTermFlag(WordFlags.Site)) {
if (hasTermFlag(WordFlags.Site) && positionBits != 0) {
sum -= 10;
} else if (hasTermFlag(WordFlags.SiteAdjacent)) {
} else if (hasTermFlag(WordFlags.SiteAdjacent) && positionBits != 0) {
sum -= 5;
}
@ -78,8 +81,6 @@ public final class SearchResultKeywordScore {
sum -= 5;
}
double tfIdf = WordMetadata.decodeTfidf(encodedWordMetadata);
int positionBits = WordMetadata.decodePositions(encodedWordMetadata);
sum -= tfIdf / 10.;
sum -= Integer.bitCount(positionBits) / 3.;

View File

@ -73,7 +73,7 @@ public record WordMetadata(int tfIdf,
sb.append('[')
.append("tfidf=").append(tfIdf).append(", ")
.append("positions=[").append(BrailleBlockPunchCards.printBits(positions, 32)).append(']');
sb.append(", flags=").append(flags).append(']');
sb.append(", flags=").append(flagSet()).append(']');
return sb.toString();
}
@ -101,4 +101,8 @@ public record WordMetadata(int tfIdf,
/**
 * Returns this record's {@code flags} value decoded via {@link WordFlags#decode}.
 *
 * @return the set of {@link WordFlags} produced by decoding {@code flags}
 */
public EnumSet<WordFlags> flagSet() {
return WordFlags.decode(flags);
}
/**
 * Counts the bits that are set in {@code positions}.
 *
 * @return the number of one-bits in {@code positions}; zero when no bit is set
 */
public int positionCount() {
return Integer.bitCount(positions);
}
}

View File

@ -138,25 +138,29 @@ public class SearchResultValuator {
}
private double calculateSingleTermBonus(SearchResultsKeywordSet set, double totalFactor) {
var theKeyword = set.iterator().next();
final var theKeyword = set.iterator().next();
if (theKeyword.wordMetadata.hasFlag(WordFlags.Title)) {
final var wordMetadata = theKeyword.wordMetadata;
final int posCount = wordMetadata.positionCount();
if (wordMetadata.hasFlag(WordFlags.Title)) {
return totalFactor * 0.5;
}
else if (theKeyword.wordMetadata.hasFlag(WordFlags.Subjects)) {
else if (wordMetadata.hasFlag(WordFlags.Subjects)) {
return totalFactor * 0.6;
}
else if (theKeyword.wordMetadata.hasFlag(WordFlags.SiteAdjacent)) {
else if (wordMetadata.hasFlag(WordFlags.SiteAdjacent) && posCount > 0) {
return totalFactor * 0.65;
}
else if (theKeyword.wordMetadata.hasFlag(WordFlags.Site)) {
else if (wordMetadata.hasFlag(WordFlags.Site) && posCount > 0) {
return totalFactor * 0.7;
}
if (theKeyword.wordMetadata.hasFlag(WordFlags.UrlDomain)) {
if (wordMetadata.hasFlag(WordFlags.UrlDomain)) {
return totalFactor * 0.8;
}
else if (theKeyword.wordMetadata.hasFlag(WordFlags.UrlPath)) {
else if (wordMetadata.hasFlag(WordFlags.UrlPath) && posCount > 2)
{
return totalFactor * 0.9;
}
@ -213,6 +217,8 @@ public class SearchResultValuator {
final double k = keyword.weight() / totalWeight;
int posCount = keyword.wordMetadata.positionCount();
EnumSet<WordFlags> flags = keyword.flags();
final boolean title = flags.contains(WordFlags.Title);
@ -235,11 +241,12 @@ public class SearchResultValuator {
}
}
if (site) {
f *= Math.pow(0.75, k);
}
else if (siteAdjacent) {
f *= Math.pow(0.8, k);
if (posCount != 0) {
if (site) {
f *= Math.pow(0.75, k);
} else if (siteAdjacent) {
f *= Math.pow(0.8, k);
}
}
if (subject) {
@ -249,7 +256,7 @@ public class SearchResultValuator {
if (urlDomain) {
f *= Math.pow(0.8, k);
}
else if (urlPath) {
else if (urlPath && posCount > 1) {
f *= Math.pow(0.9, k);
}