Fix a query variant creation bug that caused the search engine to sometimes drop important words from a query.

This commit is contained in:
vlofgren 2022-09-12 23:32:46 +02:00
parent 8c24ac761a
commit 10d1307dd6

View File

@@ -3,6 +3,7 @@ package nu.marginalia.wmsa.edge.search.query;
import com.google.inject.Inject; import com.google.inject.Inject;
import com.google.inject.Singleton; import com.google.inject.Singleton;
import lombok.AllArgsConstructor; import lombok.AllArgsConstructor;
import lombok.EqualsAndHashCode;
import lombok.Getter; import lombok.Getter;
import lombok.ToString; import lombok.ToString;
import nu.marginalia.util.language.conf.LanguageModels; import nu.marginalia.util.language.conf.LanguageModels;
@@ -54,7 +55,7 @@ public class QueryVariants {
public final String wordOriginal; public final String wordOriginal;
} }
@AllArgsConstructor @Getter @ToString @AllArgsConstructor @Getter @ToString @EqualsAndHashCode
public static class QueryVariant { public static class QueryVariant {
public final List<String> terms; public final List<String> terms;
public final double value; public final double value;
@@ -97,12 +98,14 @@ public class QueryVariants {
var first = byStart.firstEntry(); var first = byStart.firstEntry();
if (first == null) { if (first == null) {
byStart.put(0, List.of(new WordSpan(0, sentence.length()))); var span = new WordSpan(0, sentence.length());
byStart.put(0, List.of(span));
} }
else if (first.getKey() > 0) { else if (first.getKey() > 0) {
List<WordSpan> elongatedFirstWords = new ArrayList<>(first.getValue().size()); List<WordSpan> elongatedFirstWords = new ArrayList<>(first.getValue().size());
first.getValue().forEach(span -> { first.getValue().forEach(span -> {
elongatedFirstWords.add(new WordSpan(0, span.start));
elongatedFirstWords.add(new WordSpan(0, span.end)); elongatedFirstWords.add(new WordSpan(0, span.end));
}); });
@@ -142,8 +145,7 @@ public class QueryVariants {
QueryVariantSet returnValue = new QueryVariantSet(); QueryVariantSet returnValue = new QueryVariantSet();
returnValue.faithful.addAll(evaluateQueries(faithfulQueries)); returnValue.faithful.addAll(evaluateQueries(faithfulQueries));
returnValue.alternative.addAll(evaluateQueries(alternativeQueries));
returnValue.faithful.addAll(evaluateQueries(alternativeQueries));
returnValue.faithful.sort(Comparator.comparing(QueryVariant::getValue)); returnValue.faithful.sort(Comparator.comparing(QueryVariant::getValue));
returnValue.alternative.sort(Comparator.comparing(QueryVariant::getValue)); returnValue.alternative.sort(Comparator.comparing(QueryVariant::getValue));
@@ -154,6 +156,7 @@ public class QueryVariants {
final Pattern underscore = Pattern.compile("_"); final Pattern underscore = Pattern.compile("_");
private List<QueryVariant> evaluateQueries(List<List<String>> queryStrings) { private List<QueryVariant> evaluateQueries(List<List<String>> queryStrings) {
Set<QueryVariant> variantsSet = new HashSet<>();
List<QueryVariant> ret = new ArrayList<>(); List<QueryVariant> ret = new ArrayList<>();
for (var lst : queryStrings) { for (var lst : queryStrings) {
double q = 0; double q = 0;
@@ -165,7 +168,10 @@ public class QueryVariants {
} }
q += 1.0 / qp; q += 1.0 / qp;
} }
ret.add(new QueryVariant(lst, q)); var qv = new QueryVariant(lst, q);
if (variantsSet.add(qv)) {
ret.add(qv);
}
} }
return ret; return ret;
} }
@@ -269,7 +275,7 @@ public class QueryVariants {
private List<List<Word>> getWordSpans(TreeMap<Integer, List<WordSpan>> byStart, DocumentSentence sentence, List<ArrayList<WordSpan>> livingSpans) { private List<List<Word>> getWordSpans(TreeMap<Integer, List<WordSpan>> byStart, DocumentSentence sentence, List<ArrayList<WordSpan>> livingSpans) {
List<List<Word>> goodSpans = new ArrayList<>(); List<List<Word>> goodSpans = new ArrayList<>();
for (int i = 0; i < sentence.length(); i++) { for (int i = 0; i < 1; i++) {
var spans = byStart.get(i); var spans = byStart.get(i);