(forward-index, valuator) HTML features in valuator

Put the HTML features in the forward index for easy access during index-side valuation.
Viktor Lofgren 2023-08-18 11:54:56 +02:00
parent fcfe07fb7d
commit 704de50a9b
27 changed files with 167 additions and 48 deletions

SearchResultKeywordScore.java

@@ -2,7 +2,6 @@ package nu.marginalia.index.client.model.results;

 import nu.marginalia.model.idx.WordFlags;
 import nu.marginalia.model.idx.WordMetadata;
-import nu.marginalia.model.idx.DocumentFlags;
 import nu.marginalia.model.idx.DocumentMetadata;

 import java.util.Objects;

@@ -14,15 +13,19 @@ public final class SearchResultKeywordScore {
     private final long encodedDocMetadata;
     private final boolean hasPriorityTerms;
+    private final int htmlFeatures;

     public SearchResultKeywordScore(int subquery,
                                     String keyword,
                                     long encodedWordMetadata,
                                     long encodedDocMetadata,
+                                    int htmlFeatures,
                                     boolean hasPriorityTerms) {
         this.subquery = subquery;
         this.keyword = keyword;
         this.encodedWordMetadata = encodedWordMetadata;
         this.encodedDocMetadata = encodedDocMetadata;
+        this.htmlFeatures = htmlFeatures;
         this.hasPriorityTerms = hasPriorityTerms;
     }

@@ -58,6 +61,10 @@ public final class SearchResultKeywordScore {
         return encodedDocMetadata;
     }

+    public int htmlFeatures() {
+        return htmlFeatures;
+    }
+
     public boolean hasPriorityTerms() {
         return hasPriorityTerms;
     }

ForwardIndexConverter.java

@@ -88,8 +88,9 @@ public class ForwardIndexConverter {
     int ranking = domainRankings.getRanking(entry.domainId());
     long meta = DocumentMetadata.encodeRank(entry.docMeta(), ranking);

-    docFileData.set(entryOffset + ForwardIndexParameters.METADATA_OFFSET, meta);
     docFileData.set(entryOffset + ForwardIndexParameters.DOMAIN_OFFSET, entry.domainId());
+    docFileData.set(entryOffset + ForwardIndexParameters.METADATA_OFFSET, meta);
+    docFileData.set(entryOffset + ForwardIndexParameters.FEATURES_OFFSET, entry.header.documentFeatures());
 });

 progress.progress(TaskSteps.FORCE);

ForwardIndexParameters.java

@@ -1,8 +1,9 @@
 package nu.marginalia.index.forward;

 class ForwardIndexParameters {
-    public static final int ENTRY_SIZE = 2;
+    public static final int ENTRY_SIZE = 3;
     public static final int DOMAIN_OFFSET = 0;
     public static final int METADATA_OFFSET = 1;
+    public static final int FEATURES_OFFSET = 2;
 }
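
Each forward-index entry now spans three consecutive longs instead of two. As a layout sketch (illustrative comments, not code from the commit):

    // Entry for the document at index i in the forward index data file:
    //   data[3*i + DOMAIN_OFFSET]   (0) -> domain id
    //   data[3*i + METADATA_OFFSET] (1) -> encoded document metadata
    //   data[3*i + FEATURES_OFFSET] (2) -> HTML feature bits (new in this commit)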

ForwardIndexReader.java

@@ -77,6 +77,13 @@ public class ForwardIndexReader {
     return data.get(ENTRY_SIZE * offset + METADATA_OFFSET);
 }

+public int getHtmlFeatures(long docId) {
+    long offset = idxForDoc(docId);
+    if (offset < 0) return 0;
+
+    return (int) data.get(ENTRY_SIZE * offset + FEATURES_OFFSET);
+}
+
 public int getDomainId(long docId) {
     long offset = idxForDoc(docId);
     if (offset < 0) return 0;
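
A caller-side sketch of consuming these bits (hypothetical usage, not part of the commit, using the HtmlFeature enum that appears later in this diff):

    int features = forwardIndexReader.getHtmlFeatures(docId);
    boolean hasTracking = (features & HtmlFeature.TRACKING.getFeatureBit()) != 0;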

IndexJournalEntry.java

@@ -7,7 +7,7 @@ import nu.marginalia.model.id.EdgeId;

 public record IndexJournalEntry(IndexJournalEntryHeader header, IndexJournalEntryData data) {

     public static IndexJournalEntryBuilder builder(long documentId, long documentMeta) {
-        return new IndexJournalEntryBuilder(documentId, documentMeta);
+        return new IndexJournalEntryBuilder(0, documentId, documentMeta);
     }

     public static IndexJournalEntryBuilder builder(int domainId,
@@ -15,7 +15,9 @@ public record IndexJournalEntry(IndexJournalEntryHeader header, IndexJournalEntryData data) {
                                                    long documentMeta) {
-        return builder(new EdgeId<>(domainId), new EdgeId<>(urlId), documentMeta);
+        return builder(new EdgeId<>(domainId),
+                       new EdgeId<>(urlId),
+                       documentMeta);
     }

     public static IndexJournalEntryBuilder builder(EdgeId<EdgeDomain> domainId,
@@ -23,6 +25,8 @@ public record IndexJournalEntry(IndexJournalEntryHeader header, IndexJournalEntryData data) {
                                                    long documentMeta) {
-        return new IndexJournalEntryBuilder(IndexJournalEntryHeader.combineIds(domainId, urlId), documentMeta);
+        return new IndexJournalEntryBuilder(0,
+                IndexJournalEntryHeader.combineIds(domainId, urlId),
+                documentMeta);
     }
 }

IndexJournalEntryBuilder.java

@@ -4,10 +4,15 @@ import gnu.trove.list.array.TLongArrayList;

 public class IndexJournalEntryBuilder {
     private final long documentId;
+    private final int documentFeatures;
     private final long documentMeta;
     private final TLongArrayList items = new TLongArrayList();

-    public IndexJournalEntryBuilder(long documentId, long documentMeta) {
+    public IndexJournalEntryBuilder(
+            int documentFeatures,
+            long documentId,
+            long documentMeta) {
+        this.documentFeatures = documentFeatures;
         this.documentId = documentId;
         this.documentMeta = documentMeta;
     }

@@ -22,7 +27,10 @@ public class IndexJournalEntryBuilder {
     public IndexJournalEntry build() {
         return new IndexJournalEntry(
-                new IndexJournalEntryHeader(items.size(), documentId, documentMeta),
+                new IndexJournalEntryHeader(items.size(),
+                        documentFeatures,
+                        documentId,
+                        documentMeta),
                 new IndexJournalEntryData(items.toArray())
         );
     }

IndexJournalEntryHeader.java

@@ -4,10 +4,19 @@ import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.id.EdgeId;

-public record IndexJournalEntryHeader(int entrySize, long combinedId, long documentMeta) {
+public record IndexJournalEntryHeader(int entrySize,
+                                      int documentFeatures,
+                                      long combinedId,
+                                      long documentMeta) {

-    public IndexJournalEntryHeader(EdgeId<EdgeDomain> domainId, EdgeId<EdgeUrl> urlId, long documentMeta) {
-        this(-1, combineIds(domainId, urlId), documentMeta);
+    public IndexJournalEntryHeader(EdgeId<EdgeDomain> domainId,
+                                   int documentFeatures,
+                                   EdgeId<EdgeUrl> urlId,
+                                   long documentMeta) {
+        this(-1,
+             documentFeatures,
+             combineIds(domainId, urlId),
+             documentMeta);
     }

     static long combineIds(EdgeId<EdgeDomain> domainId, EdgeId<EdgeUrl> urlId) {

IndexJournalReadEntry.java

@@ -30,6 +30,7 @@ public class IndexJournalReadEntry {
     var header = new IndexJournalEntryHeader(
             (int) (sizeBlock >>> 32L),
+            (int) (sizeBlock & 0xFFFF_FFFFL),
             docId,
             meta);

IndexJournalWriterImpl.java

@@ -72,7 +72,7 @@ public class IndexJournalWriterImpl implements IndexJournalWriter {
     }

     dataBuffer.putInt(entry.size());
-    dataBuffer.putInt(0);
+    dataBuffer.putInt(header.documentFeatures());
     dataBuffer.putLong(header.combinedId());
     dataBuffer.putLong(header.documentMeta());
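
Taken together with the previous file: the writer emits two 32-bit ints (entry size, then document features) which the reader consumes as the single 64-bit sizeBlock. A minimal sketch of the packing, assuming the buffer's default big-endian byte order:

    // size in the high word, features in the low word
    long sizeBlock = ((long) entrySize << 32) | (documentFeatures & 0xFFFF_FFFFL);
    int size     = (int) (sizeBlock >>> 32L);         // matches (sizeBlock >>> 32L) above
    int features = (int) (sizeBlock & 0xFFFF_FFFFL);  // matches (sizeBlock & 0xFFFF_FFFFL)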

ReverseIndexFullConverterTest2.java

@@ -106,7 +106,7 @@ class ReverseIndexFullConverterTest2 {
     }

     public void createEntry(IndexJournalWriter writer, KeywordLexicon keywordLexicon, int id) {
         int[] factors = getFactorsI(id);
-        var header = new IndexJournalEntryHeader(factors.length, createId(id, id/20), id % 5);
+        var header = new IndexJournalEntryHeader(factors.length, 0, createId(id, id/20), id % 5);

         long[] data = new long[factors.length*2];

         for (int i = 0; i < factors.length; i++) {

ReverseIndexPriorityConverterTest2.java

@@ -106,7 +106,7 @@ class ReverseIndexPriorityConverterTest2 {
     }

     public void createEntry(IndexJournalWriter writer, KeywordLexicon keywordLexicon, int id) {
         int[] factors = getFactorsI(id);
-        var header = new IndexJournalEntryHeader(factors.length, createId(id, id/20), id % 5);
+        var header = new IndexJournalEntryHeader(factors.length, 0, createId(id, id/20), id % 5);

         long[] data = new long[factors.length*2];

         for (int i = 0; i < factors.length; i++) {

ResultValuator.java

@@ -5,6 +5,7 @@ import nu.marginalia.index.client.model.results.ResultRankingParameters;
 import nu.marginalia.index.client.model.results.SearchResultKeywordScore;
 import nu.marginalia.model.crawl.HtmlFeature;
 import nu.marginalia.model.crawl.PubDate;
+import nu.marginalia.model.idx.DocumentFlags;
 import nu.marginalia.model.idx.DocumentMetadata;
 import nu.marginalia.ranking.factors.*;

@@ -48,19 +49,20 @@ public class ResultValuator {
         double bestScore = 10;

         long documentMetadata = documentMetadata(scores);
+        int features = htmlFeatures(scores);
         var rankingParams = ctx.params;

         int rank = DocumentMetadata.decodeRank(documentMetadata);
         int asl = DocumentMetadata.decodeAvgSentenceLength(documentMetadata);
         int quality = DocumentMetadata.decodeQuality(documentMetadata);
-        int urlTypePenalty = getUrlTypePenalty(documentMetadata);
         int size = DocumentMetadata.decodeSize(documentMetadata);
+        int flagsPenalty = flagsPenalty(features, documentMetadata & 0xFF, size, quality);
         int topology = DocumentMetadata.decodeTopology(documentMetadata);
         int year = DocumentMetadata.decodeYear(documentMetadata);

         double averageSentenceLengthPenalty = (asl >= rankingParams.shortSentenceThreshold ? 0 : -rankingParams.shortSentencePenalty);

-        final double qualityPenalty = -quality * rankingParams.qualityPenalty;
+        final double qualityPenalty = calculateQualityPenalty(size, quality, rankingParams);
         final double rankingBonus = (255. - rank) * rankingParams.domainRankBonus;
         final double topologyBonus = Math.log(1 + topology);
         final double documentLengthPenalty = length > rankingParams.shortDocumentThreshold ? 0 : -rankingParams.shortDocumentPenalty;

@@ -80,7 +82,7 @@ public class ResultValuator {
                 + rankingBonus
                 + topologyBonus
                 + temporalBias
-                + urlTypePenalty
+                + flagsPenalty
                 + priorityTermBonus.calculate(scores);

         for (int set = 0; set <= sets; set++) {
@@ -93,7 +95,8 @@ public class ResultValuator {
             final double bm25 = rankingParams.bm25FullWeight * bm25Factor.calculateBm25(rankingParams.fullParams, keywordSet, length, ctx);
             final double bm25p = rankingParams.bm25PrioWeight * bm25Factor.calculateBm25Prio(rankingParams.prioParams, keywordSet, ctx);

-            double score = normalize(bm25 + bm25p + tcf + overallPart, keywordSet.length());
+            double nonNormalizedScore = bm25 + bm25p + tcf + overallPart;
+            double score = normalize(nonNormalizedScore, keywordSet.length());

             bestScore = min(bestScore, score);

@@ -102,16 +105,55 @@ public class ResultValuator {
         return bestScore;
     }

-    private int getUrlTypePenalty(long documentMetadata) {
-        // Long urls-that-look-like-this tend to be poor search results
-        if (DocumentMetadata.hasFlags(documentMetadata,
-                HtmlFeature.LONG_URL.getFeatureBit()
-                        | HtmlFeature.KEBAB_CASE_URL.getFeatureBit())) {
-            return 2;
+    private double calculateQualityPenalty(int size, int quality, ResultRankingParameters rankingParams) {
+        if (size < 400) {
+            if (quality < 5)
+                return 0;
+            return -quality * rankingParams.qualityPenalty;
         }
         else {
-            return 0;
+            return -quality * rankingParams.qualityPenalty * 20;
         }
     }

+    private int flagsPenalty(int featureFlags, long docFlags, int size, double quality) {
+
+        // Short-circuit for index-service, which does not have the feature flags
+        if (featureFlags == 0)
+            return 0;
+
+        double penalty = 0;
+
+        boolean isForum = DocumentFlags.GeneratorForum.isPresent(docFlags);
+
+        // Penalize large sites harder for any bullshit as it's a strong signal of a low quality site
+        double largeSiteFactor = 1.;
+
+        if (!isForum && size > 400) {
+            // Long urls-that-look-like-this tend to be poor search results
+            if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.KEBAB_CASE_URL.getFeatureBit()))
+                penalty += 30.0;
+            else if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.LONG_URL.getFeatureBit()))
+                penalty += 30.;
+            else penalty += 5.;
+
+            largeSiteFactor = 2;
+        }
+
+        if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.TRACKING_ADTECH.getFeatureBit()))
+            penalty += 5.0 * largeSiteFactor;
+
+        if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.AFFILIATE_LINK.getFeatureBit()))
+            penalty += 5.0 * largeSiteFactor;
+
+        if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.TRACKING.getFeatureBit()))
+            penalty += 2.5 * largeSiteFactor;
+
+        if (isForum) {
+            penalty = Math.min(0, penalty - 2);
+        }
+
+        return (int) -penalty;
+    }

     private long documentMetadata(List<SearchResultKeywordScore> rawScores) {
@@ -121,6 +163,13 @@ public class ResultValuator {
         return 0;
     }

+    private int htmlFeatures(List<SearchResultKeywordScore> rawScores) {
+        for (var score : rawScores) {
+            return score.htmlFeatures();
+        }
+        return 0;
+    }
+
     private ResultKeywordSet createKeywordSet(ValuatorListPool<SearchResultKeywordScore> listPool,
                                               List<SearchResultKeywordScore> rawScores,
                                               int thisSet)
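
Note that htmlFeatures(rawScores) takes the value off the first keyword score, since every score for a given document carries the same feature mask (see IndexResultValuator later in this diff). A worked example of the new flagsPenalty (illustrative arithmetic, not from the commit): a non-forum page with size = 500 and only the TRACKING_ADTECH and TRACKING bits set:

    large site, no URL flags  ->  penalty += 5.0, largeSiteFactor = 2
    TRACKING_ADTECH           ->  penalty += 5.0 * 2 = 10.0
    TRACKING                  ->  penalty += 2.5 * 2 =  5.0
    total 20.0                ->  flagsPenalty returns (int) -20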

ResultValuatorTest.java

@@ -40,20 +40,20 @@ class ResultValuatorTest {
             new SearchResultKeywordScore(0, "bob",
                     wordMetadata(Set.of(1), EnumSet.of(WordFlags.Title)),
                     docMetadata(0, 2010, 5, EnumSet.noneOf(DocumentFlags.class)),
-                    false)
+                    0, false)
     );
     List<SearchResultKeywordScore> highCountNoTitleSet = List.of(
             new SearchResultKeywordScore(0, "bob",
                     wordMetadata(Set.of(1,3,4,6,7,9,10,11,12,14,15,16), EnumSet.of(WordFlags.TfIdfHigh)),
                     docMetadata(0, 2010, 5, EnumSet.noneOf(DocumentFlags.class)),
-                    false)
+                    0, false)
     );
     List<SearchResultKeywordScore> highCountSubjectSet = List.of(
             new SearchResultKeywordScore(0, "bob",
                     wordMetadata(Set.of(1,3,4,6,7,9,10,11,12,14,15,16), EnumSet.of(WordFlags.TfIdfHigh, WordFlags.Subjects)),
                     docMetadata(0, 2010, 5, EnumSet.noneOf(DocumentFlags.class)),
-                    false)
+                    0, false)
     );

TermCoherenceFactorTest.java

@@ -89,7 +89,7 @@ class TermCoherenceFactorTest {
     for (int i = 0; i < positionMasks.length; i++) {
         keywords.add(new SearchResultKeywordScore(0, "",
-                new WordMetadata(positionMasks[i], (byte) 0).encode(), 0, false));
+                new WordMetadata(positionMasks[i], (byte) 0).encode(), 0, 0, false));
     }

     return new ResultKeywordSet(keywords);

Interpreter.java

@@ -19,7 +19,7 @@ public interface Interpreter {
     default void loadProcessedDocument(LoadProcessedDocument loadProcessedDocument) {}
     default void loadProcessedDocumentWithError(LoadProcessedDocumentWithError loadProcessedDocumentWithError) {}

-    default void loadKeywords(EdgeUrl url, DocumentMetadata metadata, DocumentKeywords words) {}
+    default void loadKeywords(EdgeUrl url, int features, DocumentMetadata metadata, DocumentKeywords words) {}

     default void loadDomainRedirect(DomainLink link) {}

LoadKeywords.java

@@ -7,11 +7,11 @@ import nu.marginalia.converting.instruction.InstructionTag;
 import nu.marginalia.converting.instruction.Interpreter;
 import nu.marginalia.model.EdgeUrl;

-public record LoadKeywords(EdgeUrl url, DocumentMetadata metadata, DocumentKeywords words) implements Instruction {
+public record LoadKeywords(EdgeUrl url, int features, DocumentMetadata metadata, DocumentKeywords words) implements Instruction {

     @Override
     public void apply(Interpreter interpreter) {
-        interpreter.loadKeywords(url, metadata, words);
+        interpreter.loadKeywords(url, features, metadata, words);
     }

     @Override

InstructionWriterFactory.java

@@ -130,7 +130,7 @@ public class InstructionWriterFactory {
     }

     @Override
-    public void loadKeywords(EdgeUrl url, DocumentMetadata metadata, DocumentKeywords words) {
+    public void loadKeywords(EdgeUrl url, int features, DocumentMetadata metadata, DocumentKeywords words) {
         keywords++;
     }

DocumentsCompiler.java

@@ -31,11 +31,16 @@ public class DocumentsCompiler {
         }
     }

-    public void compileWords(Consumer<Instruction> instructionConsumer, ProcessedDocument doc) {
+    public void compileWords(Consumer<Instruction> instructionConsumer,
+                             ProcessedDocument doc) {
         var words = doc.words;

         if (words != null) {
-            instructionConsumer.accept(new LoadKeywords(doc.url, doc.details.metadata, words.build()));
+            instructionConsumer.accept(new LoadKeywords(doc.url,
+                    HtmlFeature.encode(doc.details.features),
+                    doc.details.metadata,
+                    words.build())
+            );
         }
     }
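
This is where the feature bits are first materialized on the converter side. The body of HtmlFeature.encode is not part of this diff; presumably (an assumption) it ORs together the feature bits of the set, along these lines:

    // sketch, assuming encode() folds getFeatureBit() over the set
    static int encode(Set<HtmlFeature> featureSet) {
        int ret = 0;
        for (var feature : featureSet) {
            ret |= feature.getFeatureBit();
        }
        return ret;
    }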

ConvertingIntegrationTest.java

@@ -17,7 +17,7 @@ import nu.marginalia.model.crawl.UrlIndexingState;
 import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Test;

-import java.io.IOException;
+import java.io.*;
 import java.nio.file.Path;
 import java.time.LocalTime;
 import java.util.*;
@@ -143,4 +143,5 @@ public class ConvertingIntegrationTest {
         return SerializableCrawlDataStream.fromIterator(data.iterator());
     }
 }

IndexLoadKeywords.java

@@ -18,7 +18,11 @@ public class IndexLoadKeywords implements Runnable {
     private final LinkedBlockingQueue<InsertTask> insertQueue = new LinkedBlockingQueue<>(32);
     private final LoaderIndexJournalWriter journalWriter;

-    private record InsertTask(int urlId, int domainId, DocumentMetadata metadata, DocumentKeywords wordSet) {}
+    private record InsertTask(int urlId,
+                              int domainId,
+                              int features,
+                              DocumentMetadata metadata,
+                              DocumentKeywords wordSet) {}

     private final Thread runThread;

@@ -36,7 +40,10 @@ public class IndexLoadKeywords implements Runnable {
         while (!canceled) {
             var data = insertQueue.poll(1, TimeUnit.SECONDS);

             if (data != null) {
-                journalWriter.putWords(new EdgeId<>(data.domainId), new EdgeId<>(data.urlId), data.metadata(), data.wordSet);
+                journalWriter.putWords(new EdgeId<>(data.domainId), new EdgeId<>(data.urlId),
+                        data.features,
+                        data.metadata(),
+                        data.wordSet);
             }
         }
     }

@@ -49,7 +56,11 @@ public class IndexLoadKeywords implements Runnable {
         }
     }

-    public void load(LoaderData loaderData, EdgeUrl url, DocumentMetadata metadata, DocumentKeywords words) throws InterruptedException {
+    public void load(LoaderData loaderData,
+                     EdgeUrl url,
+                     int features,
+                     DocumentMetadata metadata,
+                     DocumentKeywords words) throws InterruptedException {
         int domainId = loaderData.getDomainId(url.domain);
         int urlId = loaderData.getUrlId(url);

@@ -58,6 +69,6 @@ public class IndexLoadKeywords implements Runnable {
             return;
         }

-        insertQueue.put(new InsertTask(urlId, domainId, metadata, words));
+        insertQueue.put(new InsertTask(urlId, domainId, features, metadata, words));
     }
 }

Loader.java

@@ -103,9 +103,9 @@ public class Loader implements Interpreter, AutoCloseable {
     }

     @Override
-    public void loadKeywords(EdgeUrl url, DocumentMetadata metadata, DocumentKeywords words) {
+    public void loadKeywords(EdgeUrl url, int features, DocumentMetadata metadata, DocumentKeywords words) {
         try {
-            indexLoadKeywords.load(data, url, metadata, words);
+            indexLoadKeywords.load(data, url, features, metadata, words);
         } catch (InterruptedException e) {
             throw new RuntimeException(e);
         }

LoaderIndexJournalWriter.java

@@ -60,6 +60,7 @@ public class LoaderIndexJournalWriter {

     @SneakyThrows
     public void putWords(EdgeId<EdgeDomain> domain, EdgeId<EdgeUrl> url,
+                         int features,
                          DocumentMetadata metadata,
                          DocumentKeywords wordSet) {
         if (wordSet.keywords().length == 0) {
@@ -76,10 +77,10 @@ public class LoaderIndexJournalWriter {
         // with a chonky work queue is a fairly decent improvement
         for (var chunk : KeywordListChunker.chopList(wordSet, IndexJournalEntryData.MAX_LENGTH)) {
             try {
-                keywordInsertionExecutor.submit(() -> loadWords(domain, url, metadata, chunk));
+                keywordInsertionExecutor.submit(() -> loadWords(domain, url, features, metadata, chunk));
             }
             catch (RejectedExecutionException ex) {
-                loadWords(domain, url, metadata, chunk);
+                loadWords(domain, url, features, metadata, chunk);
             }
         }

@@ -87,6 +88,7 @@ public class LoaderIndexJournalWriter {
     private void loadWords(EdgeId<EdgeDomain> domain,
                            EdgeId<EdgeUrl> url,
+                           int features,
                            DocumentMetadata metadata,
                            DocumentKeywords wordSet) {
         if (null == metadata) {
@@ -95,7 +97,7 @@ public class LoaderIndexJournalWriter {
         }

         var entry = new IndexJournalEntryData(getOrInsertWordIds(wordSet.keywords(), wordSet.metadata()));
-        var header = new IndexJournalEntryHeader(domain, url, metadata.encode());
+        var header = new IndexJournalEntryHeader(domain, features, url, metadata.encode());

         indexWriter.put(header, entry);
     }

SearchIndex.java

@@ -196,6 +196,9 @@ public class SearchIndex {
     public long getDocumentMetadata(long docId) {
         return indexReader.getDocumentMetadata(docId);
     }

+    public int getHtmlFeatures(long docId) {
+        return indexReader.getHtmlFeatures(docId);
+    }
+
     public int getDomainId(long docId) {
         return indexReader.getDomainId(docId);

SearchIndexReader.java

@@ -67,4 +67,8 @@ public class SearchIndexReader {
     public int totalDocCount() {
         return forwardIndexReader.totalDocCount();
     }

+    public int getHtmlFeatures(long docId) {
+        return forwardIndexReader.getHtmlFeatures(docId);
+    }
 }

IndexMetadataService.java

@@ -34,6 +34,10 @@ public class IndexMetadataService {
         return index.getDocumentMetadata(urlId);
     }

+    public int getHtmlFeatures(long urlId) {
+        return index.getHtmlFeatures(urlId);
+    }
+
     public int getDomainId(long urlId) {
         return index.getDomainId(urlId);
     }

IndexResultValuator.java

@@ -59,6 +59,7 @@ public class IndexResultValuator {
     searchResult.setDomainId(metadataService.getDomainId(urlIdInt));

     long docMetadata = metadataService.getDocumentMetadata(urlIdInt);
+    int htmlFeatures = metadataService.getHtmlFeatures(urlIdInt);

     int maxFlagsCount = 0;
     boolean anyAllSynthetic = false;
@@ -85,6 +86,7 @@ public class IndexResultValuator {
             searchTerm,
             metadata,
             docMetadata,
+            htmlFeatures,
             resultsWithPriorityTerms.contains(searchResult.combinedId)
     );

IndexQueryServiceIntegrationTest.java

@@ -177,7 +177,7 @@ public class IndexQueryServiceIntegrationTest {
     long fullId = id | ((long) (32 - (id % 32)) << 32);

-    var header = new IndexJournalEntryHeader(factors.length, fullId, new DocumentMetadata(0, 0, 0, 0, id % 5, id, id % 20, (byte) 0).encode());
+    var header = new IndexJournalEntryHeader(factors.length, 0, fullId, new DocumentMetadata(0, 0, 0, 0, id % 5, id, id % 20, (byte) 0).encode());

     long[] data = new long[factors.length*2];

     for (int i = 0; i < factors.length; i++) {
@@ -190,7 +190,7 @@ public class IndexQueryServiceIntegrationTest {
     public void loadDataWithDomain(int domain, int id) {
         int[] factors = IntStream.rangeClosed(1, id).filter(v -> (id % v) == 0).toArray();

-        var header = new IndexJournalEntryHeader(factors.length, id | ((long) domain << 32), DocumentMetadata.defaultValue());
+        var header = new IndexJournalEntryHeader(factors.length, 0, id | ((long) domain << 32), DocumentMetadata.defaultValue());

         long[] data = new long[factors.length*2];

         for (int i = 0; i < factors.length; i++) {