(forward-index, valuator) HTML features in valuator
Put it in the forward index for easy access during index-side valuation.
parent fcfe07fb7d
commit 704de50a9b
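
In outline: the converter already detects per-document HTML features (tracking, adtech, affiliate links, URL shape and so on); this commit encodes them as an int bitmask, threads the mask through the loader instructions and the index journal, and stores it as a third word in each forward index entry, where the index-side result valuator can read it without leaving the index. A rough sketch of the shape of the data; the record below is illustrative only, the real types appear in the diff:

    // Illustrative: after this commit one forward index entry is three
    // longs wide, and the third word carries the HTML feature bitmask.
    record ForwardEntrySketch(long domainId, long encodedDocMetadata, long htmlFeatures) {}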
@@ -2,7 +2,6 @@ package nu.marginalia.index.client.model.results;

 import nu.marginalia.model.idx.WordFlags;
 import nu.marginalia.model.idx.WordMetadata;
 import nu.marginalia.model.idx.DocumentFlags;
 import nu.marginalia.model.idx.DocumentMetadata;

 import java.util.Objects;
@@ -14,15 +13,19 @@ public final class SearchResultKeywordScore {
     private final long encodedDocMetadata;
     private final boolean hasPriorityTerms;

+    private final int htmlFeatures;
+
     public SearchResultKeywordScore(int subquery,
                                     String keyword,
                                     long encodedWordMetadata,
                                     long encodedDocMetadata,
+                                    int htmlFeatures,
                                     boolean hasPriorityTerms) {
         this.subquery = subquery;
         this.keyword = keyword;
         this.encodedWordMetadata = encodedWordMetadata;
         this.encodedDocMetadata = encodedDocMetadata;
+        this.htmlFeatures = htmlFeatures;
         this.hasPriorityTerms = hasPriorityTerms;
     }

@@ -58,6 +61,10 @@ public final class SearchResultKeywordScore {
         return encodedDocMetadata;
     }

+    public int htmlFeatures() {
+        return htmlFeatures;
+    }
+
     public boolean hasPriorityTerms() {
         return hasPriorityTerms;
     }
@@ -88,8 +88,9 @@ public class ForwardIndexConverter {
             int ranking = domainRankings.getRanking(entry.domainId());
             long meta = DocumentMetadata.encodeRank(entry.docMeta(), ranking);

-            docFileData.set(entryOffset + ForwardIndexParameters.METADATA_OFFSET, meta);
             docFileData.set(entryOffset + ForwardIndexParameters.DOMAIN_OFFSET, entry.domainId());
+            docFileData.set(entryOffset + ForwardIndexParameters.METADATA_OFFSET, meta);
+            docFileData.set(entryOffset + ForwardIndexParameters.FEATURES_OFFSET, entry.header.documentFeatures());
         });

         progress.progress(TaskSteps.FORCE);

@@ -1,8 +1,9 @@
 package nu.marginalia.index.forward;

 class ForwardIndexParameters {
-    public static final int ENTRY_SIZE = 2;
+    public static final int ENTRY_SIZE = 3;
     public static final int DOMAIN_OFFSET = 0;
     public static final int METADATA_OFFSET = 1;
+    public static final int FEATURES_OFFSET = 2;

 }
@@ -77,6 +77,13 @@ public class ForwardIndexReader {
         return data.get(ENTRY_SIZE * offset + METADATA_OFFSET);
     }

+    public int getHtmlFeatures(long docId) {
+        long offset = idxForDoc(docId);
+        if (offset < 0) return 0;
+
+        return (int) data.get(ENTRY_SIZE * offset + FEATURES_OFFSET);
+    }
+
     public int getDomainId(long docId) {
         long offset = idxForDoc(docId);
         if (offset < 0) return 0;
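
The three offsets fix the entry layout: each document occupies ENTRY_SIZE consecutive longs, and a field lives at slot * ENTRY_SIZE + offset. A self-contained toy version of the new lookup, with a plain long[] standing in for the reader's backing array:

    class ForwardEntryLayoutDemo {
        static final int ENTRY_SIZE = 3;
        static final int DOMAIN_OFFSET = 0;
        static final int METADATA_OFFSET = 1;
        static final int FEATURES_OFFSET = 2;

        // mirrors ForwardIndexReader.getHtmlFeatures: take the third word
        // of the document's three-long entry and narrow it to an int
        static int htmlFeatures(long[] data, int slot) {
            return (int) data[ENTRY_SIZE * slot + FEATURES_OFFSET];
        }

        public static void main(String[] args) {
            long[] data = { 7, 100, 3,   8, 200, 5 }; // two documents' entries
            System.out.println(htmlFeatures(data, 1)); // prints 5
        }
    }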
@@ -7,7 +7,7 @@ import nu.marginalia.model.id.EdgeId;

 public record IndexJournalEntry(IndexJournalEntryHeader header, IndexJournalEntryData data) {

     public static IndexJournalEntryBuilder builder(long documentId, long documentMeta) {
-        return new IndexJournalEntryBuilder(documentId, documentMeta);
+        return new IndexJournalEntryBuilder(0, documentId, documentMeta);
     }

     public static IndexJournalEntryBuilder builder(int domainId,
@@ -15,7 +15,9 @@ public record IndexJournalEntry(IndexJournalEntryHeader header, IndexJournalEntryData data) {
                                                    long documentMeta) {

-        return builder(new EdgeId<>(domainId), new EdgeId<>(urlId), documentMeta);
+        return builder(new EdgeId<>(domainId),
+                       new EdgeId<>(urlId),
+                       documentMeta);
     }

     public static IndexJournalEntryBuilder builder(EdgeId<EdgeDomain> domainId,
@@ -23,6 +25,8 @@ public record IndexJournalEntry(IndexJournalEntryHeader header, IndexJournalEntryData data) {
                                                    long documentMeta) {

-        return new IndexJournalEntryBuilder(IndexJournalEntryHeader.combineIds(domainId, urlId), documentMeta);
+        return new IndexJournalEntryBuilder(0,
+                                            IndexJournalEntryHeader.combineIds(domainId, urlId),
+                                            documentMeta);
     }
 }
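
Note that the feature-less builder() overloads now pass a literal 0 as the new leading argument. That zero is meaningful downstream: the ResultValuator change further below short-circuits flagsPenalty() on a zero mask, so entries written without feature data go unpenalized rather than being treated as verified-clean. A hypothetical call site (documentId and documentMeta are placeholders):

    // Entries built through the feature-less overload carry a zero mask,
    // which the valuator reads as "no feature data available".
    var builder = IndexJournalEntry.builder(documentId, documentMeta); // features = 0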
@@ -4,10 +4,15 @@ import gnu.trove.list.array.TLongArrayList;

 public class IndexJournalEntryBuilder {
     private final long documentId;
+    private final int documentFeatures;
     private final long documentMeta;
     private final TLongArrayList items = new TLongArrayList();

-    public IndexJournalEntryBuilder(long documentId, long documentMeta) {
+    public IndexJournalEntryBuilder(
+            int documentFeatures,
+            long documentId,
+            long documentMeta) {
+        this.documentFeatures = documentFeatures;
         this.documentId = documentId;
         this.documentMeta = documentMeta;
     }
@@ -22,7 +27,10 @@ public class IndexJournalEntryBuilder {

     public IndexJournalEntry build() {
         return new IndexJournalEntry(
-                new IndexJournalEntryHeader(items.size(), documentId, documentMeta),
+                new IndexJournalEntryHeader(items.size(),
+                                            documentFeatures,
+                                            documentId,
+                                            documentMeta),
                 new IndexJournalEntryData(items.toArray())
         );
     }
@@ -4,10 +4,19 @@ import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.id.EdgeId;

-public record IndexJournalEntryHeader(int entrySize, long combinedId, long documentMeta) {
+public record IndexJournalEntryHeader(int entrySize,
+                                      int documentFeatures,
+                                      long combinedId,
+                                      long documentMeta) {

-    public IndexJournalEntryHeader(EdgeId<EdgeDomain> domainId, EdgeId<EdgeUrl> urlId, long documentMeta) {
-        this(-1, combineIds(domainId, urlId), documentMeta);
+    public IndexJournalEntryHeader(EdgeId<EdgeDomain> domainId,
+                                   int documentFeatures,
+                                   EdgeId<EdgeUrl> urlId,
+                                   long documentMeta) {
+        this(-1,
+             documentFeatures,
+             combineIds(domainId, urlId),
+             documentMeta);
     }

     static long combineIds(EdgeId<EdgeDomain> domainId, EdgeId<EdgeUrl> urlId) {
@@ -30,6 +30,7 @@ public class IndexJournalReadEntry {

         var header = new IndexJournalEntryHeader(
                 (int) (sizeBlock >>> 32L),
+                (int) (sizeBlock & 0xFFFF_FFFFL),
                 docId,
                 meta);

@@ -72,7 +72,7 @@ public class IndexJournalWriterImpl implements IndexJournalWriter {
         }

         dataBuffer.putInt(entry.size());
-        dataBuffer.putInt(0);
+        dataBuffer.putInt(header.documentFeatures());
         dataBuffer.putLong(header.combinedId());
         dataBuffer.putLong(header.documentMeta());

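
Writer and reader stay in sync on the record layout here: the writer emits two consecutive 32-bit ints, entry size then feature mask, and IndexJournalReadEntry (above) consumes them as one 64-bit sizeBlock, splitting it with a shift and a mask. A self-contained round trip, relying on ByteBuffer's default big-endian order:

    import java.nio.ByteBuffer;

    public class SizeBlockDemo {
        public static void main(String[] args) {
            ByteBuffer buf = ByteBuffer.allocate(8); // big-endian by default

            buf.putInt(42);     // entry size, as in dataBuffer.putInt(entry.size())
            buf.putInt(0b1011); // feature mask, as in putInt(header.documentFeatures())
            buf.flip();

            long sizeBlock = buf.getLong();
            System.out.println((int) (sizeBlock >>> 32L));        // 42
            System.out.println((int) (sizeBlock & 0xFFFF_FFFFL)); // 11
        }
    }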
@@ -106,7 +106,7 @@ class ReverseIndexFullConverterTest2 {
     }
     public void createEntry(IndexJournalWriter writer, KeywordLexicon keywordLexicon, int id) {
         int[] factors = getFactorsI(id);
-        var header = new IndexJournalEntryHeader(factors.length, createId(id, id/20), id % 5);
+        var header = new IndexJournalEntryHeader(factors.length, 0, createId(id, id/20), id % 5);

         long[] data = new long[factors.length*2];
         for (int i = 0; i < factors.length; i++) {
@@ -106,7 +106,7 @@ class ReverseIndexPriorityConverterTest2 {
     }
     public void createEntry(IndexJournalWriter writer, KeywordLexicon keywordLexicon, int id) {
         int[] factors = getFactorsI(id);
-        var header = new IndexJournalEntryHeader(factors.length, createId(id, id/20), id % 5);
+        var header = new IndexJournalEntryHeader(factors.length, 0, createId(id, id/20), id % 5);

         long[] data = new long[factors.length*2];
         for (int i = 0; i < factors.length; i++) {
@@ -5,6 +5,7 @@ import nu.marginalia.index.client.model.results.ResultRankingParameters;
 import nu.marginalia.index.client.model.results.SearchResultKeywordScore;
 import nu.marginalia.model.crawl.HtmlFeature;
 import nu.marginalia.model.crawl.PubDate;
+import nu.marginalia.model.idx.DocumentFlags;
 import nu.marginalia.model.idx.DocumentMetadata;
 import nu.marginalia.ranking.factors.*;

@@ -48,19 +49,20 @@ public class ResultValuator {
         double bestScore = 10;

         long documentMetadata = documentMetadata(scores);

+        int features = htmlFeatures(scores);
         var rankingParams = ctx.params;

         int rank = DocumentMetadata.decodeRank(documentMetadata);
         int asl = DocumentMetadata.decodeAvgSentenceLength(documentMetadata);
         int quality = DocumentMetadata.decodeQuality(documentMetadata);
-        int urlTypePenalty = getUrlTypePenalty(documentMetadata);
         int size = DocumentMetadata.decodeSize(documentMetadata);
+        int flagsPenalty = flagsPenalty(features, documentMetadata & 0xFF, size, quality);
         int topology = DocumentMetadata.decodeTopology(documentMetadata);
         int year = DocumentMetadata.decodeYear(documentMetadata);

         double averageSentenceLengthPenalty = (asl >= rankingParams.shortSentenceThreshold ? 0 : -rankingParams.shortSentencePenalty);

-        final double qualityPenalty = -quality * rankingParams.qualityPenalty;
+        final double qualityPenalty = calculateQualityPenalty(size, quality, rankingParams);
         final double rankingBonus = (255. - rank) * rankingParams.domainRankBonus;
         final double topologyBonus = Math.log(1 + topology);
         final double documentLengthPenalty = length > rankingParams.shortDocumentThreshold ? 0 : -rankingParams.shortDocumentPenalty;
@@ -80,7 +82,7 @@ public class ResultValuator {
                 + rankingBonus
                 + topologyBonus
                 + temporalBias
-                + urlTypePenalty
+                + flagsPenalty
                 + priorityTermBonus.calculate(scores);

         for (int set = 0; set <= sets; set++) {
@@ -93,7 +95,8 @@ public class ResultValuator {
             final double bm25 = rankingParams.bm25FullWeight * bm25Factor.calculateBm25(rankingParams.fullParams, keywordSet, length, ctx);
             final double bm25p = rankingParams.bm25PrioWeight * bm25Factor.calculateBm25Prio(rankingParams.prioParams, keywordSet, ctx);

-            double score = normalize(bm25 + bm25p + tcf + overallPart, keywordSet.length());
+            double nonNormalizedScore = bm25 + bm25p + tcf + overallPart;
+            double score = normalize(nonNormalizedScore, keywordSet.length());

             bestScore = min(bestScore, score);

@@ -102,16 +105,55 @@ public class ResultValuator {
         return bestScore;
     }

-    private int getUrlTypePenalty(long documentMetadata) {
-
-        // Long urls-that-look-like-this tend to be poor search results
-        if (DocumentMetadata.hasFlags(documentMetadata,
-                HtmlFeature.LONG_URL.getFeatureBit()
-                | HtmlFeature.KEBAB_CASE_URL.getFeatureBit())) {
-            return 2;
-        }
-
-        return 0;
-    }
+    private double calculateQualityPenalty(int size, int quality, ResultRankingParameters rankingParams) {
+        if (size < 400) {
+            if (quality < 5)
+                return 0;
+            return -quality * rankingParams.qualityPenalty;
+        }
+        else {
+            return -quality * rankingParams.qualityPenalty * 20;
+        }
+    }
+
+    private int flagsPenalty(int featureFlags, long docFlags, int size, double quality) {
+
+        // Short-circuit for index-service, which does not have the feature flags
+        if (featureFlags == 0)
+            return 0;
+
+        double penalty = 0;
+
+        boolean isForum = DocumentFlags.GeneratorForum.isPresent(docFlags);
+
+        // Penalize large sites harder for any bullshit as it's a strong signal of a low quality site
+        double largeSiteFactor = 1.;
+
+        if (!isForum && size > 400) {
+            // Long urls-that-look-like-this tend to be poor search results
+            if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.KEBAB_CASE_URL.getFeatureBit()))
+                penalty += 30.0;
+            else if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.LONG_URL.getFeatureBit()))
+                penalty += 30.;
+            else penalty += 5.;
+
+            largeSiteFactor = 2;
+        }
+
+        if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.TRACKING_ADTECH.getFeatureBit()))
+            penalty += 5.0 * largeSiteFactor;
+
+        if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.AFFILIATE_LINK.getFeatureBit()))
+            penalty += 5.0 * largeSiteFactor;
+
+        if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.TRACKING.getFeatureBit()))
+            penalty += 2.5 * largeSiteFactor;
+
+        if (isForum) {
+            penalty = Math.min(0, penalty - 2);
+        }
+
+        return (int) -penalty;
+    }

     private long documentMetadata(List<SearchResultKeywordScore> rawScores) {
@@ -121,6 +163,13 @@ public class ResultValuator {
         return 0;
     }

+    private int htmlFeatures(List<SearchResultKeywordScore> rawScores) {
+        for (var score : rawScores) {
+            return score.htmlFeatures();
+        }
+        return 0;
+    }
+
     private ResultKeywordSet createKeywordSet(ValuatorListPool<SearchResultKeywordScore> listPool,
                                               List<SearchResultKeywordScore> rawScores,
                                               int thisSet)
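
To make the replacement concrete: the old getUrlTypePenalty returned a flat 2 for suspicious URL shapes, while flagsPenalty accumulates weighted penalties from the feature mask and scales the ad/tracking ones by site size. A large (size > 400) non-forum page flagged KEBAB_CASE_URL and TRACKING accrues 30 for the URL shape plus 2.5 × 2 for tracking, and the method returns -35; a forum page with a nonzero but otherwise clean mask ends at min(0, 0 - 2) = -2, i.e. a +2 boost. The first case, restated:

    static int largeKebabTrackingExample() {
        double penalty = 30.0;            // KEBAB_CASE_URL branch (size > 400, not a forum)
        double largeSiteFactor = 2;       // set in that same branch
        penalty += 2.5 * largeSiteFactor; // TRACKING
        return (int) -penalty;            // -35, added straight into the score
    }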
@@ -40,20 +40,20 @@ class ResultValuatorTest {
                 new SearchResultKeywordScore(0, "bob",
                         wordMetadata(Set.of(1), EnumSet.of(WordFlags.Title)),
                         docMetadata(0, 2010, 5, EnumSet.noneOf(DocumentFlags.class)),
-                        false)
+                        0, false)
         );
         List<SearchResultKeywordScore> highCountNoTitleSet = List.of(
                 new SearchResultKeywordScore(0, "bob",
                         wordMetadata(Set.of(1,3,4,6,7,9,10,11,12,14,15,16), EnumSet.of(WordFlags.TfIdfHigh)),
                         docMetadata(0, 2010, 5, EnumSet.noneOf(DocumentFlags.class)),
-                        false)
+                        0, false)
         );

         List<SearchResultKeywordScore> highCountSubjectSet = List.of(
                 new SearchResultKeywordScore(0, "bob",
                         wordMetadata(Set.of(1,3,4,6,7,9,10,11,12,14,15,16), EnumSet.of(WordFlags.TfIdfHigh, WordFlags.Subjects)),
                         docMetadata(0, 2010, 5, EnumSet.noneOf(DocumentFlags.class)),
-                        false)
+                        0, false)
         );

@@ -89,7 +89,7 @@ class TermCoherenceFactorTest {

         for (int i = 0; i < positionMasks.length; i++) {
             keywords.add(new SearchResultKeywordScore(0, "",
-                    new WordMetadata(positionMasks[i], (byte) 0).encode(), 0, false));
+                    new WordMetadata(positionMasks[i], (byte) 0).encode(), 0, 0, false));
         }

         return new ResultKeywordSet(keywords);
@@ -19,7 +19,7 @@ public interface Interpreter {
     default void loadProcessedDocument(LoadProcessedDocument loadProcessedDocument) {}
     default void loadProcessedDocumentWithError(LoadProcessedDocumentWithError loadProcessedDocumentWithError) {}

-    default void loadKeywords(EdgeUrl url, DocumentMetadata metadata, DocumentKeywords words) {}
+    default void loadKeywords(EdgeUrl url, int features, DocumentMetadata metadata, DocumentKeywords words) {}

     default void loadDomainRedirect(DomainLink link) {}

@@ -7,11 +7,11 @@ import nu.marginalia.converting.instruction.InstructionTag;
 import nu.marginalia.converting.instruction.Interpreter;
 import nu.marginalia.model.EdgeUrl;

-public record LoadKeywords(EdgeUrl url, DocumentMetadata metadata, DocumentKeywords words) implements Instruction {
+public record LoadKeywords(EdgeUrl url, int features, DocumentMetadata metadata, DocumentKeywords words) implements Instruction {

     @Override
     public void apply(Interpreter interpreter) {
-        interpreter.loadKeywords(url, metadata, words);
+        interpreter.loadKeywords(url, features, metadata, words);
     }

     @Override
@@ -130,7 +130,7 @@ public class InstructionWriterFactory {
     }

     @Override
-    public void loadKeywords(EdgeUrl url, DocumentMetadata metadata, DocumentKeywords words) {
+    public void loadKeywords(EdgeUrl url, int features, DocumentMetadata metadata, DocumentKeywords words) {
         keywords++;
     }

@@ -31,11 +31,16 @@ public class DocumentsCompiler {
         }
     }

-    public void compileWords(Consumer<Instruction> instructionConsumer, ProcessedDocument doc) {
+    public void compileWords(Consumer<Instruction> instructionConsumer,
+                             ProcessedDocument doc) {
         var words = doc.words;

         if (words != null) {
-            instructionConsumer.accept(new LoadKeywords(doc.url, doc.details.metadata, words.build()));
+            instructionConsumer.accept(new LoadKeywords(doc.url,
+                    HtmlFeature.encode(doc.details.features),
+                    doc.details.metadata,
+                    words.build())
+            );
         }
     }

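
HtmlFeature.encode(doc.details.features) is where the document's feature set collapses into the int mask that rides through LoadKeywords and the journal. The diff only shows that each feature exposes getFeatureBit(); presumably encode() ORs those bits together, roughly like this sketch (an assumption, not the actual implementation):

    // Assumed shape of HtmlFeature.encode: fold a feature set into one int.
    static int encode(java.util.Set<HtmlFeature> features) {
        int mask = 0;
        for (var feature : features) {
            mask |= feature.getFeatureBit();
        }
        return mask;
    }
    // The valuator later tests single bits, e.g.
    // DocumentMetadata.hasFlags(mask, HtmlFeature.TRACKING.getFeatureBit()).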
@@ -17,7 +17,7 @@ import nu.marginalia.model.crawl.UrlIndexingState;
 import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Test;

-import java.io.IOException;
+import java.io.*;
 import java.nio.file.Path;
 import java.time.LocalTime;
 import java.util.*;
@@ -143,4 +143,5 @@ public class ConvertingIntegrationTest {

         return SerializableCrawlDataStream.fromIterator(data.iterator());
     }
+
 }
@@ -18,7 +18,11 @@ public class IndexLoadKeywords implements Runnable {
     private final LinkedBlockingQueue<InsertTask> insertQueue = new LinkedBlockingQueue<>(32);
     private final LoaderIndexJournalWriter journalWriter;

-    private record InsertTask(int urlId, int domainId, DocumentMetadata metadata, DocumentKeywords wordSet) {}
+    private record InsertTask(int urlId,
+                              int domainId,
+                              int features,
+                              DocumentMetadata metadata,
+                              DocumentKeywords wordSet) {}

     private final Thread runThread;

@@ -36,7 +40,10 @@ public class IndexLoadKeywords implements Runnable {
         while (!canceled) {
             var data = insertQueue.poll(1, TimeUnit.SECONDS);
             if (data != null) {
-                journalWriter.putWords(new EdgeId<>(data.domainId), new EdgeId<>(data.urlId), data.metadata(), data.wordSet);
+                journalWriter.putWords(new EdgeId<>(data.domainId), new EdgeId<>(data.urlId),
+                        data.features,
+                        data.metadata(),
+                        data.wordSet);
             }
         }
     }
@@ -49,7 +56,11 @@ public class IndexLoadKeywords implements Runnable {
         }
     }

-    public void load(LoaderData loaderData, EdgeUrl url, DocumentMetadata metadata, DocumentKeywords words) throws InterruptedException {
+    public void load(LoaderData loaderData,
+                     EdgeUrl url,
+                     int features,
+                     DocumentMetadata metadata,
+                     DocumentKeywords words) throws InterruptedException {
         int domainId = loaderData.getDomainId(url.domain);
         int urlId = loaderData.getUrlId(url);

@@ -58,6 +69,6 @@ public class IndexLoadKeywords implements Runnable {
             return;
         }

-        insertQueue.put(new InsertTask(urlId, domainId, metadata, words));
+        insertQueue.put(new InsertTask(urlId, domainId, features, metadata, words));
     }
 }
@@ -103,9 +103,9 @@ public class Loader implements Interpreter, AutoCloseable {
     }

     @Override
-    public void loadKeywords(EdgeUrl url, DocumentMetadata metadata, DocumentKeywords words) {
+    public void loadKeywords(EdgeUrl url, int features, DocumentMetadata metadata, DocumentKeywords words) {
         try {
-            indexLoadKeywords.load(data, url, metadata, words);
+            indexLoadKeywords.load(data, url, features, metadata, words);
         } catch (InterruptedException e) {
             throw new RuntimeException(e);
         }
@@ -60,6 +60,7 @@ public class LoaderIndexJournalWriter {

     @SneakyThrows
     public void putWords(EdgeId<EdgeDomain> domain, EdgeId<EdgeUrl> url,
+                         int features,
                          DocumentMetadata metadata,
                          DocumentKeywords wordSet) {
         if (wordSet.keywords().length == 0) {
@@ -76,10 +77,10 @@ public class LoaderIndexJournalWriter {
         // with a chonky work queue is a fairly decent improvement
         for (var chunk : KeywordListChunker.chopList(wordSet, IndexJournalEntryData.MAX_LENGTH)) {
             try {
-                keywordInsertionExecutor.submit(() -> loadWords(domain, url, metadata, chunk));
+                keywordInsertionExecutor.submit(() -> loadWords(domain, url, features, metadata, chunk));
             }
             catch (RejectedExecutionException ex) {
-                loadWords(domain, url, metadata, chunk);
+                loadWords(domain, url, features, metadata, chunk);
             }
         }

@@ -87,6 +88,7 @@ public class LoaderIndexJournalWriter {

     private void loadWords(EdgeId<EdgeDomain> domain,
                            EdgeId<EdgeUrl> url,
+                           int features,
                            DocumentMetadata metadata,
                            DocumentKeywords wordSet) {
         if (null == metadata) {
@@ -95,7 +97,7 @@ public class LoaderIndexJournalWriter {
         }

         var entry = new IndexJournalEntryData(getOrInsertWordIds(wordSet.keywords(), wordSet.metadata()));
-        var header = new IndexJournalEntryHeader(domain, url, metadata.encode());
+        var header = new IndexJournalEntryHeader(domain, features, url, metadata.encode());

         indexWriter.put(header, entry);
     }
@@ -196,6 +196,9 @@ public class SearchIndex {
     public long getDocumentMetadata(long docId) {
         return indexReader.getDocumentMetadata(docId);
     }
+    public int getHtmlFeatures(long docId) {
+        return indexReader.getHtmlFeatures(docId);
+    }

     public int getDomainId(long docId) {
         return indexReader.getDomainId(docId);
@@ -67,4 +67,8 @@ public class SearchIndexReader {
     public int totalDocCount() {
         return forwardIndexReader.totalDocCount();
     }
+
+    public int getHtmlFeatures(long docId) {
+        return forwardIndexReader.getHtmlFeatures(docId);
+    }
 }
@@ -34,6 +34,10 @@ public class IndexMetadataService {
         return index.getDocumentMetadata(urlId);
     }

+    public int getHtmlFeatures(long urlId) {
+        return index.getHtmlFeatures(urlId);
+    }
+
     public int getDomainId(long urlId) {
         return index.getDomainId(urlId);
     }
@@ -59,6 +59,7 @@ public class IndexResultValuator {
         searchResult.setDomainId(metadataService.getDomainId(urlIdInt));

         long docMetadata = metadataService.getDocumentMetadata(urlIdInt);
+        int htmlFeatures = metadataService.getHtmlFeatures(urlIdInt);

         int maxFlagsCount = 0;
         boolean anyAllSynthetic = false;
@@ -85,6 +86,7 @@ public class IndexResultValuator {
                 searchTerm,
                 metadata,
                 docMetadata,
+                htmlFeatures,
                 resultsWithPriorityTerms.contains(searchResult.combinedId)
         );

@@ -177,7 +177,7 @@ public class IndexQueryServiceIntegrationTest {

         long fullId = id | ((long) (32 - (id % 32)) << 32);

-        var header = new IndexJournalEntryHeader(factors.length, fullId, new DocumentMetadata(0, 0, 0, 0, id % 5, id, id % 20, (byte) 0).encode());
+        var header = new IndexJournalEntryHeader(factors.length, 0, fullId, new DocumentMetadata(0, 0, 0, 0, id % 5, id, id % 20, (byte) 0).encode());

         long[] data = new long[factors.length*2];
         for (int i = 0; i < factors.length; i++) {
@@ -190,7 +190,7 @@ public class IndexQueryServiceIntegrationTest {

     public void loadDataWithDomain(int domain, int id) {
         int[] factors = IntStream.rangeClosed(1, id).filter(v -> (id % v) == 0).toArray();
-        var header = new IndexJournalEntryHeader(factors.length, id | ((long) domain << 32), DocumentMetadata.defaultValue());
+        var header = new IndexJournalEntryHeader(factors.length, 0, id | ((long) domain << 32), DocumentMetadata.defaultValue());

         long[] data = new long[factors.length*2];
         for (int i = 0; i < factors.length; i++) {