A lot of readmes, some refactoring.

Viktor Lofgren 2023-03-06 18:32:13 +01:00
parent f19c9a2863
commit b945fd7f39
118 changed files with 889 additions and 366 deletions


@ -0,0 +1,8 @@
# Assistant API
Client and models for talking to the [assistant-service](../../services-core/assistant-service),
implemented with the base client from [service-client](../../common/service-client).
## Central Classes
* [AssistantClient](src/main/java/nu/marginalia/assistant/client/AssistantClient.java)

api/index-api/readme.md (new file)

@ -0,0 +1,8 @@
# Index API
Client and models for talking to the [index-service](../../services-core/index-service),
implemented with the base client from [service-client](../../common/service-client).
## Central Classes
* [IndexClient](src/main/java/nu/marginalia/index/client/IndexClient.java)


@ -18,12 +18,12 @@ import javax.annotation.CheckReturnValue;
import java.util.List;
@Singleton
-public class EdgeIndexClient extends AbstractDynamicClient {
+public class IndexClient extends AbstractDynamicClient {
private static final Summary wmsa_search_index_api_time = Summary.build().name("wmsa_search_index_api_time").help("-").register();
@Inject
-public EdgeIndexClient(ServiceDescriptors descriptors) {
+public IndexClient(ServiceDescriptors descriptors) {
super(descriptors.forId(ServiceId.Index), WmsaHome.getHostsFile(), GsonFactory::get);
setTimeout(30);

@ -1,9 +1,9 @@
package nu.marginalia.index.client.model.results;
import nu.marginalia.model.crawl.EdgePageWordFlags;
-import nu.marginalia.model.idx.EdgePageWordMetadata;
+import nu.marginalia.model.idx.WordMetadata;
import nu.marginalia.model.crawl.EdgePageDocumentFlags;
-import nu.marginalia.model.idx.EdgePageDocumentsMetadata;
+import nu.marginalia.model.idx.DocumentMetadata;
import static java.lang.Integer.lowestOneBit;
import static java.lang.Integer.numberOfTrailingZeros;
@ -16,15 +16,15 @@ public record EdgeSearchResultKeywordScore(int set,
public double documentValue() {
long sum = 0;
-sum += EdgePageDocumentsMetadata.decodeQuality(encodedDocMetadata) / 5.;
+sum += DocumentMetadata.decodeQuality(encodedDocMetadata) / 5.;
-sum += EdgePageDocumentsMetadata.decodeTopology(encodedDocMetadata);
+sum += DocumentMetadata.decodeTopology(encodedDocMetadata);
-if (EdgePageDocumentsMetadata.hasFlags(encodedDocMetadata, EdgePageDocumentFlags.Simple.asBit())) {
+if (DocumentMetadata.hasFlags(encodedDocMetadata, EdgePageDocumentFlags.Simple.asBit())) {
sum += 20;
}
-int rank = EdgePageDocumentsMetadata.decodeRank(encodedDocMetadata) - 13;
+int rank = DocumentMetadata.decodeRank(encodedDocMetadata) - 13;
if (rank < 0)
sum += rank / 2;
else
@ -34,7 +34,7 @@ public record EdgeSearchResultKeywordScore(int set,
}
private boolean hasTermFlag(EdgePageWordFlags flag) {
-return EdgePageWordMetadata.hasFlags(encodedWordMetadata, flag.asBit());
+return WordMetadata.hasFlags(encodedWordMetadata, flag.asBit());
}
public double termValue() {
@ -58,7 +58,7 @@ public record EdgeSearchResultKeywordScore(int set,
sum -= 1;
}
-sum -= EdgePageWordMetadata.decodeTfidf(encodedWordMetadata) / 50.;
+sum -= WordMetadata.decodeTfidf(encodedWordMetadata) / 50.;
sum += firstPos() / 5.;
sum -= Integer.bitCount(positions()) / 3.;
@ -66,9 +66,9 @@ public record EdgeSearchResultKeywordScore(int set,
}
public int firstPos() {
-return numberOfTrailingZeros(lowestOneBit(EdgePageWordMetadata.decodePositions(encodedWordMetadata)));
+return numberOfTrailingZeros(lowestOneBit(WordMetadata.decodePositions(encodedWordMetadata)));
}
-public int positions() { return EdgePageWordMetadata.decodePositions(encodedWordMetadata); }
+public int positions() { return WordMetadata.decodePositions(encodedWordMetadata); }
public boolean isSpecial() { return keyword.contains(":") || hasTermFlag(EdgePageWordFlags.Synthetic); }
public boolean isRegular() {
return !keyword.contains(":")

@ -1,7 +1,8 @@
# Core Service Clients
These are clients for the [core services](../services-core/), along with what models
-are necessary for speaking to them.
+are necessary for speaking to them. They each implement the abstract client classes from
+[service-client](../common/service-client).
All that is necessary is to `@Inject` them into the constructor and then
requests can be sent.
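As an illustration of the injection pattern described in this readme (a sketch only; `SearchOperator` is a hypothetical consumer class and not part of this commit):

```java
import com.google.inject.Inject;
import nu.marginalia.index.client.IndexClient;

public class SearchOperator {
    private final IndexClient indexClient;

    // Guice provides the configured client; no manual construction is needed.
    @Inject
    public SearchOperator(IndexClient indexClient) {
        this.indexClient = indexClient;
    }
}
```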

api/search-api/readme.md (new file)

@ -0,0 +1,8 @@
# Search API
Client and models for talking to the [search-service](../../services-core/search-service),
implemented with the base client from [service-client](../../common/service-client).
## Central Classes
* [SearchClient](src/main/java/nu/marginalia/search/client/SearchClient.java)

@ -18,11 +18,11 @@ import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;
@Singleton
-public class EdgeSearchClient extends AbstractDynamicClient {
+public class SearchClient extends AbstractDynamicClient {
private final Logger logger = LoggerFactory.getLogger(getClass());
@Inject
-public EdgeSearchClient(ServiceDescriptors descriptors) {
+public SearchClient(ServiceDescriptors descriptors) {
super(descriptors.forId(ServiceId.Search), WmsaHome.getHostsFile(), GsonFactory::get);
}

common/config/readme.md (new file)

@ -0,0 +1,3 @@
# Config
This package contains configuration injectables used by the services.

@ -13,7 +13,7 @@ java {
dependencies {
implementation project(':common:service-discovery')
implementation project(':common:service-client')
-implementation project(':libraries:misc')
+implementation project(':libraries:big-string')
implementation libs.lombok
annotationProcessor libs.lombok

common/model/readme.md (new file)

@ -0,0 +1,11 @@
# Model
This package contains common models to the search engine
## Central Classes
* [EdgeDomain](src/main/java/nu/marginalia/model/EdgeDomain.java)
* [EdgeUrl](src/main/java/nu/marginalia/model/EdgeUrl.java)
* [EdgeId](src/main/java/nu/marginalia/model/id/EdgeId.java)
* [DocumentMetadata](src/main/java/nu/marginalia/model/idx/DocumentMetadata.java)
* [WordMetadata](src/main/java/nu/marginalia/model/idx/WordMetadata.java)


@ -1,10 +0,0 @@
package nu.marginalia.model.crawl;
import lombok.*;
import nu.marginalia.model.EdgeDomain;
@AllArgsConstructor @EqualsAndHashCode @Getter @Setter @Builder @ToString
public class EdgeDomainLink {
public final EdgeDomain source;
public final EdgeDomain destination;
}

@ -9,13 +9,13 @@ import java.util.Set;
import static java.lang.Math.max;
import static java.lang.Math.min;
-public record EdgePageDocumentsMetadata(int rank,
+public record DocumentMetadata(int rank,
int encSize,
int topology,
int year,
int sets,
int quality,
byte flags) {
public static final long RANK_MASK = 0xFFL;
@ -41,21 +41,21 @@ public record EdgePageDocumentsMetadata(int rank,
public static long defaultValue() {
return 0L;
}
-public EdgePageDocumentsMetadata() {
+public DocumentMetadata() {
this(defaultValue());
}
-public EdgePageDocumentsMetadata(int topology, int year, int sets, int quality, EnumSet<EdgePageDocumentFlags> flags) {
+public DocumentMetadata(int topology, int year, int sets, int quality, EnumSet<EdgePageDocumentFlags> flags) {
this(0, 0, topology, year, sets, quality, encodeFlags(flags));
}
-public EdgePageDocumentsMetadata withSize(int size) {
+public DocumentMetadata withSize(int size) {
if (size <= 0) {
return this;
}
final int encSize = (int) Math.min(ENCSIZE_MASK, Math.max(1, size / ENCSIZE_MULTIPLIER));
-return new EdgePageDocumentsMetadata(rank, encSize, topology, year, sets, quality, flags);
+return new DocumentMetadata(rank, encSize, topology, year, sets, quality, flags);
}
private static byte encodeFlags(Set<EdgePageDocumentFlags> flags) {
@ -68,7 +68,7 @@ public record EdgePageDocumentsMetadata(int rank,
return (flags & flag.asBit()) != 0;
}
-public EdgePageDocumentsMetadata(long value) {
+public DocumentMetadata(long value) {
this( (int) ((value >>> RANK_SHIFT) & RANK_MASK),
(int) ((value >>> ENCSIZE_SHIFT) & ENCSIZE_MASK),
(int) ((value >>> TOPOLOGY_SHIFT) & TOPOLOGY_MASK),
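For context on the record above, the long-valued constructor and the static decode helpers that appear elsewhere in this diff can be used roughly as follows (a sketch; `docMetaLong` is a hypothetical packed value read from the index):

```java
import nu.marginalia.model.idx.DocumentMetadata;

class DocumentMetadataUsageSketch {
    static void inspect(long docMetaLong) {
        // Unpack all fields from the packed 64-bit representation
        DocumentMetadata meta = new DocumentMetadata(docMetaLong);

        // Or decode individual fields without materializing the record
        int quality = DocumentMetadata.decodeQuality(docMetaLong);
        int rank = DocumentMetadata.decodeRank(docMetaLong);

        System.out.println(meta + " quality=" + quality + " rank=" + rank);
    }
}
```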

@ -9,10 +9,10 @@ import java.util.Set;
import static java.lang.Math.max;
import static java.lang.Math.min;
-public record EdgePageWordMetadata(int tfIdf,
+public record WordMetadata(int tfIdf,
int positions,
int count,
byte flags) {
public static final long COUNT_MASK = 0xFL;
public static final int COUNT_SHIFT = 8;
@ -22,11 +22,11 @@ public record EdgePageWordMetadata(int tfIdf,
public static final int POSITIONS_SHIFT = 32;
-public EdgePageWordMetadata() {
+public WordMetadata() {
this(emptyValue());
}
-public EdgePageWordMetadata(long value) {
+public WordMetadata(long value) {
this(
(int)((value >>> TF_IDF_SHIFT) & TF_IDF_MASK),
(int)(value >>> POSITIONS_SHIFT),
@ -35,10 +35,10 @@ public record EdgePageWordMetadata(int tfIdf,
);
}
-public EdgePageWordMetadata(int tfIdf,
+public WordMetadata(int tfIdf,
int positions,
int count,
Set<EdgePageWordFlags> flags)
{
this(tfIdf, positions, count, encodeFlags(flags));
}


@ -1,64 +0,0 @@
package nu.marginalia.model;
import nu.marginalia.model.crawl.EdgePageWordFlags;
import nu.marginalia.model.idx.EdgePageWordMetadata;
import org.junit.jupiter.api.Test;
import java.util.EnumSet;
import static org.junit.jupiter.api.Assertions.assertEquals;
class EdgePageWordMetadataTest {
@Test
public void codecTest() {
verifyCodec("Vanilla case", new EdgePageWordMetadata(32, 0x7f0f0000, 1, EnumSet.allOf(EdgePageWordFlags.class)));
verifyCodec("Position high", new EdgePageWordMetadata(32, 0xff0f0000, 1, EnumSet.allOf(EdgePageWordFlags.class)));
verifyCodec("No flags", new EdgePageWordMetadata(32, 0xff0f0000, 1, EnumSet.noneOf(EdgePageWordFlags.class)));
System.out.println(new EdgePageWordMetadata(32, 0x7f0f0005, 1, EnumSet.allOf(EdgePageWordFlags.class)));
System.out.println(new EdgePageWordMetadata(32, 0xff0f0013, 1, EnumSet.noneOf(EdgePageWordFlags.class)));
}
@Test
public void testClampTfIdfLow() {
var original = new EdgePageWordMetadata(0x8000FFFF, 0, 1, EnumSet.noneOf(EdgePageWordFlags.class));
var encoded = new EdgePageWordMetadata(original.encode());
assertEquals(original.positions(), encoded.positions());
assertEquals(0, encoded.tfIdf());
}
@Test
public void testClampTfIdfHigh() {
var original = new EdgePageWordMetadata(0x7000FFFF, 0, 1, EnumSet.noneOf(EdgePageWordFlags.class));
var encoded = new EdgePageWordMetadata(original.encode());
assertEquals(original.positions(), encoded.positions());
assertEquals(65535, encoded.tfIdf());
}
@Test
public void testClampCountLow() {
var original = new EdgePageWordMetadata(40, 0, -1, EnumSet.noneOf(EdgePageWordFlags.class));
var encoded = new EdgePageWordMetadata(original.encode());
assertEquals(original.positions(), encoded.positions());
assertEquals(0, encoded.count());
}
@Test
public void testClampCountHigh() {
var original = new EdgePageWordMetadata(40, 0, 17, EnumSet.noneOf(EdgePageWordFlags.class));
var encoded = new EdgePageWordMetadata(original.encode());
assertEquals(original.positions(), encoded.positions());
assertEquals(15, encoded.count());
}
public void verifyCodec(String message, EdgePageWordMetadata data) {
assertEquals(data, new EdgePageWordMetadata(data.encode()), message);
}
}


@ -0,0 +1,64 @@
package nu.marginalia.model;
import nu.marginalia.model.crawl.EdgePageWordFlags;
import nu.marginalia.model.idx.WordMetadata;
import org.junit.jupiter.api.Test;
import java.util.EnumSet;
import static org.junit.jupiter.api.Assertions.assertEquals;
class WordMetadataTest {
@Test
public void codecTest() {
verifyCodec("Vanilla case", new WordMetadata(32, 0x7f0f0000, 1, EnumSet.allOf(EdgePageWordFlags.class)));
verifyCodec("Position high", new WordMetadata(32, 0xff0f0000, 1, EnumSet.allOf(EdgePageWordFlags.class)));
verifyCodec("No flags", new WordMetadata(32, 0xff0f0000, 1, EnumSet.noneOf(EdgePageWordFlags.class)));
System.out.println(new WordMetadata(32, 0x7f0f0005, 1, EnumSet.allOf(EdgePageWordFlags.class)));
System.out.println(new WordMetadata(32, 0xff0f0013, 1, EnumSet.noneOf(EdgePageWordFlags.class)));
}
@Test
public void testClampTfIdfLow() {
var original = new WordMetadata(0x8000FFFF, 0, 1, EnumSet.noneOf(EdgePageWordFlags.class));
var encoded = new WordMetadata(original.encode());
assertEquals(original.positions(), encoded.positions());
assertEquals(0, encoded.tfIdf());
}
@Test
public void testClampTfIdfHigh() {
var original = new WordMetadata(0x7000FFFF, 0, 1, EnumSet.noneOf(EdgePageWordFlags.class));
var encoded = new WordMetadata(original.encode());
assertEquals(original.positions(), encoded.positions());
assertEquals(65535, encoded.tfIdf());
}
@Test
public void testClampCountLow() {
var original = new WordMetadata(40, 0, -1, EnumSet.noneOf(EdgePageWordFlags.class));
var encoded = new WordMetadata(original.encode());
assertEquals(original.positions(), encoded.positions());
assertEquals(0, encoded.count());
}
@Test
public void testClampCountHigh() {
var original = new WordMetadata(40, 0, 17, EnumSet.noneOf(EdgePageWordFlags.class));
var encoded = new WordMetadata(original.encode());
assertEquals(original.positions(), encoded.positions());
assertEquals(15, encoded.count());
}
public void verifyCodec(String message, WordMetadata data) {
assertEquals(data, new WordMetadata(data.encode()), message);
}
}

@ -4,6 +4,7 @@ These are packages containing the basic building blocks for running a service as
as shared models.
* [config](config/) contains some `@Inject`ables.
+* [renderer](renderer/) contains utility code for rendering website templates.
* [service](service/) is the shared base classes for main methods and web services.
* [service-client](service-client/) is the shared base class for RPC.
* [service-discovery](service-discovery) contains tools that lets the services find each other.


@ -0,0 +1,7 @@
# Renderer
Renders handlebar-style templates for the user-facing services.
## Central Classes
* [Mustache Renderer](src/main/java/nu/marginalia/renderer/MustacheRenderer.java)


@ -0,0 +1,10 @@
# Service Client
These are base classes for all the [API](../../api) clients for talking to other [services](../service).
## Central Classes
* [AbstractDynamicClient](src/main/java/nu/marginalia/client/AbstractDynamicClient.java) base class for API clients
* [AbstractClient](src/main/java/nu/marginalia/client/AbstractClient.java) handles requests at a lower level
* [Context](src/main/java/nu/marginalia/client/Context.java) handles request tracking
* [ContextScrambler](src/main/java/nu/marginalia/client/ContextScrambler.java) handles anonymization of public IPs


@ -0,0 +1,3 @@
# Service Discovery
Contains classes for helping services discover each other.

common/service/readme.md (new file)

@ -0,0 +1,9 @@
# Service
Contains the base classes for the services. This is where port configuration,
and common endpoints are set up.
## Central Classes
* [MainClass](src/main/java/nu/marginalia/service/MainClass.java) bootstraps all executables
* [Service](src/main/java/nu/marginalia/service/server/Service.java) base class for all services.

@ -15,7 +15,7 @@ dependencies {
implementation project(':third-party')
implementation project(':common:model')
implementation project(':common:config')
-implementation project(':libraries:misc')
+implementation project(':libraries:guarded-regex')
implementation project(':crawl:crawling-model')
implementation libs.notnull

crawl/common/readme.md (new file)

@ -0,0 +1,3 @@
# Crawl/Common
Contains model classes shared by the whole crawl-process-load ecosystem.

@ -23,6 +23,7 @@ dependencies {
implementation libs.bundles.slf4j
implementation libs.notnull
+implementation libs.trove
testImplementation libs.bundles.slf4j.test
testImplementation libs.bundles.junit


@ -0,0 +1,4 @@
# Converting Models
Contains models shared by the [converting-process](../converting-process/) and
[loading-process](../loading-process/).

@ -3,8 +3,8 @@ package nu.marginalia.converting.instruction;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.EdgeDomainIndexingState;
-import nu.marginalia.model.idx.EdgePageDocumentsMetadata;
+import nu.marginalia.model.idx.DocumentMetadata;
-import nu.marginalia.model.crawl.DocumentKeywords;
+import nu.marginalia.converting.model.DocumentKeywords;
import nu.marginalia.converting.instruction.instructions.DomainLink;
import nu.marginalia.converting.instruction.instructions.LoadProcessedDocument;
import nu.marginalia.converting.instruction.instructions.LoadProcessedDocumentWithError;
@ -19,7 +19,7 @@ public interface Interpreter {
void loadProcessedDocument(LoadProcessedDocument loadProcessedDocument);
void loadProcessedDocumentWithError(LoadProcessedDocumentWithError loadProcessedDocumentWithError);
-void loadKeywords(EdgeUrl url, EdgePageDocumentsMetadata metadata, DocumentKeywords words);
+void loadKeywords(EdgeUrl url, DocumentMetadata metadata, DocumentKeywords words);
void loadDomainRedirect(DomainLink link);
}

@ -1,13 +1,13 @@
package nu.marginalia.converting.instruction.instructions;
-import nu.marginalia.model.crawl.DocumentKeywords;
+import nu.marginalia.converting.model.DocumentKeywords;
-import nu.marginalia.model.idx.EdgePageDocumentsMetadata;
+import nu.marginalia.model.idx.DocumentMetadata;
import nu.marginalia.converting.instruction.Instruction;
import nu.marginalia.converting.instruction.InstructionTag;
import nu.marginalia.converting.instruction.Interpreter;
import nu.marginalia.model.EdgeUrl;
-public record LoadKeywords(EdgeUrl url, EdgePageDocumentsMetadata metadata, DocumentKeywords words) implements Instruction {
+public record LoadKeywords(EdgeUrl url, DocumentMetadata metadata, DocumentKeywords words) implements Instruction {
@Override
public void apply(Interpreter interpreter) {

@ -1,7 +1,7 @@
-package nu.marginalia.model.crawl;
+package nu.marginalia.converting.model;
-import nu.marginalia.model.idx.EdgePageWordMetadata;
+import nu.marginalia.model.idx.WordMetadata;
import java.util.Arrays;
@ -9,7 +9,7 @@ public record DocumentKeywords(
String[] keywords,
long[] metadata) {
-public DocumentKeywords(EdgePageWords words) {
+DocumentKeywords(DocumentKeywordsBuilder words) {
this(words.words.toArray(String[]::new),
words.metadata.toArray());
}
@ -22,7 +22,7 @@ public record DocumentKeywords(
for (int i = 0; i < keywords.length; i++) {
sb.append("\n\t ");
if (metadata[i] != 0) {
-sb.append(keywords[i]).append("/").append(new EdgePageWordMetadata(metadata[i]));
+sb.append(keywords[i]).append("/").append(new WordMetadata(metadata[i]));
}
else {
sb.append(keywords[i]);

@ -1,29 +1,30 @@
-package nu.marginalia.model.crawl;
+package nu.marginalia.converting.model;
import gnu.trove.list.array.TLongArrayList;
import lombok.Getter;
import lombok.ToString;
+import nu.marginalia.model.crawl.EdgePageWordFlags;
-import java.util.ArrayList;
+import java.util.*;
-import java.util.Collection;
-import java.util.List;
-import java.util.Set;
import java.util.function.UnaryOperator;
@ToString @Getter
-public class EdgePageWords {
+public class DocumentKeywordsBuilder {
public final ArrayList<String> words = new ArrayList<>();
public final TLongArrayList metadata = new TLongArrayList();
-public EdgePageWords() {
+public DocumentKeywordsBuilder() {
}
-public EdgePageWords(int cacpacity) {
+public DocumentKeywords build() {
+return new DocumentKeywords(this);
+}
+public DocumentKeywordsBuilder(int cacpacity) {
words.ensureCapacity(cacpacity);
metadata.ensureCapacity(cacpacity);
}
-public EdgePageWords(Collection<Entry> initial) {
+public DocumentKeywordsBuilder(Collection<Entry> initial) {
words.ensureCapacity(initial.size());
metadata.ensureCapacity(initial.size());
@ -33,14 +34,14 @@ public class EdgePageWords {
}
}
-public static EdgePageWords withBlankMetadata(List<String> entries) {
+public static DocumentKeywordsBuilder withBlankMetadata(List<String> entries) {
List<Long> emptyMeta = new ArrayList<>(entries.size());
for (int i = 0; i < entries.size(); i++) {
emptyMeta.add(0L);
}
-return new EdgePageWords(entries, emptyMeta);
+return new DocumentKeywordsBuilder(entries, emptyMeta);
}
public void addJustNoMeta(String word) {
@ -48,7 +49,7 @@ public class EdgePageWords {
metadata.add(0);
}
-private EdgePageWords(List<String> words, List<Long> meta) {
+private DocumentKeywordsBuilder(List<String> words, List<Long> meta) {
this.words.addAll(words);
this.metadata.addAll(meta);
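To illustrate the builder/value split introduced here (a sketch; the literal keywords are made up), converter code fills a mutable DocumentKeywordsBuilder and only produces the immutable DocumentKeywords record via build() when compiling LoadKeywords instructions:

```java
import nu.marginalia.converting.model.DocumentKeywords;
import nu.marginalia.converting.model.DocumentKeywordsBuilder;

import java.util.List;

class KeywordsBuilderSketch {
    static DocumentKeywords example() {
        // Seed the builder with words that carry no metadata yet
        DocumentKeywordsBuilder builder = DocumentKeywordsBuilder.withBlankMetadata(List.of("marginalia", "search"));

        // Individual words can also be appended without metadata
        builder.addJustNoMeta("keyword");

        // Freeze into the immutable record consumed by the loading process
        return builder.build();
    }
}
```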

@ -21,7 +21,9 @@ dependencies {
implementation project(':common:model')
implementation project(':common:service')
implementation project(':common:config')
-implementation project(':libraries:misc')
+implementation project(':libraries:guarded-regex')
+implementation project(':libraries:easy-lsh')
+implementation project(':libraries:big-string')
implementation project(':api:index-api')
implementation project(':common:service-discovery')
implementation project(':common:service-client')

@ -2,9 +2,9 @@ package nu.marginalia.converting;
import com.github.luben.zstd.ZstdOutputStream;
import nu.marginalia.model.crawl.EdgeDomainIndexingState;
-import nu.marginalia.model.idx.EdgePageDocumentsMetadata;
+import nu.marginalia.model.idx.DocumentMetadata;
import nu.marginalia.converting.instruction.Interpreter;
-import nu.marginalia.model.crawl.DocumentKeywords;
+import nu.marginalia.converting.model.DocumentKeywords;
import nu.marginalia.converting.instruction.instructions.DomainLink;
import nu.marginalia.converting.instruction.instructions.LoadProcessedDocument;
import nu.marginalia.converting.instruction.instructions.LoadProcessedDocumentWithError;
@ -60,7 +60,7 @@ public class ConversionLog implements AutoCloseable, Interpreter {
}
@Override
-public void loadKeywords(EdgeUrl url, EdgePageDocumentsMetadata metadata, DocumentKeywords words) {}
+public void loadKeywords(EdgeUrl url, DocumentMetadata metadata, DocumentKeywords words) {}
@Override
public void loadDomainRedirect(DomainLink link) {}

@ -3,10 +3,10 @@ package nu.marginalia.converting;
import com.github.luben.zstd.ZstdOutputStream;
import com.google.gson.Gson;
import nu.marginalia.model.crawl.EdgeDomainIndexingState;
-import nu.marginalia.model.idx.EdgePageDocumentsMetadata;
+import nu.marginalia.model.idx.DocumentMetadata;
import nu.marginalia.converting.instruction.Instruction;
import nu.marginalia.converting.instruction.Interpreter;
-import nu.marginalia.model.crawl.DocumentKeywords;
+import nu.marginalia.converting.model.DocumentKeywords;
import nu.marginalia.converting.instruction.instructions.DomainLink;
import nu.marginalia.converting.instruction.instructions.LoadProcessedDocument;
import nu.marginalia.converting.instruction.instructions.LoadProcessedDocumentWithError;
@ -121,7 +121,7 @@ public class InstructionWriter {
}
@Override
-public void loadKeywords(EdgeUrl url, EdgePageDocumentsMetadata metadata, DocumentKeywords words) {
+public void loadKeywords(EdgeUrl url, DocumentMetadata metadata, DocumentKeywords words) {
}
@Override

@ -1,7 +1,7 @@
package nu.marginalia.converting.compiler;
import nu.marginalia.converting.instruction.Instruction;
-import nu.marginalia.model.crawl.DocumentKeywords;
+import nu.marginalia.converting.model.DocumentKeywords;
import nu.marginalia.converting.instruction.instructions.LoadKeywords;
import nu.marginalia.converting.instruction.instructions.LoadProcessedDocument;
import nu.marginalia.converting.instruction.instructions.LoadProcessedDocumentWithError;
@ -39,7 +39,7 @@ public class DocumentsCompiler {
var words = doc.words;
if (words != null) {
-ret.add(new LoadKeywords(doc.url, doc.details.metadata, new DocumentKeywords(words)));
+ret.add(new LoadKeywords(doc.url, doc.details.metadata, words.build()));
}
}

@ -2,7 +2,6 @@ package nu.marginalia.converting.model;
import lombok.ToString;
import nu.marginalia.model.crawl.EdgePageDocumentFlags;
-import nu.marginalia.model.crawl.EdgePageWords;
import nu.marginalia.model.crawl.EdgeUrlState;
import nu.marginalia.model.EdgeUrl;
@ -13,7 +12,7 @@ public class ProcessedDocument {
public EdgeUrl url;
public ProcessedDocumentDetails details;
-public EdgePageWords words;
+public DocumentKeywordsBuilder words;
public EdgeUrlState state;
public String stateReason;

@ -3,7 +3,7 @@ package nu.marginalia.converting.model;
import lombok.ToString;
import nu.marginalia.model.crawl.EdgeHtmlStandard;
import nu.marginalia.model.crawl.HtmlFeature;
-import nu.marginalia.model.idx.EdgePageDocumentsMetadata;
+import nu.marginalia.model.idx.DocumentMetadata;
import nu.marginalia.model.EdgeUrl;
import javax.annotation.Nullable;
@ -29,5 +29,5 @@ public class ProcessedDocumentDetails {
public List<EdgeUrl> linksExternal;
public List<EdgeUrl> feedLinks;
-public EdgePageDocumentsMetadata metadata;
+public DocumentMetadata metadata;
}

@ -7,7 +7,7 @@ import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.language.model.KeywordMetadata;
import nu.marginalia.language.model.WordRep;
import nu.marginalia.model.crawl.EdgePageWordFlags;
-import nu.marginalia.model.crawl.EdgePageWords;
+import nu.marginalia.converting.model.DocumentKeywordsBuilder;
import nu.marginalia.language.statistics.TermFrequencyDict;
import javax.inject.Inject;
@ -33,7 +33,7 @@ public class DocumentKeywordExtractor {
}
-public EdgePageWords extractKeywordsMinimal(DocumentLanguageData documentLanguageData, KeywordMetadata keywordMetadata) {
+public DocumentKeywordsBuilder extractKeywordsMinimal(DocumentLanguageData documentLanguageData, KeywordMetadata keywordMetadata) {
List<WordRep> titleWords = extractTitleWords(documentLanguageData);
List<WordRep> wordsNamesAll = nameCounter.count(documentLanguageData, 2);
@ -45,7 +45,7 @@ public class DocumentKeywordExtractor {
List<String> artifacts = getArtifacts(documentLanguageData);
-WordsBuilder wordsBuilder = new WordsBuilder();
+FilteringDocumentKeywordsBuilder wordsBuilder = new FilteringDocumentKeywordsBuilder();
createWords(wordsBuilder, keywordMetadata, titleWords, 0);
artifacts.forEach(wordsBuilder::addWithBlankMetadata);
@ -53,7 +53,7 @@ public class DocumentKeywordExtractor {
return wordsBuilder.build();
}
-public EdgePageWords extractKeywords(DocumentLanguageData documentLanguageData, KeywordMetadata keywordMetadata) {
+public DocumentKeywordsBuilder extractKeywords(DocumentLanguageData documentLanguageData, KeywordMetadata keywordMetadata) {
List<WordRep> titleWords = extractTitleWords(documentLanguageData);
@ -71,7 +71,7 @@ public class DocumentKeywordExtractor {
List<String> artifacts = getArtifacts(documentLanguageData);
-WordsBuilder wordsBuilder = new WordsBuilder();
+FilteringDocumentKeywordsBuilder wordsBuilder = new FilteringDocumentKeywordsBuilder();
createWords(wordsBuilder, keywordMetadata, titleWords, 0);
createWords(wordsBuilder, keywordMetadata, wordsTfIdf, EdgePageWordFlags.TfIdfHigh.asBit());
@ -143,7 +143,7 @@ public class DocumentKeywordExtractor {
}
-private void getSimpleWords(WordsBuilder wordsBuilder, KeywordMetadata metadata, DocumentLanguageData documentLanguageData) {
+private void getSimpleWords(FilteringDocumentKeywordsBuilder wordsBuilder, KeywordMetadata metadata, DocumentLanguageData documentLanguageData) {
EnumSet<EdgePageWordFlags> flagsTemplate = EnumSet.noneOf(EdgePageWordFlags.class);
@ -207,10 +207,10 @@ public class DocumentKeywordExtractor {
.collect(Collectors.toList());
}
-public void createWords(WordsBuilder wordsBuilder,
+public void createWords(FilteringDocumentKeywordsBuilder wordsBuilder,
KeywordMetadata metadata,
Collection<WordRep> words,
long additionalMeta) {
for (var word : words) {
@ -223,8 +223,8 @@ public class DocumentKeywordExtractor {
}
}
-private static class WordsBuilder {
+private static class FilteringDocumentKeywordsBuilder {
-private final EdgePageWords words = new EdgePageWords(1600);
+private final DocumentKeywordsBuilder words = new DocumentKeywordsBuilder(1600);
private final Set<String> seen = new HashSet<>(1600);
public void add(String word, long meta) {
@ -238,7 +238,7 @@ public class DocumentKeywordExtractor {
}
}
-public EdgePageWords build() {
+public DocumentKeywordsBuilder build() {
return words;
}

@ -5,7 +5,7 @@ import nu.marginalia.crawling.model.CrawledDomain;
import nu.marginalia.language.LanguageFilter;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.model.crawl.EdgeHtmlStandard;
-import nu.marginalia.model.crawl.EdgePageWords;
+import nu.marginalia.converting.model.DocumentKeywordsBuilder;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.converting.model.DisqualifiedException;
import nu.marginalia.converting.model.ProcessedDocumentDetails;
@ -31,7 +31,7 @@ public abstract class AbstractDocumentProcessorPlugin {
protected static class MetaTagsBuilder {
private final Set<String> tagWords = new HashSet<>();
-public void build(EdgePageWords dest) {
+public void build(DocumentKeywordsBuilder dest) {
dest.addAllSyntheticTerms(tagWords);
}
@ -84,5 +84,5 @@ public abstract class AbstractDocumentProcessorPlugin {
public record DetailsWithWords(ProcessedDocumentDetails details,
-EdgePageWords words) {}
+DocumentKeywordsBuilder words) {}
}

@ -10,8 +10,8 @@ import nu.marginalia.converting.processor.keywords.DocumentKeywordExtractor;
import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.model.crawl.EdgeHtmlStandard;
import nu.marginalia.model.crawl.EdgePageDocumentFlags;
-import nu.marginalia.model.crawl.EdgePageWords;
+import nu.marginalia.converting.model.DocumentKeywordsBuilder;
-import nu.marginalia.model.idx.EdgePageDocumentsMetadata;
+import nu.marginalia.model.idx.DocumentMetadata;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.converting.processor.logic.*;
import nu.marginalia.model.crawl.PubDate;
@ -121,9 +121,9 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
ret.hashCode = dld.localitySensitiveHashCode();
PubDate pubDate = pubDateSniffer.getPubDate(crawledDocument.headers, url, doc, ret.standard, true);
-ret.metadata = new EdgePageDocumentsMetadata(url.depth(), pubDate.yearByte(), 0, (int) -ret.quality, EnumSet.noneOf(EdgePageDocumentFlags.class));
+ret.metadata = new DocumentMetadata(url.depth(), pubDate.yearByte(), 0, (int) -ret.quality, EnumSet.noneOf(EdgePageDocumentFlags.class));
-EdgePageWords words = keywordExtractor.extractKeywords(dld, keywordMetadata);
+DocumentKeywordsBuilder words = keywordExtractor.extractKeywords(dld, keywordMetadata);
new MetaTagsBuilder()
.addDomainCrawlData(crawledDomain)
@ -173,7 +173,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
}
-private void getLinks(EdgeUrl baseUrl, ProcessedDocumentDetails ret, Document doc, EdgePageWords words) {
+private void getLinks(EdgeUrl baseUrl, ProcessedDocumentDetails ret, Document doc, DocumentKeywordsBuilder words) {
final LinkProcessor lp = new LinkProcessor(ret, baseUrl);
@ -208,7 +208,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
createFileLinkKeywords(words, lp, domain);
}
-private void createFileLinkKeywords(EdgePageWords words, LinkProcessor lp, EdgeDomain domain) {
+private void createFileLinkKeywords(DocumentKeywordsBuilder words, LinkProcessor lp, EdgeDomain domain) {
Set<String> fileKeywords = new HashSet<>(100);
for (var link : lp.getNonIndexableUrls()) {
@ -241,7 +241,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
fileKeywords.add(filename.replace(' ', '_'));
}
-private void createLinkKeywords(EdgePageWords words, LinkProcessor lp) {
+private void createLinkKeywords(DocumentKeywordsBuilder words, LinkProcessor lp) {
final Set<String> linkTerms = new HashSet<>();
for (var fd : lp.getForeignDomains()) {

@ -9,8 +9,8 @@ import nu.marginalia.converting.processor.keywords.DocumentKeywordExtractor;
import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.model.crawl.EdgeHtmlStandard;
import nu.marginalia.model.crawl.EdgePageDocumentFlags;
-import nu.marginalia.model.crawl.EdgePageWords;
+import nu.marginalia.converting.model.DocumentKeywordsBuilder;
-import nu.marginalia.model.idx.EdgePageDocumentsMetadata;
+import nu.marginalia.model.idx.DocumentMetadata;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.converting.model.DisqualifiedException;
import nu.marginalia.converting.model.ProcessedDocumentDetails;
@ -89,10 +89,10 @@ public class PlainTextDocumentProcessorPlugin extends AbstractDocumentProcessorP
final PubDate pubDate = new PubDate(LocalDate.ofYearDay(1993, 1));
-ret.metadata = new EdgePageDocumentsMetadata(url.depth(), pubDate.yearByte(), 0, (int) -ret.quality, EnumSet.of(EdgePageDocumentFlags.PlainText));
+ret.metadata = new DocumentMetadata(url.depth(), pubDate.yearByte(), 0, (int) -ret.quality, EnumSet.of(EdgePageDocumentFlags.PlainText));
KeywordMetadata keywordMetadata = new KeywordMetadata();
-EdgePageWords words = keywordExtractor.extractKeywords(dld, keywordMetadata);
+DocumentKeywordsBuilder words = keywordExtractor.extractKeywords(dld, keywordMetadata);
new MetaTagsBuilder()
.addDomainCrawlData(crawledDomain)

@ -12,7 +12,7 @@ import nu.marginalia.language.keywords.KeywordExtractor;
import nu.marginalia.language.model.WordSeparator;
import nu.marginalia.WmsaHome;
import nu.marginalia.model.crawl.EdgePageWordFlags;
-import nu.marginalia.model.idx.EdgePageWordMetadata;
+import nu.marginalia.model.idx.WordMetadata;
import nu.marginalia.test.util.TestLanguageModels;
import org.apache.commons.lang3.tuple.Pair;
import org.jsoup.Jsoup;
@ -128,14 +128,14 @@ class SentenceExtractorTest {
var newResult = newSe.extractSentences(Jsoup.parse(Files.readString(file.toPath())));
var newRes = documentKeywordExtractor.extractKeywords(newResult, new KeywordMetadata());
-var terms = IntStream.range(0, newRes.size()).mapToObj(i -> Pair.of(newRes.words.get(i), new EdgePageWordMetadata(newRes.metadata.get(i))))
+var terms = IntStream.range(0, newRes.size()).mapToObj(i -> Pair.of(newRes.words.get(i), new WordMetadata(newRes.metadata.get(i))))
.sorted(Comparator.comparing(e -> -e.getValue().tfIdf()))
.limit(100)
.map(Pair::getKey)
.toArray(String[]::new);
System.out.println(Arrays.toString(terms));
-var terms2 = IntStream.range(0, newRes.size()).mapToObj(i -> Pair.of(newRes.words.get(i), new EdgePageWordMetadata(newRes.metadata.get(i))))
+var terms2 = IntStream.range(0, newRes.size()).mapToObj(i -> Pair.of(newRes.words.get(i), new WordMetadata(newRes.metadata.get(i))))
.sorted(Comparator.comparing(e -> -e.getValue().tfIdf()))
.filter(e -> e.getValue().hasFlag(EdgePageWordFlags.Subjects))
.limit(100)


@ -0,0 +1,4 @@
# Crawl Job Extractor
The crawl job extractor creates a file containing a list of domains
along with known URLs. This is consumed by the [crawling-process](../crawling-process).

@ -15,7 +15,7 @@ java {
dependencies {
implementation project(':third-party')
implementation project(':common:model')
-implementation project(':libraries:misc')
+implementation project(':libraries:big-string')
implementation project(':api:index-api')
implementation project(':common:service-discovery')
implementation project(':common:service-client')


@ -0,0 +1,4 @@
# Crawling Models
Contains models shared by the [crawling-process](../crawling-process/) and
[converting-process](../converting-process/).

@ -21,7 +21,7 @@ dependencies {
implementation project(':common:model')
implementation project(':common:config')
implementation project(':common:service')
-implementation project(':libraries:misc')
+implementation project(':libraries:big-string')
implementation project(':api:index-api')
implementation project(':common:service-discovery')
implementation project(':common:service-client')

@ -16,7 +16,7 @@ dependencies {
implementation project(':common:model')
implementation project(':common:config')
implementation project(':common:service')
-implementation project(':libraries:misc')
+implementation project(':libraries:big-string')
implementation project(':api:index-api')
implementation project(':common:service-discovery')
implementation project(':common:service-client')


@ -0,0 +1,3 @@
# Experimental
Contains tools for running classification experiments on crawl data.

@ -2,9 +2,8 @@ package nu.marginalia.loading.loader;
import com.google.inject.Inject;
import lombok.SneakyThrows;
-import nu.marginalia.model.idx.EdgePageDocumentsMetadata;
+import nu.marginalia.model.idx.DocumentMetadata;
-import nu.marginalia.client.Context;
-import nu.marginalia.model.crawl.DocumentKeywords;
+import nu.marginalia.converting.model.DocumentKeywords;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.id.EdgeId;
import org.slf4j.Logger;
@ -19,7 +18,7 @@ public class IndexLoadKeywords implements Runnable {
private final LinkedBlockingQueue<InsertTask> insertQueue = new LinkedBlockingQueue<>(32);
private final LoaderIndexJournalWriter client;
-private record InsertTask(int urlId, int domainId, EdgePageDocumentsMetadata metadata, DocumentKeywords wordSet) {}
+private record InsertTask(int urlId, int domainId, DocumentMetadata metadata, DocumentKeywords wordSet) {}
private final Thread runThread;
@ -50,7 +49,7 @@ public class IndexLoadKeywords implements Runnable {
}
}
-public void load(LoaderData loaderData, EdgeUrl url, EdgePageDocumentsMetadata metadata, DocumentKeywords words) throws InterruptedException {
+public void load(LoaderData loaderData, EdgeUrl url, DocumentMetadata metadata, DocumentKeywords words) throws InterruptedException {
int domainId = loaderData.getDomainId(url.domain);
int urlId = loaderData.getUrlId(url);

@ -1,6 +1,7 @@
-package nu.marginalia.util;
+package nu.marginalia.loading.loader;
-import nu.marginalia.model.crawl.DocumentKeywords;
+import nu.marginalia.converting.model.DocumentKeywords;
import java.util.ArrayList;
import java.util.Collections;

@ -3,9 +3,9 @@ package nu.marginalia.loading.loader;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.EdgeDomainIndexingState;
-import nu.marginalia.model.idx.EdgePageDocumentsMetadata;
+import nu.marginalia.model.idx.DocumentMetadata;
import nu.marginalia.converting.instruction.Interpreter;
-import nu.marginalia.model.crawl.DocumentKeywords;
+import nu.marginalia.converting.model.DocumentKeywords;
import nu.marginalia.converting.instruction.instructions.DomainLink;
import nu.marginalia.converting.instruction.instructions.LoadProcessedDocument;
import nu.marginalia.converting.instruction.instructions.LoadProcessedDocumentWithError;
@ -103,7 +103,7 @@ public class Loader implements Interpreter {
}
@Override
-public void loadKeywords(EdgeUrl url, EdgePageDocumentsMetadata metadata, DocumentKeywords words) {
+public void loadKeywords(EdgeUrl url, DocumentMetadata metadata, DocumentKeywords words) {
// This is a bit of a bandaid safeguard against a bug in
// in the converter, shouldn't be necessary in the future
if (!deferredDomains.isEmpty()) {

@ -3,7 +3,6 @@ package nu.marginalia.loading.loader;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import com.google.inject.name.Named;
-import nu.marginalia.dict.DictionaryMap;
import nu.marginalia.dict.OffHeapDictionaryHashMap;
import nu.marginalia.index.journal.model.IndexJournalEntryData;
import nu.marginalia.index.journal.model.IndexJournalEntryHeader;
@ -11,9 +10,8 @@ import nu.marginalia.index.journal.writer.IndexJournalWriterImpl;
import nu.marginalia.index.journal.writer.IndexJournalWriter;
import nu.marginalia.lexicon.KeywordLexicon;
import nu.marginalia.lexicon.journal.KeywordLexiconJournal;
-import nu.marginalia.model.crawl.DocumentKeywords;
+import nu.marginalia.converting.model.DocumentKeywords;
-import nu.marginalia.util.KeywordListChunker;
-import nu.marginalia.model.idx.EdgePageDocumentsMetadata;
+import nu.marginalia.model.idx.DocumentMetadata;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.id.EdgeId;
@ -40,7 +38,7 @@ public class LoaderIndexJournalWriter {
}
public void putWords(EdgeId<EdgeDomain> domain, EdgeId<EdgeUrl> url,
-EdgePageDocumentsMetadata metadata,
+DocumentMetadata metadata,
DocumentKeywords wordSet) {
if (wordSet.keywords().length == 0)
return;

@ -95,6 +95,7 @@ services:
- search-service
networks:
wmsa:
+screenshot:
volumes:
db:
driver: local


@ -0,0 +1,15 @@
# Domain Ranking
Contains domain ranking algorithms.
## Central Classes
### Algorithms
* [RankingAlgorithm](src/main/java/nu/marginalia/ranking/RankingAlgorithm.java)
* [StandardPageRank](src/main/java/nu/marginalia/ranking/StandardPageRank.java)
* [ReversePageRank](src/main/java/nu/marginalia/ranking/ReversePageRank.java) "CheiRank"
### Data sources
* [RankingDomainFetcher](src/main/java/nu/marginalia/ranking/data/RankingDomainFetcher.java) fetches link data.
* [RankingDomainFetcherForSimilarityData](src/main/java/nu/marginalia/ranking/data/RankingDomainFetcherForSimilarityData.java) fetches website similarity data.


@ -0,0 +1,10 @@
# Query Parser
End-user search query parsing tools used by the [search-service](../../services-core/search-service).
## Central Classes
* [QueryTokenizer](src/main/java/nu/marginalia/query_parser/QueryTokenizer.java)
* [QueryParser](src/main/java/nu/marginalia/query_parser/QueryParser.java)
* [QueryPermutations](src/main/java/nu/marginalia/query_parser/QueryVariants.java) - here be dragons
* [QueryVariants](src/main/java/nu/marginalia/query_parser/QueryVariants.java) - here be dragons

@ -1,12 +1,11 @@
# Features
-These are bits of code that are relatively isolated pieces of business logic,
+These are bits of search-engine related code that are relatively isolated pieces of business logic,
that benefit from the clarity of being kept separate from the rest of the
search engine code.
* [domain-ranking](domain-ranking/) contains ranking algorithms.
* [query-parser](query-parser/) contains code for parsing the user-facing query grammar.
-* [renderer](renderer/) contains utility code for rendering website templates.
* [screenshots](screenshots/) and [random-websites](random-websites/) contains SQL queries random
exploration mode.

@ -5,7 +5,7 @@ import it.unimi.dsi.fastutil.longs.Long2IntOpenHashMap;
import nu.marginalia.index.journal.reader.IndexJournalReader;
import nu.marginalia.array.LongArray;
import nu.marginalia.index.journal.reader.IndexJournalReaderSingleCompressedFile;
-import nu.marginalia.model.idx.EdgePageDocumentsMetadata;
+import nu.marginalia.model.idx.DocumentMetadata;
import nu.marginalia.ranking.DomainRankings;
import org.roaringbitmap.IntConsumer;
import org.roaringbitmap.RoaringBitmap;
@ -72,7 +72,7 @@ public class ForwardIndexConverter {
long entryOffset = (long) ForwardIndexParameters.ENTRY_SIZE * docIdToIdx.get(entry.urlId());
int ranking = domainRankings.getRanking(entry.domainId());
-long meta = EdgePageDocumentsMetadata.encodeRank(entry.docMeta(), ranking);
+long meta = DocumentMetadata.encodeRank(entry.docMeta(), ranking);
docFileData.set(entryOffset + ForwardIndexParameters.METADATA_OFFSET, meta);
docFileData.set(entryOffset + ForwardIndexParameters.DOMAIN_OFFSET, entry.domainId());

View File

@ -1,6 +1,6 @@
package nu.marginalia.index.forward; package nu.marginalia.index.forward;
import nu.marginalia.model.idx.EdgePageDocumentsMetadata; import nu.marginalia.model.idx.DocumentMetadata;
import nu.marginalia.index.query.limit.SpecificationLimitType; import nu.marginalia.index.query.limit.SpecificationLimitType;
import nu.marginalia.index.query.IndexQueryParams; import nu.marginalia.index.query.IndexQueryParams;
import nu.marginalia.index.query.filter.QueryFilterStepIf; import nu.marginalia.index.query.filter.QueryFilterStepIf;
@ -52,7 +52,7 @@ public class ParamMatchingQueryFilter implements QueryFilterStepIf {
return true; return true;
} }
final int quality = EdgePageDocumentsMetadata.decodeQuality(post.meta()); final int quality = DocumentMetadata.decodeQuality(post.meta());
return limit.test(quality); return limit.test(quality);
} }
@ -61,7 +61,7 @@ public class ParamMatchingQueryFilter implements QueryFilterStepIf {
if (params.year().type() == SpecificationLimitType.NONE) if (params.year().type() == SpecificationLimitType.NONE)
return true; return true;
int postVal = EdgePageDocumentsMetadata.decodeYear(post.meta()); int postVal = DocumentMetadata.decodeYear(post.meta());
return params.year().test(postVal); return params.year().test(postVal);
} }
@ -70,7 +70,7 @@ public class ParamMatchingQueryFilter implements QueryFilterStepIf {
if (params.size().type() == SpecificationLimitType.NONE) if (params.size().type() == SpecificationLimitType.NONE)
return true; return true;
int postVal = EdgePageDocumentsMetadata.decodeSize(post.meta()); int postVal = DocumentMetadata.decodeSize(post.meta());
return params.size().test(postVal); return params.size().test(postVal);
} }
@ -79,7 +79,7 @@ public class ParamMatchingQueryFilter implements QueryFilterStepIf {
if (params.rank().type() == SpecificationLimitType.NONE) if (params.rank().type() == SpecificationLimitType.NONE)
return true; return true;
int postVal = EdgePageDocumentsMetadata.decodeRank(post.meta()); int postVal = DocumentMetadata.decodeRank(post.meta());
return params.rank().test(postVal); return params.rank().test(postVal);
} }

View File

@ -0,0 +1,17 @@
# Index Journal
The index journal contains a list of entries with keywords and keyword metadata per document.
This journal is written by [crawl/loading-process](../../crawl/loading-process) and read
when constructing the [forward](../index-forward) and [reverse](../index-reverse)
indices.
## Central Classes
### Model
* [IndexJournalEntry](src/main/java/nu.marginalia.index/journal/model/IndexJournalEntry.java)
* [IndexJournalEntryHeader](src/main/java/nu.marginalia.index/journal/model/IndexJournalEntryHeader.java)
* [IndexJournalEntryData](src/main/java/nu.marginalia.index/journal/model/IndexJournalEntryData.java)
### I/O
* [IndexJournalReader](src/main/java/nu.marginalia.index/journal/reader/IndexJournalReader.java)
* [IndexJournalWriter](src/main/java/nu.marginalia.index/journal/writer/IndexJournalWriter.java)
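A hedged sketch of how an entry is assembled and written, pieced together from the integration tests elsewhere in this commit; `words`, `documentId`, `keywordLexicon` and `indexJournalWriter` are assumed to be in scope, and the metadata values are placeholders rather than meaningful choices.

```java
// Illustrative only; mirrors IndexQueryServiceIntegrationTest in this commit.
long[] data = new long[words.length * 2];
for (int i = 0; i < words.length; i++) {
    // even slots: word id from the lexicon; odd slots: encoded word metadata
    data[2 * i]     = keywordLexicon.getOrInsert(words[i]);
    data[2 * i + 1] = new WordMetadata(i, i, i, EnumSet.of(EdgePageWordFlags.Title)).encode();
}

var header = new IndexJournalEntryHeader(words.length, documentId, DocumentMetadata.defaultValue());
indexJournalWriter.put(header, new IndexJournalEntryData(data));
```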

View File

@ -14,7 +14,7 @@ java {
dependencies { dependencies {
implementation project(':libraries:array') implementation project(':libraries:array')
implementation project(':libraries:btree') implementation project(':libraries:btree')
implementation project(':libraries:misc') implementation project(':libraries:random-write-funnel')
implementation project(':features:domain-ranking') implementation project(':features:domain-ranking')
implementation project(':index:index-query') implementation project(':index:index-query')
implementation project(':index:index-journal') implementation project(':index:index-journal')

View File

@ -5,7 +5,7 @@ import nu.marginalia.index.journal.model.IndexJournalEntryData;
import nu.marginalia.index.journal.model.IndexJournalStatistics; import nu.marginalia.index.journal.model.IndexJournalStatistics;
import nu.marginalia.index.journal.reader.IndexJournalReader; import nu.marginalia.index.journal.reader.IndexJournalReader;
import nu.marginalia.ranking.DomainRankings; import nu.marginalia.ranking.DomainRankings;
import nu.marginalia.util.RandomWriteFunnel; import nu.marginalia.rwf.RandomWriteFunnel;
import nu.marginalia.array.IntArray; import nu.marginalia.array.IntArray;
import nu.marginalia.array.LongArray; import nu.marginalia.array.LongArray;
import nu.marginalia.array.algo.SortingContext; import nu.marginalia.array.algo.SortingContext;

View File

@ -2,7 +2,6 @@ package nu.marginalia.index.reverse;
import lombok.SneakyThrows; import lombok.SneakyThrows;
import nu.marginalia.array.buffer.LongQueryBuffer; import nu.marginalia.array.buffer.LongQueryBuffer;
import nu.marginalia.dict.OffHeapDictionaryHashMap;
import nu.marginalia.index.journal.model.IndexJournalEntry; import nu.marginalia.index.journal.model.IndexJournalEntry;
import nu.marginalia.index.journal.reader.IndexJournalReaderSingleCompressedFile; import nu.marginalia.index.journal.reader.IndexJournalReaderSingleCompressedFile;
import nu.marginalia.index.journal.writer.IndexJournalWriterImpl; import nu.marginalia.index.journal.writer.IndexJournalWriterImpl;
@ -11,7 +10,7 @@ import nu.marginalia.index.reverse.query.ReverseIndexEntrySourceBehavior;
import nu.marginalia.ranking.DomainRankings; import nu.marginalia.ranking.DomainRankings;
import nu.marginalia.lexicon.KeywordLexicon; import nu.marginalia.lexicon.KeywordLexicon;
import nu.marginalia.lexicon.journal.KeywordLexiconJournal; import nu.marginalia.lexicon.journal.KeywordLexiconJournal;
import nu.marginalia.model.idx.EdgePageDocumentsMetadata; import nu.marginalia.model.idx.DocumentMetadata;
import nu.marginalia.test.TestUtil; import nu.marginalia.test.TestUtil;
import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test; import org.junit.jupiter.api.Test;
@ -56,7 +55,7 @@ class ReverseIndexConverterTest {
public void createEntry(IndexJournalWriter writer, KeywordLexicon keywordLexicon, int id) { public void createEntry(IndexJournalWriter writer, KeywordLexicon keywordLexicon, int id) {
int[] factors = IntStream.rangeClosed(1, id).filter(v -> (id % v) == 0).toArray(); int[] factors = IntStream.rangeClosed(1, id).filter(v -> (id % v) == 0).toArray();
var entryBuilder = IndexJournalEntry.builder(id, EdgePageDocumentsMetadata.defaultValue()); var entryBuilder = IndexJournalEntry.builder(id, DocumentMetadata.defaultValue());
for (int i = 0; i < factors.length; i++) { for (int i = 0; i < factors.length; i++) {
entryBuilder.add(keywordLexicon.getOrInsert(Integer.toString(factors[i])), -factors[i]); entryBuilder.add(keywordLexicon.getOrInsert(Integer.toString(factors[i])), -factors[i]);

View File

@ -2,7 +2,6 @@ package nu.marginalia.index.reverse;
import lombok.SneakyThrows; import lombok.SneakyThrows;
import nu.marginalia.array.buffer.LongQueryBuffer; import nu.marginalia.array.buffer.LongQueryBuffer;
import nu.marginalia.dict.OffHeapDictionaryHashMap;
import nu.marginalia.index.journal.model.IndexJournalEntryData; import nu.marginalia.index.journal.model.IndexJournalEntryData;
import nu.marginalia.index.journal.model.IndexJournalEntryHeader; import nu.marginalia.index.journal.model.IndexJournalEntryHeader;
import nu.marginalia.index.journal.reader.IndexJournalReaderSingleCompressedFile; import nu.marginalia.index.journal.reader.IndexJournalReaderSingleCompressedFile;

View File

@ -21,6 +21,7 @@ dependencies {
implementation libs.prometheus implementation libs.prometheus
implementation libs.guava implementation libs.guava
implementation libs.fastutil
testImplementation libs.bundles.slf4j.test testImplementation libs.bundles.slf4j.test
testImplementation libs.bundles.junit testImplementation libs.bundles.junit

15
index/lexicon/readme.md Normal file
View File

@ -0,0 +1,15 @@
# Lexicon
The lexicon contains a mapping from words to identifiers. This lexicon is populated from a journal.
The words themselves aren't stored in the mapping, only a 64 bit hash of each word.
The lexicon is written by [crawl/loading-process](../../crawl/loading-process) and read when
[services-core/index-service](../../services-core/index-service) interprets queries.
## Central Classes
* [KeywordLexicon](src/main/java/nu/marginalia/lexicon/KeywordLexicon.java)
* [KeywordLexiconJournal](src/main/java/nu/marginalia/lexicon/journal/KeywordLexiconJournal.java)
* [DictionaryMap](src/main/java/nu/marginalia/dict/DictionaryMap.java) comes in two versions:
  * [OnHeapDictionaryMap](src/main/java/nu/marginalia/dict/OnHeapDictionaryMap.java) - basically just a fastutil Long2IntOpenHashMap
  * [OffHeapDictionaryHashMap](src/main/java/nu/marginalia/dict/OffHeapDictionaryHashMap.java) - a heavily modified trove TLongIntHashMap that uses off heap memory
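
As a rough sketch of the central operation (construction of the KeywordLexicon and its journal is omitted; usage of `getOrInsert` follows the tests elsewhere in this commit):

```java
// The lexicon hands out compact integer ids; the same word maps to the same id.
var fooId    = keywordLexicon.getOrInsert("foo");  // allocates an id the first time
var fooAgain = keywordLexicon.getOrInsert("foo");  // same id on later calls
var barId    = keywordLexicon.getOrInsert("bar");  // different word, different id

assert fooId == fooAgain;
assert fooId != barId;
```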

View File

@ -19,8 +19,6 @@ public interface IntArraySearch extends IntArrayBase {
return LongArraySearch.encodeSearchMiss(pos - 1); return LongArraySearch.encodeSearchMiss(pos - 1);
} }
default long binarySearch(int key, long fromIndex, long toIndex) { default long binarySearch(int key, long fromIndex, long toIndex) {
long low = 0; long low = 0;
long high = (toIndex - fromIndex) - 1; long high = (toIndex - fromIndex) - 1;

View File

@ -0,0 +1,26 @@
plugins {
id 'java'
}
java {
toolchain {
languageVersion.set(JavaLanguageVersion.of(17))
}
}
dependencies {
implementation libs.lombok
annotationProcessor libs.lombok
implementation libs.bundles.slf4j
implementation libs.notnull
implementation libs.lz4
testImplementation libs.bundles.slf4j.test
testImplementation libs.bundles.junit
testImplementation libs.mockito
}
test {
useJUnitPlatform()
}

View File

@ -0,0 +1,23 @@
# Big String
Microlibrary that offers string compression. This is useful when having to load tens of thousands
of HTML documents in memory during conversion. XML has been described as the opposite of a compression scheme,
and as a result, HTML compresses ridiculously well.
## Demo
```java
List<BigString> manyBigStrings = new ArrayList<>();
for (var file : files) {
// BigString.encode may or may not compress the string
// depending on its size
manyBigStrings.add(BigString.encode(readFile(file)));
}
for (var bs : manyBigStrings) {
String decompressedString = bs.decompress();
byte[] bytes = bs.getBytes();
int len = bs.getLength();
}
```

View File

@ -0,0 +1,23 @@
plugins {
id 'java'
}
java {
toolchain {
languageVersion.set(JavaLanguageVersion.of(17))
}
}
dependencies {
implementation libs.lombok
annotationProcessor libs.lombok
implementation libs.bundles.slf4j
testImplementation libs.bundles.slf4j.test
testImplementation libs.bundles.junit
testImplementation libs.mockito
}
test {
useJUnitPlatform()
}

View File

@ -0,0 +1,66 @@
# Easy LSH
This is a simple [Locality-Sensitive Hash](https://en.wikipedia.org/wiki/Locality-sensitive_hashing)
for document deduplication. Hashes are compared using their Hamming distance.
## Central Classes
* [EasyLSH](src/main/java/nu/marginalia/lsh/EasyLSH.java)
## Demo
Consider statistical distribution only
```java
var lsh1 = new EasyLSH();
lsh1.addUnordered("lorem");
lsh1.addUnordered("ipsum");
lsh1.addUnordered("dolor");
lsh1.addUnordered("sit");
lsh1.addUnordered("amet");
long hash1 = lsh1.get();
var lsh2 = new EasyLSH();
lsh2.addUnordered("amet");
lsh2.addUnordered("ipsum");
lsh2.addUnordered("lorem");
lsh2.addUnordered("dolor");
lsh2.addUnordered("SEAT");
long hash2 = lsh2.get();
System.out.println(EasyLSH.hammingDistance(lsh1, lsh2));
// 1 -- these are similar
```
Consider order as well as distribution
```java
var lsh1 = new EasyLSH();
lsh1.addOrdered("lorem");
lsh1.addOrdered("ipsum");
lsh1.addOrdered("dolor");
lsh1.addOrdered("sit");
lsh1.addOrdered("amet");
long hash1 = lsh1.get();
var lsh2 = new EasyLSH();
lsh2.addOrdered("amet");
lsh2.addOrdered("ipsum");
lsh2.addOrdered("lorem");
lsh2.addOrdered("dolor");
lsh2.addOrdered("SEAT");
long hash2 = lsh2.get();
System.out.println(EasyLSH.hammingDistance(lsh1, lsh2));
// 5 -- these are not very similar
// note the value is relatively low because there are few words
// and there simply can't be very many differences
// it will approach 32 as documents grow larger
```

View File

@ -18,8 +18,6 @@ public class EasyLSH {
private static final int SHINGLING = 2; private static final int SHINGLING = 2;
static { assert Integer.bitCount(SHINGLING) == 1; } static { assert Integer.bitCount(SHINGLING) == 1; }
private final int[] fields = new int[64]; private final int[] fields = new int[64];
private final int[] prevHashes = new int[SHINGLING]; private final int[] prevHashes = new int[SHINGLING];
private int prevHashIdx = 0; private int prevHashIdx = 0;
@ -37,7 +35,7 @@ public class EasyLSH {
} }
public void addHashUnordered(int hashCode) { public void addHashUnordered(int hashCode) {
int value = 1- (hashCode & 2); int value = 1 - (hashCode & 2);
// Try to extract all the remaining entropy // Try to extract all the remaining entropy
// into selecting the field to update // into selecting the field to update

View File

@ -6,6 +6,28 @@ import org.junit.jupiter.api.Test;
import java.util.Arrays; import java.util.Arrays;
class EasyLSHTest { class EasyLSHTest {
@Test
public void testDemo() {
var lsh1 = new EasyLSH();
lsh1.addOrdered("lorem");
lsh1.addOrdered("ipsum");
lsh1.addOrdered("dolor");
lsh1.addOrdered("sit");
lsh1.addOrdered("amet");
long hash1 = lsh1.get();
var lsh2 = new EasyLSH();
lsh2.addOrdered("amet");
lsh2.addOrdered("ipsum");
lsh2.addOrdered("lorem");
lsh2.addOrdered("dolor");
lsh2.addOrdered("SEAT");
long hash2 = lsh2.get();
System.out.println(EasyLSH.hammingDistance(lsh1, lsh2));
}
@Test @Test
public void testEZLSH() { public void testEZLSH() {

View File

@ -0,0 +1,25 @@
plugins {
id 'java'
}
java {
toolchain {
languageVersion.set(JavaLanguageVersion.of(17))
}
}
dependencies {
implementation libs.lombok
annotationProcessor libs.lombok
implementation libs.bundles.slf4j
implementation libs.notnull
testImplementation libs.bundles.slf4j.test
testImplementation libs.bundles.junit
testImplementation libs.mockito
}
test {
useJUnitPlatform()
}

View File

@ -0,0 +1,37 @@
# Guarded Regex
This is a simple library for creating guarded regular expressions. Pattern matching in Java
is pretty slow even with compiled regular expressions.
Guarding them with a `startsWith()`, `endsWith()` or `contains()` can be an order of magnitude
faster, but leads to an unfortunate spreading out of the logic across the pattern and the guard
condition.
Guarded regexes aim to fix this. Instead of code like
```java
Pattern pattern = Pattern.compile("[123]?foo(bar|baz){2,5}");
void ifTheThingDoTheThing(String str) {
if (str.contains("foo") && pattern.matcher(str).matches()) {
doTheThing();
}
}
```
you get the more expressive variant
```java
GuardedRegex thingPredicate =
GuardedRegexFactory.contains("foo", "[123]?foo(bar|baz){2,5}");
void ifTheThingDoTheThing(String str) {
if (thingPredicate.test(str)) {
doTheThing();
}
}
```
## Central Classes
* [GuardedRegexFactory](src/main/java/nu/marginalia/gregex/GuardedRegexFactory.java)

View File

@ -18,7 +18,7 @@ dependencies {
implementation project(':third-party') implementation project(':third-party')
implementation project(':common:model') implementation project(':common:model')
implementation project(':common:config') implementation project(':common:config')
implementation project(':libraries:misc') implementation project(':libraries:easy-lsh')
implementation libs.lombok implementation libs.lombok
annotationProcessor libs.lombok annotationProcessor libs.lombok

View File

@ -1,6 +1,6 @@
package nu.marginalia.language.model; package nu.marginalia.language.model;
import nu.marginalia.model.idx.EdgePageWordMetadata; import nu.marginalia.model.idx.WordMetadata;
import nu.marginalia.model.crawl.EdgePageWordFlags; import nu.marginalia.model.crawl.EdgePageWordFlags;
import java.util.EnumSet; import java.util.EnumSet;
@ -44,7 +44,7 @@ public record KeywordMetadata(HashSet<String> titleKeywords,
int positions = positionMask.getOrDefault(stemmed, 0); int positions = positionMask.getOrDefault(stemmed, 0);
return new EdgePageWordMetadata(tfidf.tfIdfNormalized(), positions, tfidf.count(), flags).encode(); return new WordMetadata(tfidf.tfIdfNormalized(), positions, tfidf.count(), flags).encode();
} }
} }

View File

@ -17,7 +17,7 @@ dependencies {
implementation libs.bundles.slf4j implementation libs.bundles.slf4j
implementation libs.notnull implementation libs.notnull
implementation libs.lz4
implementation libs.fastutil implementation libs.fastutil
testImplementation libs.bundles.slf4j.test testImplementation libs.bundles.slf4j.test

View File

@ -0,0 +1,24 @@
plugins {
id 'java'
}
java {
toolchain {
languageVersion.set(JavaLanguageVersion.of(17))
}
}
dependencies {
implementation libs.lombok
annotationProcessor libs.lombok
implementation libs.bundles.slf4j
testImplementation libs.bundles.slf4j.test
testImplementation libs.bundles.junit
testImplementation libs.mockito
}
test {
useJUnitPlatform()
}

View File

@ -0,0 +1,31 @@
# Random Write Funnel
This micro-library solves the problem of [write amplification](https://en.wikipedia.org/wiki/Write_amplification) when
writing large files out of order to disk. It does this by bucketing the writes into several temporary files,
which are then read back in turn to construct the larger file with a more predictable order of writes.
Even though it effectively writes 2.5x as much data to disk as simply attempting to
construct the file directly, it is *much* faster than thrashing an SSD with dozens of gigabytes
of small random writes.
## Demo
```java
try (var rwf = new RandomWriteFunnel(tmpPath, expectedSize);
var out = Files.newByteChannel(outputFile, StandardOpenOption.WRITE))
{
rwf.put(addr1, data1);
rwf.put(addr2, data2);
// ...
rwf.put(addr1e33, data1e33);
rwf.write(out);
}
catch (IOException ex) {
//
}
```
## Central Classes
* [RandomWriteFunnel](src/main/java/nu/marginalia/rwf/RandomWriteFunnel.java)

View File

@ -1,4 +1,4 @@
package nu.marginalia.util; package nu.marginalia.rwf;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;

View File

@ -1,33 +1,43 @@
package nu.marginalia.index.service.util; package nu.marginalia.rwf;
import nu.marginalia.util.RandomWriteFunnel; import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test; import org.junit.jupiter.api.Test;
import java.io.File;
import java.io.IOException; import java.io.IOException;
import java.io.RandomAccessFile; import java.io.RandomAccessFile;
import java.nio.file.Files;
import java.nio.file.Path; import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertEquals;
class RandomWriteFunnelTest { class RandomWriteFunnelTest {
Path testFile;
@BeforeEach
public void setUp() throws IOException {
testFile = Files.createTempFile(getClass().getSimpleName(), "bin");
}
@AfterEach
public void tearDown() throws IOException {
Files.delete(testFile);
}
@Test @Test
public void test() { public void test() {
new File("/tmp/test.bin").delete();
try (var funnel = new RandomWriteFunnel(Path.of("/tmp"), 5001); try (var funnel = new RandomWriteFunnel(Path.of("/tmp"), 5001);
var out = new RandomAccessFile("/tmp/test.bin", "rw")) { var out = Files.newByteChannel(testFile, StandardOpenOption.WRITE)) {
for (int i = 10_000-1; i >= 0; i--) { for (int i = 10_000-1; i >= 0; i--) {
System.out.println(i); System.out.println(i);
funnel.put(i, 10_000-i); funnel.put(i, 10_000-i);
} }
funnel.write(out.getChannel()); funnel.write(out);
} catch (Exception e) { } catch (Exception e) {
e.printStackTrace(); e.printStackTrace();
} }
try (var in = new RandomAccessFile("/tmp/test.bin", "r")) { try (var in = new RandomAccessFile(testFile.toFile(), "r")) {
for (int i = 0; i < 10_000; i++) { for (int i = 0; i < 10_000; i++) {
assertEquals(10_000-i, in.readLong()); assertEquals(10_000-i, in.readLong());
} }
@ -38,20 +48,19 @@ class RandomWriteFunnelTest {
@Test @Test
public void testSparse() { public void testSparse() {
new File("/tmp/test.bin").delete();
for (int j = 1; j <= 20; j++) { for (int j = 1; j <= 20; j++) {
try (var funnel = new RandomWriteFunnel(Path.of("/tmp"), j); try (var funnel = new RandomWriteFunnel(Path.of("/tmp"), j);
var out = new RandomAccessFile("/tmp/test.bin", "rw")) { var out = Files.newByteChannel(testFile, StandardOpenOption.WRITE)) {
for (int i = 10 - 1; i >= 0; i -= 2) { for (int i = 10 - 1; i >= 0; i -= 2) {
funnel.put(i, 10 - i); funnel.put(i, 10 - i);
} }
funnel.write(out.getChannel()); funnel.write(out);
} catch (Exception e) { } catch (Exception e) {
e.printStackTrace(); e.printStackTrace();
} }
try (var in = new RandomAccessFile("/tmp/test.bin", "r")) { try (var in = new RandomAccessFile(testFile.toFile(), "r")) {
assertEquals(0, in.readLong()); assertEquals(0, in.readLong());
assertEquals(9, in.readLong()); assertEquals(9, in.readLong());
assertEquals(0, in.readLong()); assertEquals(0, in.readLong());
@ -71,20 +80,19 @@ class RandomWriteFunnelTest {
@Test @Test
public void testYuge() { public void testYuge() {
new File("/tmp/test.bin").delete();
for (int j = 1; j <= 20; j++) { for (int j = 1; j <= 20; j++) {
try (var funnel = new RandomWriteFunnel(Path.of("/tmp"), j); try (var funnel = new RandomWriteFunnel(Path.of("/tmp"), j);
var out = new RandomAccessFile("/tmp/test.bin", "rw")) { var out = Files.newByteChannel(testFile, StandardOpenOption.WRITE)) {
for (int i = 10 - 1; i >= 0; i -= 2) { for (int i = 10 - 1; i >= 0; i -= 2) {
funnel.put(i, Long.MAX_VALUE - i); funnel.put(i, Long.MAX_VALUE - i);
} }
funnel.write(out.getChannel()); funnel.write(out);
} catch (Exception e) { } catch (Exception e) {
e.printStackTrace(); e.printStackTrace();
} }
try (var in = new RandomAccessFile("/tmp/test.bin", "r")) { try (var in = new RandomAccessFile(testFile.toFile(), "r")) {
in.readLong(); in.readLong();
in.readLong(); in.readLong();
in.readLong(); in.readLong();

View File

@ -6,4 +6,15 @@ These are libraries that are not strongly coupled to the search engine.
bad support for. It's designed to be easily replaced when *Java's Foreign Function And Memory API* is released. bad support for. It's designed to be easily replaced when *Java's Foreign Function And Memory API* is released.
* The [btree](btree/) library offers a static BTree implementation based on the array library. * The [btree](btree/) library offers a static BTree implementation based on the array library.
* [language-processing](language-processing/) contains primitives for sentence extraction and POS-tagging. * [language-processing](language-processing/) contains primitives for sentence extraction and POS-tagging.
## Micro libraries
* [easy-lsh](easy-lsh/) is a simple locality-sensitive hash for document deduplication
* [guarded-regex](guarded-regex/) makes predicated regular expressions clearer
* [big-string](big-string/) offers seamless string compression
* [random-write-funnel](random-write-funnel/) is a tool for reducing write amplification when constructing
large files out of order.
## The rest
* [misc](misc/) is just random bits and bobs that didn't fit anywhere. * [misc](misc/) is just random bits and bobs that didn't fit anywhere.

5
other/readme.md Normal file
View File

@ -0,0 +1,5 @@
# Other
This code will be moved to a separate repository.
Nothing to see here, move along.

View File

@ -7,7 +7,7 @@ import it.unimi.dsi.fastutil.ints.IntArrayList;
import it.unimi.dsi.fastutil.longs.Long2LongOpenHashMap; import it.unimi.dsi.fastutil.longs.Long2LongOpenHashMap;
import nu.marginalia.index.svc.SearchTermsService; import nu.marginalia.index.svc.SearchTermsService;
import nu.marginalia.model.crawl.EdgePageWordFlags; import nu.marginalia.model.crawl.EdgePageWordFlags;
import nu.marginalia.model.idx.EdgePageWordMetadata; import nu.marginalia.model.idx.WordMetadata;
import nu.marginalia.index.query.limit.QueryStrategy; import nu.marginalia.index.query.limit.QueryStrategy;
import nu.marginalia.index.client.model.results.EdgeSearchResultItem; import nu.marginalia.index.client.model.results.EdgeSearchResultItem;
import nu.marginalia.index.client.model.results.EdgeSearchResultKeywordScore; import nu.marginalia.index.client.model.results.EdgeSearchResultKeywordScore;
@ -175,17 +175,17 @@ public class IndexResultValuator {
return 1000; return 1000;
} }
positions = EdgePageWordMetadata.decodePositions(meta); positions = WordMetadata.decodePositions(meta);
maskDirectRaw &= positions; maskDirectRaw &= positions;
if (positions != 0 && !EdgePageWordMetadata.hasAnyFlags(meta, flagBitMask)) { if (positions != 0 && !WordMetadata.hasAnyFlags(meta, flagBitMask)) {
maskAdjacent &= (positions | (positions << 1) | (positions >>> 1)); maskAdjacent &= (positions | (positions << 1) | (positions >>> 1));
maskDirectGenerous &= positions; maskDirectGenerous &= positions;
} }
termCount++; termCount++;
tfIdfSum += EdgePageWordMetadata.decodeTfidf(meta); tfIdfSum += WordMetadata.decodeTfidf(meta);
} }
double avgTfIdf = termCount / tfIdfSum; double avgTfIdf = termCount / tfIdfSum;

View File

@ -0,0 +1,77 @@
package nu.marginalia.index.model;
import nu.marginalia.model.crawl.EdgePageDocumentFlags;
import nu.marginalia.model.idx.DocumentMetadata;
import org.junit.jupiter.api.Test;
import java.util.EnumSet;
import static org.junit.jupiter.api.Assertions.assertEquals;
class DocumentMetadataTest {
@Test
public void codecYear() {
var meta = new DocumentMetadata(0, 0, 0, 192, 0, 0, (byte) 0);
long encoded = meta.encode();
var decoded = new DocumentMetadata(encoded);
assertEquals(192, decoded.year());
}
@Test
public void codecTopology() {
var meta = new DocumentMetadata(0, 0, 192, 0, 0, 0, (byte) 0);
long encoded = meta.encode();
var decoded = new DocumentMetadata(encoded);
assertEquals(192, decoded.topology());
}
@Test
public void codecSets() {
var meta = new DocumentMetadata(0, 0, 0, 0, 14, 0, (byte) 0);
long encoded = meta.encode();
var decoded = new DocumentMetadata(encoded);
assertEquals(14, decoded.sets());
}
@Test
public void codecQuality() {
var meta = new DocumentMetadata(0, 0, 0, 0, 0, 9, (byte) 0);
long encoded = meta.encode();
var decoded = new DocumentMetadata(encoded);
assertEquals(9, decoded.quality());
}
@Test
public void codecFlags() {
var meta = new DocumentMetadata(0, 0, 0, 0, 0, 0, (byte) 255);
long encoded = meta.encode();
System.out.println(Long.toHexString(encoded));
var decoded = new DocumentMetadata(encoded);
System.out.println(decoded);
assertEquals((byte) 255, decoded.flags());
}
@Test
public void encSize() {
assertEquals(100, new DocumentMetadata(0).withSize(145).size());
assertEquals(100, DocumentMetadata.decodeSize(new DocumentMetadata(0).withSize(145).encode()));
assertEquals(50, new DocumentMetadata(0).withSize(4).size());
assertEquals(50, DocumentMetadata.decodeSize(new DocumentMetadata(0).withSize(4).encode()));
assertEquals(50 * 255, DocumentMetadata.decodeSize(new DocumentMetadata(0).withSize(Integer.MAX_VALUE).encode()));
assertEquals(50 * 255, new DocumentMetadata(0).withSize(Integer.MAX_VALUE).size());
}
@Test
public void encRank() {
var meta = new DocumentMetadata(5, 22, 3, 8, EnumSet.noneOf(EdgePageDocumentFlags.class))
.withSize(0xffffffff).encode();
var enc2 = DocumentMetadata.encodeRank(meta, 83);
assertEquals(83, DocumentMetadata.decodeRank(enc2));
assertEquals(5, DocumentMetadata.decodeTopology(enc2));
}
}

View File

@ -1,77 +0,0 @@
package nu.marginalia.index.model;
import nu.marginalia.model.crawl.EdgePageDocumentFlags;
import nu.marginalia.model.idx.EdgePageDocumentsMetadata;
import org.junit.jupiter.api.Test;
import java.util.EnumSet;
import static org.junit.jupiter.api.Assertions.assertEquals;
class EdgePageDocumentsMetadataTest {
@Test
public void codecYear() {
var meta = new EdgePageDocumentsMetadata(0, 0, 0, 192, 0, 0, (byte) 0);
long encoded = meta.encode();
var decoded = new EdgePageDocumentsMetadata(encoded);
assertEquals(192, decoded.year());
}
@Test
public void codecTopology() {
var meta = new EdgePageDocumentsMetadata(0, 0, 192, 0, 0, 0, (byte) 0);
long encoded = meta.encode();
var decoded = new EdgePageDocumentsMetadata(encoded);
assertEquals(192, decoded.topology());
}
@Test
public void codecSets() {
var meta = new EdgePageDocumentsMetadata(0, 0, 0, 0, 14, 0, (byte) 0);
long encoded = meta.encode();
var decoded = new EdgePageDocumentsMetadata(encoded);
assertEquals(14, decoded.sets());
}
@Test
public void codecQuality() {
var meta = new EdgePageDocumentsMetadata(0, 0, 0, 0, 0, 9, (byte) 0);
long encoded = meta.encode();
var decoded = new EdgePageDocumentsMetadata(encoded);
assertEquals(9, decoded.quality());
}
@Test
public void codecFlags() {
var meta = new EdgePageDocumentsMetadata(0, 0, 0, 0, 0, 0, (byte) 255);
long encoded = meta.encode();
System.out.println(Long.toHexString(encoded));
var decoded = new EdgePageDocumentsMetadata(encoded);
System.out.println(decoded);
assertEquals((byte) 255, decoded.flags());
}
@Test
public void encSize() {
assertEquals(100, new EdgePageDocumentsMetadata(0).withSize(145).size());
assertEquals(100, EdgePageDocumentsMetadata.decodeSize(new EdgePageDocumentsMetadata(0).withSize(145).encode()));
assertEquals(50, new EdgePageDocumentsMetadata(0).withSize(4).size());
assertEquals(50, EdgePageDocumentsMetadata.decodeSize(new EdgePageDocumentsMetadata(0).withSize(4).encode()));
assertEquals(50 * 255, EdgePageDocumentsMetadata.decodeSize(new EdgePageDocumentsMetadata(0).withSize(Integer.MAX_VALUE).encode()));
assertEquals(50 * 255, new EdgePageDocumentsMetadata(0).withSize(Integer.MAX_VALUE).size());
}
@Test
public void encRank() {
var meta = new EdgePageDocumentsMetadata(5, 22, 3, 8, EnumSet.noneOf(EdgePageDocumentFlags.class))
.withSize(0xffffffff).encode();
var enc2 = EdgePageDocumentsMetadata.encodeRank(meta, 83);
assertEquals(83, EdgePageDocumentsMetadata.decodeRank(enc2));
assertEquals(5, EdgePageDocumentsMetadata.decodeTopology(enc2));
}
}

View File

@ -15,8 +15,8 @@ import nu.marginalia.index.query.limit.QueryStrategy;
import nu.marginalia.index.query.limit.SpecificationLimit; import nu.marginalia.index.query.limit.SpecificationLimit;
import nu.marginalia.lexicon.KeywordLexicon; import nu.marginalia.lexicon.KeywordLexicon;
import nu.marginalia.model.crawl.EdgePageWordFlags; import nu.marginalia.model.crawl.EdgePageWordFlags;
import nu.marginalia.model.idx.EdgePageDocumentsMetadata; import nu.marginalia.model.idx.DocumentMetadata;
import nu.marginalia.model.idx.EdgePageWordMetadata; import nu.marginalia.model.idx.WordMetadata;
import nu.marginalia.service.server.Initialization; import nu.marginalia.service.server.Initialization;
import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Assertions;
@ -164,12 +164,12 @@ public class IndexQueryServiceIntegrationTest {
long fullId = id | ((long) (32 - (id % 32)) << 32); long fullId = id | ((long) (32 - (id % 32)) << 32);
var header = new IndexJournalEntryHeader(factors.length, fullId, new EdgePageDocumentsMetadata(0, 0, 0, id % 5, id, id % 20, (byte) 0).encode()); var header = new IndexJournalEntryHeader(factors.length, fullId, new DocumentMetadata(0, 0, 0, id % 5, id, id % 20, (byte) 0).encode());
long[] data = new long[factors.length*2]; long[] data = new long[factors.length*2];
for (int i = 0; i < factors.length; i++) { for (int i = 0; i < factors.length; i++) {
data[2*i] = keywordLexicon.getOrInsert(Integer.toString(factors[i])); data[2*i] = keywordLexicon.getOrInsert(Integer.toString(factors[i]));
data[2*i + 1] = new EdgePageWordMetadata(i, i, i, EnumSet.of(EdgePageWordFlags.Title)).encode(); data[2*i + 1] = new WordMetadata(i, i, i, EnumSet.of(EdgePageWordFlags.Title)).encode();
} }
indexJournalWriter.put(header, new IndexJournalEntryData(data)); indexJournalWriter.put(header, new IndexJournalEntryData(data));
@ -177,12 +177,12 @@ public class IndexQueryServiceIntegrationTest {
public void loadDataWithDomain(int domain, int id) { public void loadDataWithDomain(int domain, int id) {
int[] factors = IntStream.rangeClosed(1, id).filter(v -> (id % v) == 0).toArray(); int[] factors = IntStream.rangeClosed(1, id).filter(v -> (id % v) == 0).toArray();
var header = new IndexJournalEntryHeader(factors.length, id | ((long) domain << 32), EdgePageDocumentsMetadata.defaultValue()); var header = new IndexJournalEntryHeader(factors.length, id | ((long) domain << 32), DocumentMetadata.defaultValue());
long[] data = new long[factors.length*2]; long[] data = new long[factors.length*2];
for (int i = 0; i < factors.length; i++) { for (int i = 0; i < factors.length; i++) {
data[2*i] = keywordLexicon.getOrInsert(Integer.toString(factors[i])); data[2*i] = keywordLexicon.getOrInsert(Integer.toString(factors[i]));
data[2*i + 1] = new EdgePageWordMetadata(i % 20, i, i, EnumSet.of(EdgePageWordFlags.Title)).encode(); data[2*i + 1] = new WordMetadata(i % 20, i, i, EnumSet.of(EdgePageWordFlags.Title)).encode();
} }
indexJournalWriter.put(header, new IndexJournalEntryData(data)); indexJournalWriter.put(header, new IndexJournalEntryData(data));

View File

@ -26,6 +26,7 @@ dependencies {
implementation project(':index:index-query') implementation project(':index:index-query')
implementation project(':libraries:misc') implementation project(':libraries:misc')
implementation project(':libraries:easy-lsh')
implementation project(':libraries:language-processing') implementation project(':libraries:language-processing')
implementation project(':api:assistant-api') implementation project(':api:assistant-api')
@ -33,8 +34,8 @@ dependencies {
implementation project(':api:search-api') implementation project(':api:search-api')
implementation project(':common:service-discovery') implementation project(':common:service-discovery')
implementation project(':common:service-client') implementation project(':common:service-client')
implementation project(':common:renderer')
implementation project(':features:renderer')
implementation project(':features:screenshots') implementation project(':features:screenshots')
implementation project(':features:random-websites') implementation project(':features:random-websites')
implementation project(':features:query-parser') implementation project(':features:query-parser')

View File

@ -6,7 +6,6 @@ import gnu.trove.map.hash.TObjectIntHashMap;
import gnu.trove.set.hash.TIntHashSet; import gnu.trove.set.hash.TIntHashSet;
import nu.marginalia.search.model.UrlDetails; import nu.marginalia.search.model.UrlDetails;
import nu.marginalia.lsh.EasyLSH; import nu.marginalia.lsh.EasyLSH;
import nu.marginalia.util.BrailleBlockPunchCards;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;

Some files were not shown because too many files have changed in this diff.