A lot of readmes, some refactoring.

parent f19c9a2863
commit b945fd7f39

api/assistant-api/readme.md (new file, 8 lines)
@@ -0,0 +1,8 @@
+# Assistant API
+
+Client and models for talking to the [assistant-service](../../services-core/assistant-service),
+implemented with the base client from [service-client](../../common/service-client).
+
+## Central Classes
+
+* [AssistantClient](src/main/java/nu/marginalia/assistant/client/AssistantClient.java)

api/index-api/readme.md (new file, 8 lines)
@@ -0,0 +1,8 @@
+# Index API
+
+Client and models for talking to the [index-service](../../services-core/index-service),
+implemented with the base client from [service-client](../../common/service-client).
+
+## Central Classes
+
+* [IndexClient](src/main/java/nu/marginalia/index/client/IndexClient.java)

@@ -18,12 +18,12 @@ import javax.annotation.CheckReturnValue;
 import java.util.List;
 
 @Singleton
-public class EdgeIndexClient extends AbstractDynamicClient {
+public class IndexClient extends AbstractDynamicClient {
 
     private static final Summary wmsa_search_index_api_time = Summary.build().name("wmsa_search_index_api_time").help("-").register();
 
     @Inject
-    public EdgeIndexClient(ServiceDescriptors descriptors) {
+    public IndexClient(ServiceDescriptors descriptors) {
         super(descriptors.forId(ServiceId.Index), WmsaHome.getHostsFile(), GsonFactory::get);
 
         setTimeout(30);

@@ -1,9 +1,9 @@
 package nu.marginalia.index.client.model.results;
 
 import nu.marginalia.model.crawl.EdgePageWordFlags;
-import nu.marginalia.model.idx.EdgePageWordMetadata;
+import nu.marginalia.model.idx.WordMetadata;
 import nu.marginalia.model.crawl.EdgePageDocumentFlags;
-import nu.marginalia.model.idx.EdgePageDocumentsMetadata;
+import nu.marginalia.model.idx.DocumentMetadata;
 
 import static java.lang.Integer.lowestOneBit;
 import static java.lang.Integer.numberOfTrailingZeros;
@@ -16,15 +16,15 @@ public record EdgeSearchResultKeywordScore(int set,
     public double documentValue() {
         long sum = 0;
 
-        sum += EdgePageDocumentsMetadata.decodeQuality(encodedDocMetadata) / 5.;
+        sum += DocumentMetadata.decodeQuality(encodedDocMetadata) / 5.;
 
-        sum += EdgePageDocumentsMetadata.decodeTopology(encodedDocMetadata);
+        sum += DocumentMetadata.decodeTopology(encodedDocMetadata);
 
-        if (EdgePageDocumentsMetadata.hasFlags(encodedDocMetadata, EdgePageDocumentFlags.Simple.asBit())) {
+        if (DocumentMetadata.hasFlags(encodedDocMetadata, EdgePageDocumentFlags.Simple.asBit())) {
             sum += 20;
         }
 
-        int rank = EdgePageDocumentsMetadata.decodeRank(encodedDocMetadata) - 13;
+        int rank = DocumentMetadata.decodeRank(encodedDocMetadata) - 13;
         if (rank < 0)
             sum += rank / 2;
         else
@@ -34,7 +34,7 @@ public record EdgeSearchResultKeywordScore(int set,
     }
 
     private boolean hasTermFlag(EdgePageWordFlags flag) {
-        return EdgePageWordMetadata.hasFlags(encodedWordMetadata, flag.asBit());
+        return WordMetadata.hasFlags(encodedWordMetadata, flag.asBit());
     }
 
     public double termValue() {
@@ -58,7 +58,7 @@ public record EdgeSearchResultKeywordScore(int set,
             sum -= 1;
         }
 
-        sum -= EdgePageWordMetadata.decodeTfidf(encodedWordMetadata) / 50.;
+        sum -= WordMetadata.decodeTfidf(encodedWordMetadata) / 50.;
         sum += firstPos() / 5.;
         sum -= Integer.bitCount(positions()) / 3.;
 
@@ -66,9 +66,9 @@ public record EdgeSearchResultKeywordScore(int set,
     }
 
     public int firstPos() {
-        return numberOfTrailingZeros(lowestOneBit(EdgePageWordMetadata.decodePositions(encodedWordMetadata)));
+        return numberOfTrailingZeros(lowestOneBit(WordMetadata.decodePositions(encodedWordMetadata)));
     }
-    public int positions() { return EdgePageWordMetadata.decodePositions(encodedWordMetadata); }
+    public int positions() { return WordMetadata.decodePositions(encodedWordMetadata); }
     public boolean isSpecial() { return keyword.contains(":") || hasTermFlag(EdgePageWordFlags.Synthetic); }
     public boolean isRegular() {
         return !keyword.contains(":")

@@ -1,7 +1,8 @@
 # Core Service Clients
 
 These are clients for the [core services](../services-core/), along with what models
-are necessary for speaking to them.
+are necessary for speaking to them. They each implement the abstract client classes from
+[service-client](../common/service-client).
 
 All that is necessary is to `@Inject` them into the constructor and then
 requests can be sent.
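
To illustrate the usage pattern this readme describes, here is a rough sketch of a consumer class. The SearchFrontend name is invented for illustration, and the commented-out query call is a hypothetical placeholder, since the concrete request methods on the clients are not shown in this commit.

```java
import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.index.client.IndexClient;

@Singleton
public class SearchFrontend {
    private final IndexClient indexClient;

    // The client arrives via constructor injection; no further wiring is needed here.
    @Inject
    public SearchFrontend(IndexClient indexClient) {
        this.indexClient = indexClient;
    }

    public void handleRequest() {
        // Hypothetical call: the actual request methods live on the client and its
        // AbstractDynamicClient base class, which this commit does not show.
        // indexClient.query(...);
    }
}
```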

api/search-api/readme.md (new file, 8 lines)
@@ -0,0 +1,8 @@
+# Search API
+
+Client and models for talking to the [search-service](../../services-core/search-service),
+implemented with the base client from [service-client](../../common/service-client).
+
+## Central Classes
+
+* [SearchClient](src/main/java/nu/marginalia/search/client/SearchClient.java)

@@ -18,11 +18,11 @@ import java.net.URLEncoder;
 import java.nio.charset.StandardCharsets;
 
 @Singleton
-public class EdgeSearchClient extends AbstractDynamicClient {
+public class SearchClient extends AbstractDynamicClient {
     private final Logger logger = LoggerFactory.getLogger(getClass());
 
     @Inject
-    public EdgeSearchClient(ServiceDescriptors descriptors) {
+    public SearchClient(ServiceDescriptors descriptors) {
         super(descriptors.forId(ServiceId.Search), WmsaHome.getHostsFile(), GsonFactory::get);
     }
 

common/config/readme.md (new file, 3 lines)
@@ -0,0 +1,3 @@
+# Config
+
+This package contains configuration injectables used by the services.

@@ -13,7 +13,7 @@ java {
 dependencies {
     implementation project(':common:service-discovery')
     implementation project(':common:service-client')
-    implementation project(':libraries:misc')
+    implementation project(':libraries:big-string')
 
     implementation libs.lombok
     annotationProcessor libs.lombok

common/model/readme.md (new file, 11 lines)
@@ -0,0 +1,11 @@
+# Model
+
+This package contains common models to the search engine
+
+## Central Classes
+
+* [EdgeDomain](src/main/java/nu/marginalia/model/EdgeDomain.java)
+* [EdgeUrl](src/main/java/nu/marginalia/model/EdgeUrl.java)
+* [EdgeId](src/main/java/nu/marginalia/model/id/EdgeId.java)
+* [DocumentMetadata](src/main/java/nu/marginalia/model/idx/DocumentMetadata.java)
+* [WordMetadata](src/main/java/nu/marginalia/model/idx/WordMetadata.java)

@@ -1,10 +0,0 @@
-package nu.marginalia.model.crawl;
-
-import lombok.*;
-import nu.marginalia.model.EdgeDomain;
-
-@AllArgsConstructor @EqualsAndHashCode @Getter @Setter @Builder @ToString
-public class EdgeDomainLink {
-    public final EdgeDomain source;
-    public final EdgeDomain destination;
-}

@@ -9,13 +9,13 @@ import java.util.Set;
 import static java.lang.Math.max;
 import static java.lang.Math.min;
 
-public record EdgePageDocumentsMetadata(int rank,
+public record DocumentMetadata(int rank,
                                int encSize,
                                int topology,
                                int year,
                                int sets,
                                int quality,
                                byte flags) {
 
 
     public static final long RANK_MASK = 0xFFL;
@@ -41,21 +41,21 @@ public record EdgePageDocumentsMetadata(int rank,
     public static long defaultValue() {
         return 0L;
     }
-    public EdgePageDocumentsMetadata() {
+    public DocumentMetadata() {
         this(defaultValue());
     }
-    public EdgePageDocumentsMetadata(int topology, int year, int sets, int quality, EnumSet<EdgePageDocumentFlags> flags) {
+    public DocumentMetadata(int topology, int year, int sets, int quality, EnumSet<EdgePageDocumentFlags> flags) {
         this(0, 0, topology, year, sets, quality, encodeFlags(flags));
     }
 
-    public EdgePageDocumentsMetadata withSize(int size) {
+    public DocumentMetadata withSize(int size) {
         if (size <= 0) {
             return this;
         }
 
         final int encSize = (int) Math.min(ENCSIZE_MASK, Math.max(1, size / ENCSIZE_MULTIPLIER));
 
-        return new EdgePageDocumentsMetadata(rank, encSize, topology, year, sets, quality, flags);
+        return new DocumentMetadata(rank, encSize, topology, year, sets, quality, flags);
     }
 
     private static byte encodeFlags(Set<EdgePageDocumentFlags> flags) {
@@ -68,7 +68,7 @@ public record EdgePageDocumentsMetadata(int rank,
         return (flags & flag.asBit()) != 0;
     }
 
-    public EdgePageDocumentsMetadata(long value) {
+    public DocumentMetadata(long value) {
         this( (int) ((value >>> RANK_SHIFT) & RANK_MASK),
               (int) ((value >>> ENCSIZE_SHIFT) & ENCSIZE_MASK),
               (int) ((value >>> TOPOLOGY_SHIFT) & TOPOLOGY_MASK),
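
For readers unfamiliar with the encoding, the renamed DocumentMetadata record packs its fields into a single long using the shift/mask constants partially visible above. The following is an illustrative sketch only: RANK_MASK is taken from this diff, but the RANK_SHIFT value and the helper names are assumptions, not constants confirmed by the commit.

```java
public class BitFieldSketch {
    static final long RANK_MASK = 0xFFL; // shown in the diff
    static final int RANK_SHIFT = 48;    // placeholder value for illustration

    // Clear the rank field, then OR in the masked value at its shift position.
    static long packRank(long encoded, int rank) {
        encoded &= ~(RANK_MASK << RANK_SHIFT);
        return encoded | ((rank & RANK_MASK) << RANK_SHIFT);
    }

    // Mirrors the decode expression in the DocumentMetadata(long value) constructor.
    static int unpackRank(long encoded) {
        return (int) ((encoded >>> RANK_SHIFT) & RANK_MASK);
    }

    public static void main(String[] args) {
        long value = packRank(0L, 42);
        System.out.println(unpackRank(value)); // prints 42
    }
}
```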

@@ -9,10 +9,10 @@ import java.util.Set;
 import static java.lang.Math.max;
 import static java.lang.Math.min;
 
-public record EdgePageWordMetadata(int tfIdf,
+public record WordMetadata(int tfIdf,
                            int positions,
                            int count,
                            byte flags) {
 
     public static final long COUNT_MASK = 0xFL;
     public static final int COUNT_SHIFT = 8;
@@ -22,11 +22,11 @@ public record EdgePageWordMetadata(int tfIdf,
 
     public static final int POSITIONS_SHIFT = 32;
 
-    public EdgePageWordMetadata() {
+    public WordMetadata() {
        this(emptyValue());
     }
 
-    public EdgePageWordMetadata(long value) {
+    public WordMetadata(long value) {
        this(
            (int)((value >>> TF_IDF_SHIFT) & TF_IDF_MASK),
            (int)(value >>> POSITIONS_SHIFT),
@@ -35,10 +35,10 @@ public record EdgePageWordMetadata(int tfIdf,
        );
     }
 
-    public EdgePageWordMetadata(int tfIdf,
+    public WordMetadata(int tfIdf,
                        int positions,
                        int count,
                        Set<EdgePageWordFlags> flags)
    {
        this(tfIdf, positions, count, encodeFlags(flags));
    }

@@ -1,64 +0,0 @@
-package nu.marginalia.model;
-
-import nu.marginalia.model.crawl.EdgePageWordFlags;
-import nu.marginalia.model.idx.EdgePageWordMetadata;
-import org.junit.jupiter.api.Test;
-
-import java.util.EnumSet;
-
-import static org.junit.jupiter.api.Assertions.assertEquals;
-
-class EdgePageWordMetadataTest {
-
-    @Test
-    public void codecTest() {
-        verifyCodec("Vanilla case", new EdgePageWordMetadata(32, 0x7f0f0000, 1, EnumSet.allOf(EdgePageWordFlags.class)));
-        verifyCodec("Position high", new EdgePageWordMetadata(32, 0xff0f0000, 1, EnumSet.allOf(EdgePageWordFlags.class)));
-        verifyCodec("No flags", new EdgePageWordMetadata(32, 0xff0f0000, 1, EnumSet.noneOf(EdgePageWordFlags.class)));
-        System.out.println(new EdgePageWordMetadata(32, 0x7f0f0005, 1, EnumSet.allOf(EdgePageWordFlags.class)));
-        System.out.println(new EdgePageWordMetadata(32, 0xff0f0013, 1, EnumSet.noneOf(EdgePageWordFlags.class)));
-    }
-
-    @Test
-    public void testClampTfIdfLow() {
-        var original = new EdgePageWordMetadata(0x8000FFFF, 0, 1, EnumSet.noneOf(EdgePageWordFlags.class));
-        var encoded = new EdgePageWordMetadata(original.encode());
-
-        assertEquals(original.positions(), encoded.positions());
-        assertEquals(0, encoded.tfIdf());
-    }
-
-    @Test
-    public void testClampTfIdfHigh() {
-        var original = new EdgePageWordMetadata(0x7000FFFF, 0, 1, EnumSet.noneOf(EdgePageWordFlags.class));
-        var encoded = new EdgePageWordMetadata(original.encode());
-
-        assertEquals(original.positions(), encoded.positions());
-        assertEquals(65535, encoded.tfIdf());
-    }
-
-    @Test
-    public void testClampCountLow() {
-        var original = new EdgePageWordMetadata(40, 0, -1, EnumSet.noneOf(EdgePageWordFlags.class));
-        var encoded = new EdgePageWordMetadata(original.encode());
-
-        assertEquals(original.positions(), encoded.positions());
-        assertEquals(0, encoded.count());
-    }
-
-    @Test
-    public void testClampCountHigh() {
-        var original = new EdgePageWordMetadata(40, 0, 17, EnumSet.noneOf(EdgePageWordFlags.class));
-        var encoded = new EdgePageWordMetadata(original.encode());
-
-        assertEquals(original.positions(), encoded.positions());
-        assertEquals(15, encoded.count());
-    }
-
-
-    public void verifyCodec(String message, EdgePageWordMetadata data) {
-        assertEquals(data, new EdgePageWordMetadata(data.encode()), message);
-    }
-
-
-}

@@ -0,0 +1,64 @@
+package nu.marginalia.model;
+
+import nu.marginalia.model.crawl.EdgePageWordFlags;
+import nu.marginalia.model.idx.WordMetadata;
+import org.junit.jupiter.api.Test;
+
+import java.util.EnumSet;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+class WordMetadataTest {
+
+    @Test
+    public void codecTest() {
+        verifyCodec("Vanilla case", new WordMetadata(32, 0x7f0f0000, 1, EnumSet.allOf(EdgePageWordFlags.class)));
+        verifyCodec("Position high", new WordMetadata(32, 0xff0f0000, 1, EnumSet.allOf(EdgePageWordFlags.class)));
+        verifyCodec("No flags", new WordMetadata(32, 0xff0f0000, 1, EnumSet.noneOf(EdgePageWordFlags.class)));
+        System.out.println(new WordMetadata(32, 0x7f0f0005, 1, EnumSet.allOf(EdgePageWordFlags.class)));
+        System.out.println(new WordMetadata(32, 0xff0f0013, 1, EnumSet.noneOf(EdgePageWordFlags.class)));
+    }
+
+    @Test
+    public void testClampTfIdfLow() {
+        var original = new WordMetadata(0x8000FFFF, 0, 1, EnumSet.noneOf(EdgePageWordFlags.class));
+        var encoded = new WordMetadata(original.encode());
+
+        assertEquals(original.positions(), encoded.positions());
+        assertEquals(0, encoded.tfIdf());
+    }
+
+    @Test
+    public void testClampTfIdfHigh() {
+        var original = new WordMetadata(0x7000FFFF, 0, 1, EnumSet.noneOf(EdgePageWordFlags.class));
+        var encoded = new WordMetadata(original.encode());
+
+        assertEquals(original.positions(), encoded.positions());
+        assertEquals(65535, encoded.tfIdf());
+    }
+
+    @Test
+    public void testClampCountLow() {
+        var original = new WordMetadata(40, 0, -1, EnumSet.noneOf(EdgePageWordFlags.class));
+        var encoded = new WordMetadata(original.encode());
+
+        assertEquals(original.positions(), encoded.positions());
+        assertEquals(0, encoded.count());
+    }
+
+    @Test
+    public void testClampCountHigh() {
+        var original = new WordMetadata(40, 0, 17, EnumSet.noneOf(EdgePageWordFlags.class));
+        var encoded = new WordMetadata(original.encode());
+
+        assertEquals(original.positions(), encoded.positions());
+        assertEquals(15, encoded.count());
+    }
+
+
+    public void verifyCodec(String message, WordMetadata data) {
+        assertEquals(data, new WordMetadata(data.encode()), message);
+    }
+
+
+}

@@ -4,6 +4,7 @@ These are packages containing the basic building blocks for running a service as
 as shared models.
 
 * [config](config/) contains some `@Inject`ables.
+* [renderer](renderer/) contains utility code for rendering website templates.
 * [service](service/) is the shared base classes for main methods and web services.
 * [service-client](service-client/) is the shared base class for RPC.
 * [service-discovery](service-discovery) contains tools that lets the services find each other.

common/renderer/readme.md (new file, 7 lines)
@@ -0,0 +1,7 @@
+# Renderer
+
+Renders handlebar-style templates for the user-facing services.
+
+## Central Classes
+
+* [Mustache Renderer](src/main/java/nu/marginalia/renderer/MustacheRenderer.java)

common/service-client/readme.md (new file, 10 lines)
@@ -0,0 +1,10 @@
+# Service Client
+
+These are base classes for all the [API](../../api) clients for talking to other [services](../service).
+
+## Central Classes
+
+* [AbstractDynamicClient](src/main/java/nu/marginalia/client/AbstractDynamicClient.java) base class for API clients
+* [AbstractClient](src/main/java/nu/marginalia/client/AbstractClient.java) handles requests at a lower level
+* [Context](src/main/java/nu/marginalia/client/Context.java) handles request tracking
+* [ContextScrambler](src/main/java/nu/marginalia/client/ContextScrambler.java) handles anonymization of public IPs
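
The concrete clients in this commit all follow the same shape when extending AbstractDynamicClient; the sketch below mirrors the IndexClient and SearchClient constructors shown earlier in this diff, with an invented ExampleClient name and an arbitrary choice of ServiceId. Imports are omitted because the packages of ServiceDescriptors, ServiceId, WmsaHome and GsonFactory are not visible in this commit.

```java
@Singleton
public class ExampleClient extends AbstractDynamicClient {

    @Inject
    public ExampleClient(ServiceDescriptors descriptors) {
        // Resolve the target service, the hosts file, and the JSON codec,
        // exactly as IndexClient and SearchClient do in this commit.
        super(descriptors.forId(ServiceId.Search), WmsaHome.getHostsFile(), GsonFactory::get);

        setTimeout(30); // IndexClient sets the same timeout explicitly
    }
}
```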

common/service-discovery/readme.md (new file, 3 lines)
@@ -0,0 +1,3 @@
+# Service Discovery
+
+Contains classes for helping services discover each other.

common/service/readme.md (new file, 9 lines)
@@ -0,0 +1,9 @@
+# Service
+
+Contains the base classes for the services. This is where port configuration,
+and common endpoints are set up.
+
+## Central Classes
+
+* [MainClass](src/main/java/nu/marginalia/service/MainClass.java) bootstraps all executables
+* [Service](src/main/java/nu/marginalia/service/server/Service.java) base class for all services.

@@ -15,7 +15,7 @@ dependencies {
     implementation project(':third-party')
     implementation project(':common:model')
     implementation project(':common:config')
-    implementation project(':libraries:misc')
+    implementation project(':libraries:guarded-regex')
     implementation project(':crawl:crawling-model')
 
     implementation libs.notnull

crawl/common/readme.md (new file, 3 lines)
@@ -0,0 +1,3 @@
+# Crawl/Common
+
+Contains model classes shared by the whole crawl-process-load ecosystem.

@@ -23,6 +23,7 @@ dependencies {
     implementation libs.bundles.slf4j
 
     implementation libs.notnull
+    implementation libs.trove
 
     testImplementation libs.bundles.slf4j.test
     testImplementation libs.bundles.junit

crawl/converting-model/readme.md (new file, 4 lines)
@@ -0,0 +1,4 @@
+# Converting Models
+
+Contains models shared by the [converting-process](../converting-process/) and
+[loading-process](../loading-process/).

@@ -3,8 +3,8 @@ package nu.marginalia.converting.instruction;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.crawl.EdgeDomainIndexingState;
-import nu.marginalia.model.idx.EdgePageDocumentsMetadata;
-import nu.marginalia.model.crawl.DocumentKeywords;
+import nu.marginalia.model.idx.DocumentMetadata;
+import nu.marginalia.converting.model.DocumentKeywords;
 import nu.marginalia.converting.instruction.instructions.DomainLink;
 import nu.marginalia.converting.instruction.instructions.LoadProcessedDocument;
 import nu.marginalia.converting.instruction.instructions.LoadProcessedDocumentWithError;
@@ -19,7 +19,7 @@ public interface Interpreter {
     void loadProcessedDocument(LoadProcessedDocument loadProcessedDocument);
     void loadProcessedDocumentWithError(LoadProcessedDocumentWithError loadProcessedDocumentWithError);
 
-    void loadKeywords(EdgeUrl url, EdgePageDocumentsMetadata metadata, DocumentKeywords words);
+    void loadKeywords(EdgeUrl url, DocumentMetadata metadata, DocumentKeywords words);
 
     void loadDomainRedirect(DomainLink link);
 }

@@ -1,13 +1,13 @@
 package nu.marginalia.converting.instruction.instructions;
 
-import nu.marginalia.model.crawl.DocumentKeywords;
-import nu.marginalia.model.idx.EdgePageDocumentsMetadata;
+import nu.marginalia.converting.model.DocumentKeywords;
+import nu.marginalia.model.idx.DocumentMetadata;
 import nu.marginalia.converting.instruction.Instruction;
 import nu.marginalia.converting.instruction.InstructionTag;
 import nu.marginalia.converting.instruction.Interpreter;
 import nu.marginalia.model.EdgeUrl;
 
-public record LoadKeywords(EdgeUrl url, EdgePageDocumentsMetadata metadata, DocumentKeywords words) implements Instruction {
+public record LoadKeywords(EdgeUrl url, DocumentMetadata metadata, DocumentKeywords words) implements Instruction {
 
     @Override
     public void apply(Interpreter interpreter) {

@@ -1,7 +1,7 @@
-package nu.marginalia.model.crawl;
+package nu.marginalia.converting.model;
 
 
-import nu.marginalia.model.idx.EdgePageWordMetadata;
+import nu.marginalia.model.idx.WordMetadata;
 
 import java.util.Arrays;
 
@@ -9,7 +9,7 @@ public record DocumentKeywords(
         String[] keywords,
         long[] metadata) {
 
-    public DocumentKeywords(EdgePageWords words) {
+    DocumentKeywords(DocumentKeywordsBuilder words) {
         this(words.words.toArray(String[]::new),
              words.metadata.toArray());
     }
@@ -22,7 +22,7 @@ public record DocumentKeywords(
         for (int i = 0; i < keywords.length; i++) {
             sb.append("\n\t ");
             if (metadata[i] != 0) {
-                sb.append(keywords[i]).append("/").append(new EdgePageWordMetadata(metadata[i]));
+                sb.append(keywords[i]).append("/").append(new WordMetadata(metadata[i]));
             }
             else {
                 sb.append(keywords[i]);

@@ -1,29 +1,30 @@
-package nu.marginalia.model.crawl;
+package nu.marginalia.converting.model;
 
 import gnu.trove.list.array.TLongArrayList;
 import lombok.Getter;
 import lombok.ToString;
+import nu.marginalia.model.crawl.EdgePageWordFlags;
 
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.List;
-import java.util.Set;
+import java.util.*;
 import java.util.function.UnaryOperator;
 
 @ToString @Getter
-public class EdgePageWords {
+public class DocumentKeywordsBuilder {
     public final ArrayList<String> words = new ArrayList<>();
     public final TLongArrayList metadata = new TLongArrayList();
 
-    public EdgePageWords() {
+    public DocumentKeywordsBuilder() {
     }
 
-    public EdgePageWords(int cacpacity) {
+    public DocumentKeywords build() {
+        return new DocumentKeywords(this);
+    }
 
+    public DocumentKeywordsBuilder(int cacpacity) {
        words.ensureCapacity(cacpacity);
        metadata.ensureCapacity(cacpacity);
     }
 
-    public EdgePageWords(Collection<Entry> initial) {
+    public DocumentKeywordsBuilder(Collection<Entry> initial) {
 
        words.ensureCapacity(initial.size());
        metadata.ensureCapacity(initial.size());
@@ -33,14 +34,14 @@ public class EdgePageWords {
         }
     }
 
-    public static EdgePageWords withBlankMetadata(List<String> entries) {
+    public static DocumentKeywordsBuilder withBlankMetadata(List<String> entries) {
         List<Long> emptyMeta = new ArrayList<>(entries.size());
 
         for (int i = 0; i < entries.size(); i++) {
             emptyMeta.add(0L);
         }
 
-        return new EdgePageWords(entries, emptyMeta);
+        return new DocumentKeywordsBuilder(entries, emptyMeta);
     }
 
     public void addJustNoMeta(String word) {
@@ -48,7 +49,7 @@ public class EdgePageWords {
         metadata.add(0);
     }
 
-    private EdgePageWords(List<String> words, List<Long> meta) {
+    private DocumentKeywordsBuilder(List<String> words, List<Long> meta) {
 
         this.words.addAll(words);
         this.metadata.addAll(meta);
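
A minimal usage sketch of the builder/record split introduced above, using only members visible in this commit (withBlankMetadata, addJustNoMeta, build, and the keywords record component); the surrounding class and the example keywords are invented for illustration.

```java
import java.util.List;

import nu.marginalia.converting.model.DocumentKeywords;
import nu.marginalia.converting.model.DocumentKeywordsBuilder;

class DocumentKeywordsBuilderSketch {
    public static void main(String[] args) {
        // Start from a list of words with blank (zero) metadata.
        DocumentKeywordsBuilder builder = DocumentKeywordsBuilder.withBlankMetadata(List.of("marginalia", "search"));
        builder.addJustNoMeta("wiby");

        // build() freezes the accumulated words and metadata into the immutable record.
        DocumentKeywords keywords = builder.build();
        System.out.println(keywords.keywords().length); // 3
    }
}
```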

@@ -21,7 +21,9 @@ dependencies {
     implementation project(':common:model')
     implementation project(':common:service')
     implementation project(':common:config')
-    implementation project(':libraries:misc')
+    implementation project(':libraries:guarded-regex')
+    implementation project(':libraries:easy-lsh')
+    implementation project(':libraries:big-string')
     implementation project(':api:index-api')
     implementation project(':common:service-discovery')
     implementation project(':common:service-client')

@@ -2,9 +2,9 @@ package nu.marginalia.converting;
 
 import com.github.luben.zstd.ZstdOutputStream;
 import nu.marginalia.model.crawl.EdgeDomainIndexingState;
-import nu.marginalia.model.idx.EdgePageDocumentsMetadata;
+import nu.marginalia.model.idx.DocumentMetadata;
 import nu.marginalia.converting.instruction.Interpreter;
-import nu.marginalia.model.crawl.DocumentKeywords;
+import nu.marginalia.converting.model.DocumentKeywords;
 import nu.marginalia.converting.instruction.instructions.DomainLink;
 import nu.marginalia.converting.instruction.instructions.LoadProcessedDocument;
 import nu.marginalia.converting.instruction.instructions.LoadProcessedDocumentWithError;
@@ -60,7 +60,7 @@ public class ConversionLog implements AutoCloseable, Interpreter {
     }
 
     @Override
-    public void loadKeywords(EdgeUrl url, EdgePageDocumentsMetadata metadata, DocumentKeywords words) {}
+    public void loadKeywords(EdgeUrl url, DocumentMetadata metadata, DocumentKeywords words) {}
 
     @Override
     public void loadDomainRedirect(DomainLink link) {}

@@ -3,10 +3,10 @@ package nu.marginalia.converting;
 import com.github.luben.zstd.ZstdOutputStream;
 import com.google.gson.Gson;
 import nu.marginalia.model.crawl.EdgeDomainIndexingState;
-import nu.marginalia.model.idx.EdgePageDocumentsMetadata;
+import nu.marginalia.model.idx.DocumentMetadata;
 import nu.marginalia.converting.instruction.Instruction;
 import nu.marginalia.converting.instruction.Interpreter;
-import nu.marginalia.model.crawl.DocumentKeywords;
+import nu.marginalia.converting.model.DocumentKeywords;
 import nu.marginalia.converting.instruction.instructions.DomainLink;
 import nu.marginalia.converting.instruction.instructions.LoadProcessedDocument;
 import nu.marginalia.converting.instruction.instructions.LoadProcessedDocumentWithError;
@@ -121,7 +121,7 @@ public class InstructionWriter {
     }
 
     @Override
-    public void loadKeywords(EdgeUrl url, EdgePageDocumentsMetadata metadata, DocumentKeywords words) {
+    public void loadKeywords(EdgeUrl url, DocumentMetadata metadata, DocumentKeywords words) {
     }
 
     @Override

@@ -1,7 +1,7 @@
 package nu.marginalia.converting.compiler;
 
 import nu.marginalia.converting.instruction.Instruction;
-import nu.marginalia.model.crawl.DocumentKeywords;
+import nu.marginalia.converting.model.DocumentKeywords;
 import nu.marginalia.converting.instruction.instructions.LoadKeywords;
 import nu.marginalia.converting.instruction.instructions.LoadProcessedDocument;
 import nu.marginalia.converting.instruction.instructions.LoadProcessedDocumentWithError;
@@ -39,7 +39,7 @@ public class DocumentsCompiler {
         var words = doc.words;
 
         if (words != null) {
-            ret.add(new LoadKeywords(doc.url, doc.details.metadata, new DocumentKeywords(words)));
+            ret.add(new LoadKeywords(doc.url, doc.details.metadata, words.build()));
         }
     }
 

@@ -2,7 +2,6 @@ package nu.marginalia.converting.model;
 
 import lombok.ToString;
 import nu.marginalia.model.crawl.EdgePageDocumentFlags;
-import nu.marginalia.model.crawl.EdgePageWords;
 import nu.marginalia.model.crawl.EdgeUrlState;
 import nu.marginalia.model.EdgeUrl;
 
@@ -13,7 +12,7 @@ public class ProcessedDocument {
     public EdgeUrl url;
 
     public ProcessedDocumentDetails details;
-    public EdgePageWords words;
+    public DocumentKeywordsBuilder words;
 
     public EdgeUrlState state;
     public String stateReason;

@@ -3,7 +3,7 @@ package nu.marginalia.converting.model;
 import lombok.ToString;
 import nu.marginalia.model.crawl.EdgeHtmlStandard;
 import nu.marginalia.model.crawl.HtmlFeature;
-import nu.marginalia.model.idx.EdgePageDocumentsMetadata;
+import nu.marginalia.model.idx.DocumentMetadata;
 import nu.marginalia.model.EdgeUrl;
 
 import javax.annotation.Nullable;
@@ -29,5 +29,5 @@ public class ProcessedDocumentDetails {
     public List<EdgeUrl> linksExternal;
     public List<EdgeUrl> feedLinks;
 
-    public EdgePageDocumentsMetadata metadata;
+    public DocumentMetadata metadata;
 }

@@ -7,7 +7,7 @@ import nu.marginalia.language.model.DocumentLanguageData;
 import nu.marginalia.language.model.KeywordMetadata;
 import nu.marginalia.language.model.WordRep;
 import nu.marginalia.model.crawl.EdgePageWordFlags;
-import nu.marginalia.model.crawl.EdgePageWords;
+import nu.marginalia.converting.model.DocumentKeywordsBuilder;
 import nu.marginalia.language.statistics.TermFrequencyDict;
 
 import javax.inject.Inject;
@@ -33,7 +33,7 @@ public class DocumentKeywordExtractor {
     }
 
 
-    public EdgePageWords extractKeywordsMinimal(DocumentLanguageData documentLanguageData, KeywordMetadata keywordMetadata) {
+    public DocumentKeywordsBuilder extractKeywordsMinimal(DocumentLanguageData documentLanguageData, KeywordMetadata keywordMetadata) {
 
         List<WordRep> titleWords = extractTitleWords(documentLanguageData);
         List<WordRep> wordsNamesAll = nameCounter.count(documentLanguageData, 2);
@@ -45,7 +45,7 @@ public class DocumentKeywordExtractor {
 
         List<String> artifacts = getArtifacts(documentLanguageData);
 
-        WordsBuilder wordsBuilder = new WordsBuilder();
+        FilteringDocumentKeywordsBuilder wordsBuilder = new FilteringDocumentKeywordsBuilder();
 
         createWords(wordsBuilder, keywordMetadata, titleWords, 0);
         artifacts.forEach(wordsBuilder::addWithBlankMetadata);
@@ -53,7 +53,7 @@ public class DocumentKeywordExtractor {
         return wordsBuilder.build();
     }
 
-    public EdgePageWords extractKeywords(DocumentLanguageData documentLanguageData, KeywordMetadata keywordMetadata) {
+    public DocumentKeywordsBuilder extractKeywords(DocumentLanguageData documentLanguageData, KeywordMetadata keywordMetadata) {
 
         List<WordRep> titleWords = extractTitleWords(documentLanguageData);
 
@@ -71,7 +71,7 @@ public class DocumentKeywordExtractor {
 
         List<String> artifacts = getArtifacts(documentLanguageData);
 
-        WordsBuilder wordsBuilder = new WordsBuilder();
+        FilteringDocumentKeywordsBuilder wordsBuilder = new FilteringDocumentKeywordsBuilder();
 
         createWords(wordsBuilder, keywordMetadata, titleWords, 0);
         createWords(wordsBuilder, keywordMetadata, wordsTfIdf, EdgePageWordFlags.TfIdfHigh.asBit());
@@ -143,7 +143,7 @@ public class DocumentKeywordExtractor {
     }
 
 
-    private void getSimpleWords(WordsBuilder wordsBuilder, KeywordMetadata metadata, DocumentLanguageData documentLanguageData) {
+    private void getSimpleWords(FilteringDocumentKeywordsBuilder wordsBuilder, KeywordMetadata metadata, DocumentLanguageData documentLanguageData) {
 
         EnumSet<EdgePageWordFlags> flagsTemplate = EnumSet.noneOf(EdgePageWordFlags.class);
 
@@ -207,10 +207,10 @@ public class DocumentKeywordExtractor {
                 .collect(Collectors.toList());
     }
 
-    public void createWords(WordsBuilder wordsBuilder,
+    public void createWords(FilteringDocumentKeywordsBuilder wordsBuilder,
                             KeywordMetadata metadata,
                             Collection<WordRep> words,
                             long additionalMeta) {
 
         for (var word : words) {
 
@@ -223,8 +223,8 @@ public class DocumentKeywordExtractor {
         }
     }
 
-    private static class WordsBuilder {
-        private final EdgePageWords words = new EdgePageWords(1600);
+    private static class FilteringDocumentKeywordsBuilder {
+        private final DocumentKeywordsBuilder words = new DocumentKeywordsBuilder(1600);
         private final Set<String> seen = new HashSet<>(1600);
 
         public void add(String word, long meta) {
@@ -238,7 +238,7 @@ public class DocumentKeywordExtractor {
         }
     }
 
-        public EdgePageWords build() {
+        public DocumentKeywordsBuilder build() {
             return words;
         }
 

@@ -5,7 +5,7 @@ import nu.marginalia.crawling.model.CrawledDomain;
 import nu.marginalia.language.LanguageFilter;
 import nu.marginalia.language.model.DocumentLanguageData;
 import nu.marginalia.model.crawl.EdgeHtmlStandard;
-import nu.marginalia.model.crawl.EdgePageWords;
+import nu.marginalia.converting.model.DocumentKeywordsBuilder;
 import nu.marginalia.model.crawl.PubDate;
 import nu.marginalia.converting.model.DisqualifiedException;
 import nu.marginalia.converting.model.ProcessedDocumentDetails;
@@ -31,7 +31,7 @@ public abstract class AbstractDocumentProcessorPlugin {
     protected static class MetaTagsBuilder {
         private final Set<String> tagWords = new HashSet<>();
 
-        public void build(EdgePageWords dest) {
+        public void build(DocumentKeywordsBuilder dest) {
             dest.addAllSyntheticTerms(tagWords);
         }
 
@@ -84,5 +84,5 @@ public abstract class AbstractDocumentProcessorPlugin {
 
 
     public record DetailsWithWords(ProcessedDocumentDetails details,
-                                   EdgePageWords words) {}
+                                   DocumentKeywordsBuilder words) {}
 }

@@ -10,8 +10,8 @@ import nu.marginalia.converting.processor.keywords.DocumentKeywordExtractor;
 import nu.marginalia.language.sentence.SentenceExtractor;
 import nu.marginalia.model.crawl.EdgeHtmlStandard;
 import nu.marginalia.model.crawl.EdgePageDocumentFlags;
-import nu.marginalia.model.crawl.EdgePageWords;
-import nu.marginalia.model.idx.EdgePageDocumentsMetadata;
+import nu.marginalia.converting.model.DocumentKeywordsBuilder;
+import nu.marginalia.model.idx.DocumentMetadata;
 import nu.marginalia.language.model.DocumentLanguageData;
 import nu.marginalia.converting.processor.logic.*;
 import nu.marginalia.model.crawl.PubDate;
@@ -121,9 +121,9 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
         ret.hashCode = dld.localitySensitiveHashCode();
 
         PubDate pubDate = pubDateSniffer.getPubDate(crawledDocument.headers, url, doc, ret.standard, true);
-        ret.metadata = new EdgePageDocumentsMetadata(url.depth(), pubDate.yearByte(), 0, (int) -ret.quality, EnumSet.noneOf(EdgePageDocumentFlags.class));
+        ret.metadata = new DocumentMetadata(url.depth(), pubDate.yearByte(), 0, (int) -ret.quality, EnumSet.noneOf(EdgePageDocumentFlags.class));
 
-        EdgePageWords words = keywordExtractor.extractKeywords(dld, keywordMetadata);
+        DocumentKeywordsBuilder words = keywordExtractor.extractKeywords(dld, keywordMetadata);
 
         new MetaTagsBuilder()
                 .addDomainCrawlData(crawledDomain)
@@ -173,7 +173,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
     }
 
 
-    private void getLinks(EdgeUrl baseUrl, ProcessedDocumentDetails ret, Document doc, EdgePageWords words) {
+    private void getLinks(EdgeUrl baseUrl, ProcessedDocumentDetails ret, Document doc, DocumentKeywordsBuilder words) {
 
         final LinkProcessor lp = new LinkProcessor(ret, baseUrl);
 
@@ -208,7 +208,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
         createFileLinkKeywords(words, lp, domain);
     }
 
-    private void createFileLinkKeywords(EdgePageWords words, LinkProcessor lp, EdgeDomain domain) {
+    private void createFileLinkKeywords(DocumentKeywordsBuilder words, LinkProcessor lp, EdgeDomain domain) {
         Set<String> fileKeywords = new HashSet<>(100);
         for (var link : lp.getNonIndexableUrls()) {
 
@@ -241,7 +241,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
         fileKeywords.add(filename.replace(' ', '_'));
     }
 
-    private void createLinkKeywords(EdgePageWords words, LinkProcessor lp) {
+    private void createLinkKeywords(DocumentKeywordsBuilder words, LinkProcessor lp) {
         final Set<String> linkTerms = new HashSet<>();
 
         for (var fd : lp.getForeignDomains()) {

@@ -9,8 +9,8 @@ import nu.marginalia.converting.processor.keywords.DocumentKeywordExtractor;
 import nu.marginalia.language.sentence.SentenceExtractor;
 import nu.marginalia.model.crawl.EdgeHtmlStandard;
 import nu.marginalia.model.crawl.EdgePageDocumentFlags;
-import nu.marginalia.model.crawl.EdgePageWords;
-import nu.marginalia.model.idx.EdgePageDocumentsMetadata;
+import nu.marginalia.converting.model.DocumentKeywordsBuilder;
+import nu.marginalia.model.idx.DocumentMetadata;
 import nu.marginalia.model.crawl.PubDate;
 import nu.marginalia.converting.model.DisqualifiedException;
 import nu.marginalia.converting.model.ProcessedDocumentDetails;
@@ -89,10 +89,10 @@ public class PlainTextDocumentProcessorPlugin extends AbstractDocumentProcessorP
 
         final PubDate pubDate = new PubDate(LocalDate.ofYearDay(1993, 1));
 
-        ret.metadata = new EdgePageDocumentsMetadata(url.depth(), pubDate.yearByte(), 0, (int) -ret.quality, EnumSet.of(EdgePageDocumentFlags.PlainText));
+        ret.metadata = new DocumentMetadata(url.depth(), pubDate.yearByte(), 0, (int) -ret.quality, EnumSet.of(EdgePageDocumentFlags.PlainText));
 
         KeywordMetadata keywordMetadata = new KeywordMetadata();
-        EdgePageWords words = keywordExtractor.extractKeywords(dld, keywordMetadata);
+        DocumentKeywordsBuilder words = keywordExtractor.extractKeywords(dld, keywordMetadata);
 
         new MetaTagsBuilder()
                 .addDomainCrawlData(crawledDomain)

@@ -12,7 +12,7 @@ import nu.marginalia.language.keywords.KeywordExtractor;
 import nu.marginalia.language.model.WordSeparator;
 import nu.marginalia.WmsaHome;
 import nu.marginalia.model.crawl.EdgePageWordFlags;
-import nu.marginalia.model.idx.EdgePageWordMetadata;
+import nu.marginalia.model.idx.WordMetadata;
 import nu.marginalia.test.util.TestLanguageModels;
 import org.apache.commons.lang3.tuple.Pair;
 import org.jsoup.Jsoup;
@@ -128,14 +128,14 @@ class SentenceExtractorTest {
         var newResult = newSe.extractSentences(Jsoup.parse(Files.readString(file.toPath())));
         var newRes = documentKeywordExtractor.extractKeywords(newResult, new KeywordMetadata());
 
-        var terms = IntStream.range(0, newRes.size()).mapToObj(i -> Pair.of(newRes.words.get(i), new EdgePageWordMetadata(newRes.metadata.get(i))))
+        var terms = IntStream.range(0, newRes.size()).mapToObj(i -> Pair.of(newRes.words.get(i), new WordMetadata(newRes.metadata.get(i))))
                 .sorted(Comparator.comparing(e -> -e.getValue().tfIdf()))
                 .limit(100)
                 .map(Pair::getKey)
                 .toArray(String[]::new);
         System.out.println(Arrays.toString(terms));
 
-        var terms2 = IntStream.range(0, newRes.size()).mapToObj(i -> Pair.of(newRes.words.get(i), new EdgePageWordMetadata(newRes.metadata.get(i))))
+        var terms2 = IntStream.range(0, newRes.size()).mapToObj(i -> Pair.of(newRes.words.get(i), new WordMetadata(newRes.metadata.get(i))))
                 .sorted(Comparator.comparing(e -> -e.getValue().tfIdf()))
                 .filter(e -> e.getValue().hasFlag(EdgePageWordFlags.Subjects))
                 .limit(100)

crawl/crawl-job-extractor-process/readme.md (new file, 4 lines)
@@ -0,0 +1,4 @@
+# Crawl Job Extractor
+
+The crawl job extractor creates a file containing a list of domains
+along with known URLs. This is consumed by the [crawling-process](../crawling-process).

@@ -15,7 +15,7 @@ java {
 dependencies {
     implementation project(':third-party')
     implementation project(':common:model')
-    implementation project(':libraries:misc')
+    implementation project(':libraries:big-string')
     implementation project(':api:index-api')
     implementation project(':common:service-discovery')
     implementation project(':common:service-client')

crawl/crawling-model/readme.md (new file, 4 lines)
@@ -0,0 +1,4 @@
+# Crawling Models
+
+Contains models shared by the [crawling-process](../crawling-process/) and
+[converting-process](../converting-process/).

@@ -21,7 +21,7 @@ dependencies {
     implementation project(':common:model')
     implementation project(':common:config')
     implementation project(':common:service')
-    implementation project(':libraries:misc')
+    implementation project(':libraries:big-string')
     implementation project(':api:index-api')
     implementation project(':common:service-discovery')
     implementation project(':common:service-client')

@@ -16,7 +16,7 @@ dependencies {
     implementation project(':common:model')
     implementation project(':common:config')
     implementation project(':common:service')
-    implementation project(':libraries:misc')
+    implementation project(':libraries:big-string')
     implementation project(':api:index-api')
     implementation project(':common:service-discovery')
     implementation project(':common:service-client')
3
crawl/experimental/readme.md
Normal file
3
crawl/experimental/readme.md
Normal file
@ -0,0 +1,3 @@
|
|||||||
|
# Experimental
|
||||||
|
|
||||||
|
Contains tools for running classification experiments on crawl data.
|
@@ -2,9 +2,8 @@ package nu.marginalia.loading.loader;

 import com.google.inject.Inject;
 import lombok.SneakyThrows;
-import nu.marginalia.model.idx.EdgePageDocumentsMetadata;
-import nu.marginalia.client.Context;
-import nu.marginalia.model.crawl.DocumentKeywords;
+import nu.marginalia.model.idx.DocumentMetadata;
+import nu.marginalia.converting.model.DocumentKeywords;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.id.EdgeId;
 import org.slf4j.Logger;

@@ -19,7 +18,7 @@ public class IndexLoadKeywords implements Runnable {
     private final LinkedBlockingQueue<InsertTask> insertQueue = new LinkedBlockingQueue<>(32);
     private final LoaderIndexJournalWriter client;

-    private record InsertTask(int urlId, int domainId, EdgePageDocumentsMetadata metadata, DocumentKeywords wordSet) {}
+    private record InsertTask(int urlId, int domainId, DocumentMetadata metadata, DocumentKeywords wordSet) {}

     private final Thread runThread;

@@ -50,7 +49,7 @@ public class IndexLoadKeywords implements Runnable {
         }
     }

-    public void load(LoaderData loaderData, EdgeUrl url, EdgePageDocumentsMetadata metadata, DocumentKeywords words) throws InterruptedException {
+    public void load(LoaderData loaderData, EdgeUrl url, DocumentMetadata metadata, DocumentKeywords words) throws InterruptedException {
         int domainId = loaderData.getDomainId(url.domain);
         int urlId = loaderData.getUrlId(url);

@@ -1,6 +1,7 @@
-package nu.marginalia.util;
+package nu.marginalia.loading.loader;

-import nu.marginalia.model.crawl.DocumentKeywords;
+import nu.marginalia.converting.model.DocumentKeywords;

 import java.util.ArrayList;
 import java.util.Collections;

@@ -3,9 +3,9 @@ package nu.marginalia.loading.loader;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.crawl.EdgeDomainIndexingState;
-import nu.marginalia.model.idx.EdgePageDocumentsMetadata;
+import nu.marginalia.model.idx.DocumentMetadata;
 import nu.marginalia.converting.instruction.Interpreter;
-import nu.marginalia.model.crawl.DocumentKeywords;
+import nu.marginalia.converting.model.DocumentKeywords;
 import nu.marginalia.converting.instruction.instructions.DomainLink;
 import nu.marginalia.converting.instruction.instructions.LoadProcessedDocument;
 import nu.marginalia.converting.instruction.instructions.LoadProcessedDocumentWithError;

@@ -103,7 +103,7 @@ public class Loader implements Interpreter {
     }

     @Override
-    public void loadKeywords(EdgeUrl url, EdgePageDocumentsMetadata metadata, DocumentKeywords words) {
+    public void loadKeywords(EdgeUrl url, DocumentMetadata metadata, DocumentKeywords words) {
         // This is a bit of a bandaid safeguard against a bug in
         // in the converter, shouldn't be necessary in the future
         if (!deferredDomains.isEmpty()) {
@@ -3,7 +3,6 @@ package nu.marginalia.loading.loader;
 import com.google.inject.Inject;
 import com.google.inject.Singleton;
 import com.google.inject.name.Named;
-import nu.marginalia.dict.DictionaryMap;
 import nu.marginalia.dict.OffHeapDictionaryHashMap;
 import nu.marginalia.index.journal.model.IndexJournalEntryData;
 import nu.marginalia.index.journal.model.IndexJournalEntryHeader;

@@ -11,9 +10,8 @@ import nu.marginalia.index.journal.writer.IndexJournalWriterImpl;
 import nu.marginalia.index.journal.writer.IndexJournalWriter;
 import nu.marginalia.lexicon.KeywordLexicon;
 import nu.marginalia.lexicon.journal.KeywordLexiconJournal;
-import nu.marginalia.model.crawl.DocumentKeywords;
-import nu.marginalia.util.KeywordListChunker;
-import nu.marginalia.model.idx.EdgePageDocumentsMetadata;
+import nu.marginalia.converting.model.DocumentKeywords;
+import nu.marginalia.model.idx.DocumentMetadata;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.id.EdgeId;

@@ -40,7 +38,7 @@ public class LoaderIndexJournalWriter {
     }

     public void putWords(EdgeId<EdgeDomain> domain, EdgeId<EdgeUrl> url,
-                         EdgePageDocumentsMetadata metadata,
+                         DocumentMetadata metadata,
                          DocumentKeywords wordSet) {
         if (wordSet.keywords().length == 0)
             return;
@@ -95,6 +95,7 @@ services:
       - search-service
     networks:
       wmsa:
+      screenshot:
 volumes:
   db:
     driver: local
features/domain-ranking/readme.md (new file, 15 lines)
@@ -0,0 +1,15 @@
# Domain Ranking

Contains domain ranking algorithms.

## Central Classes

### Algorithms
* [RankingAlgorithm](src/main/java/nu/marginalia/ranking/RankingAlgorithm.java)
* [StandardPageRank](src/main/java/nu/marginalia/ranking/StandardPageRank.java)
* [ReversePageRank](src/main/java/nu/marginalia/ranking/ReversePageRank.java) "CheiRank"

### Data sources

* [RankingDomainFetcher](src/main/java/nu/marginalia/ranking/data/RankingDomainFetcher.java) fetches link data.
* [RankingDomainFetcherForSimilarityData](src/main/java/nu/marginalia/ranking/data/RankingDomainFetcherForSimilarityData.java) fetches website similarity data.
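The ranking algorithms listed above are PageRank-family eigenvector rankings over the domain link graph; StandardPageRank follows outgoing links, while ReversePageRank ("CheiRank") runs the same iteration over the reversed graph. As a rough, self-contained sketch of the underlying iteration — plain power iteration, not the repository's RankingAlgorithm API; the damping factor, iteration count and adjacency-list representation are assumptions:

```java
import java.util.Arrays;
import java.util.List;
import java.util.Map;

// Minimal PageRank power-iteration sketch. It ignores dangling-node mass
// redistribution and convergence checks, which a production implementation
// would handle.
class PageRankSketch {
    static double[] rank(Map<Integer, List<Integer>> outLinks, int numDomains) {
        final double damping = 0.85;   // assumed damping factor
        final int iterations = 50;     // assumed fixed iteration count

        double[] rank = new double[numDomains];
        Arrays.fill(rank, 1.0 / numDomains);

        for (int iter = 0; iter < iterations; iter++) {
            double[] next = new double[numDomains];
            Arrays.fill(next, (1.0 - damping) / numDomains);

            for (var entry : outLinks.entrySet()) {
                List<Integer> targets = entry.getValue();
                if (targets.isEmpty())
                    continue;
                // each domain passes an equal share of its rank along its outgoing links
                double share = damping * rank[entry.getKey()] / targets.size();
                for (int to : targets) {
                    next[to] += share;
                }
            }
            rank = next;
        }
        return rank;
    }
}
```

Running the same sketch over the transposed graph (swap source and target when building `outLinks`) gives the CheiRank-style reverse ranking.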
features/query-parser/readme.md (new file, 10 lines)
@@ -0,0 +1,10 @@
# Query Parser

End-user search query parsing tools used by the [search-service](../../services-core/search-service).

## Central Classes

* [QueryTokenizer](src/main/java/nu/marginalia/query_parser/QueryTokenizer.java)
* [QueryParser](src/main/java/nu/marginalia/query_parser/QueryParser.java)
* [QueryPermutations](src/main/java/nu/marginalia/query_parser/QueryVariants.java) - here be dragons
* [QueryVariants](src/main/java/nu/marginalia/query_parser/QueryVariants.java) - here be dragons
@@ -1,12 +1,11 @@
 # Features

-These are bits of code that are relatively isolated pieces of business logic,
+These are bits of search-engine related code that are relatively isolated pieces of business logic,
 that benefit from the clarity of being kept separate from the rest of the
 search engine code.

 * [domain-ranking](domain-ranking/) contains ranking algorithms.
 * [query-parser](query-parser/) contains code for parsing the user-facing query grammar.
-* [renderer](renderer/) contains utility code for rendering website templates.

 * [screenshots](screenshots/) and [random-websites](random-websites/) contains SQL queries random
 exploration mode.
@@ -5,7 +5,7 @@ import it.unimi.dsi.fastutil.longs.Long2IntOpenHashMap;
 import nu.marginalia.index.journal.reader.IndexJournalReader;
 import nu.marginalia.array.LongArray;
 import nu.marginalia.index.journal.reader.IndexJournalReaderSingleCompressedFile;
-import nu.marginalia.model.idx.EdgePageDocumentsMetadata;
+import nu.marginalia.model.idx.DocumentMetadata;
 import nu.marginalia.ranking.DomainRankings;
 import org.roaringbitmap.IntConsumer;
 import org.roaringbitmap.RoaringBitmap;

@@ -72,7 +72,7 @@ public class ForwardIndexConverter {
             long entryOffset = (long) ForwardIndexParameters.ENTRY_SIZE * docIdToIdx.get(entry.urlId());

             int ranking = domainRankings.getRanking(entry.domainId());
-            long meta = EdgePageDocumentsMetadata.encodeRank(entry.docMeta(), ranking);
+            long meta = DocumentMetadata.encodeRank(entry.docMeta(), ranking);

             docFileData.set(entryOffset + ForwardIndexParameters.METADATA_OFFSET, meta);
             docFileData.set(entryOffset + ForwardIndexParameters.DOMAIN_OFFSET, entry.domainId());

@@ -1,6 +1,6 @@
 package nu.marginalia.index.forward;

-import nu.marginalia.model.idx.EdgePageDocumentsMetadata;
+import nu.marginalia.model.idx.DocumentMetadata;
 import nu.marginalia.index.query.limit.SpecificationLimitType;
 import nu.marginalia.index.query.IndexQueryParams;
 import nu.marginalia.index.query.filter.QueryFilterStepIf;

@@ -52,7 +52,7 @@ public class ParamMatchingQueryFilter implements QueryFilterStepIf {
             return true;
         }

-        final int quality = EdgePageDocumentsMetadata.decodeQuality(post.meta());
+        final int quality = DocumentMetadata.decodeQuality(post.meta());

         return limit.test(quality);
     }

@@ -61,7 +61,7 @@ public class ParamMatchingQueryFilter implements QueryFilterStepIf {
         if (params.year().type() == SpecificationLimitType.NONE)
             return true;

-        int postVal = EdgePageDocumentsMetadata.decodeYear(post.meta());
+        int postVal = DocumentMetadata.decodeYear(post.meta());

         return params.year().test(postVal);
     }

@@ -70,7 +70,7 @@ public class ParamMatchingQueryFilter implements QueryFilterStepIf {
         if (params.size().type() == SpecificationLimitType.NONE)
             return true;

-        int postVal = EdgePageDocumentsMetadata.decodeSize(post.meta());
+        int postVal = DocumentMetadata.decodeSize(post.meta());

         return params.size().test(postVal);
     }

@@ -79,7 +79,7 @@ public class ParamMatchingQueryFilter implements QueryFilterStepIf {
         if (params.rank().type() == SpecificationLimitType.NONE)
             return true;

-        int postVal = EdgePageDocumentsMetadata.decodeRank(post.meta());
+        int postVal = DocumentMetadata.decodeRank(post.meta());

         return params.rank().test(postVal);
     }
index/index-journal/readme.md (new file, 17 lines)
@@ -0,0 +1,17 @@
# Index Journal

The index journal contains a list of entries with keywords and keyword metadata per document.

This journal is written by [crawl/loading-process](../../crawl/loading-process) and read
when constructing the [forward](../index-forward) and [reverse](../index-reverse)
indices.

## Central Classes

### Model
* [IndexJournalEntry](src/main/java/nu.marginalia.index/journal/model/IndexJournalEntry.java)
* [IndexJournalEntryHeader](src/main/java/nu.marginalia.index/journal/model/IndexJournalEntryHeader.java)
* [IndexJournalEntryData](src/main/java/nu.marginalia.index/journal/model/IndexJournalEntryData.java)
### I/O
* [IndexJournalReader](src/main/java/nu.marginalia.index/journal/reader/IndexJournalReader.java)
* [IndexJournalWriter](src/main/java/nu.marginalia.index/journal/writer/IndexJournalWriter.java)
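Putting the journal pieces together, a write roughly follows the pattern used by the converter and loader tests elsewhere in this commit: build a header for the document, pair each keyword id with its encoded word metadata, and hand both to the writer. The sketch below is assembled from that test usage; the wiring of the writer and lexicon, and the placeholder metadata values, are assumptions rather than a reference implementation.

```java
import nu.marginalia.index.journal.model.IndexJournalEntryData;
import nu.marginalia.index.journal.model.IndexJournalEntryHeader;
import nu.marginalia.index.journal.writer.IndexJournalWriter;
import nu.marginalia.lexicon.KeywordLexicon;
import nu.marginalia.model.crawl.EdgePageWordFlags;
import nu.marginalia.model.idx.DocumentMetadata;
import nu.marginalia.model.idx.WordMetadata;

import java.util.EnumSet;

class JournalWriteSketch {
    // Writes one document's keywords to the journal, mirroring the pattern in
    // this commit's integration tests. The WordMetadata arguments (tf-idf,
    // positions, count) are placeholders here.
    static void writeDocument(IndexJournalWriter writer, KeywordLexicon lexicon,
                              int domainId, int urlId, String[] keywords) {
        long combinedId = urlId | ((long) domainId << 32);   // id layout used in the tests

        var header = new IndexJournalEntryHeader(keywords.length, combinedId,
                DocumentMetadata.defaultValue());

        long[] data = new long[keywords.length * 2];
        for (int i = 0; i < keywords.length; i++) {
            data[2 * i] = lexicon.getOrInsert(keywords[i]);        // keyword id from the lexicon
            data[2 * i + 1] = new WordMetadata(1, 1, 1,
                    EnumSet.of(EdgePageWordFlags.Title)).encode(); // encoded keyword metadata
        }

        writer.put(header, new IndexJournalEntryData(data));
    }
}
```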
@@ -14,7 +14,7 @@ java {
 dependencies {
     implementation project(':libraries:array')
     implementation project(':libraries:btree')
-    implementation project(':libraries:misc')
+    implementation project(':libraries:random-write-funnel')
     implementation project(':features:domain-ranking')
     implementation project(':index:index-query')
     implementation project(':index:index-journal')

@@ -5,7 +5,7 @@ import nu.marginalia.index.journal.model.IndexJournalEntryData;
 import nu.marginalia.index.journal.model.IndexJournalStatistics;
 import nu.marginalia.index.journal.reader.IndexJournalReader;
 import nu.marginalia.ranking.DomainRankings;
-import nu.marginalia.util.RandomWriteFunnel;
+import nu.marginalia.rwf.RandomWriteFunnel;
 import nu.marginalia.array.IntArray;
 import nu.marginalia.array.LongArray;
 import nu.marginalia.array.algo.SortingContext;

@@ -2,7 +2,6 @@ package nu.marginalia.index.reverse;

 import lombok.SneakyThrows;
 import nu.marginalia.array.buffer.LongQueryBuffer;
-import nu.marginalia.dict.OffHeapDictionaryHashMap;
 import nu.marginalia.index.journal.model.IndexJournalEntry;
 import nu.marginalia.index.journal.reader.IndexJournalReaderSingleCompressedFile;
 import nu.marginalia.index.journal.writer.IndexJournalWriterImpl;

@@ -11,7 +10,7 @@ import nu.marginalia.index.reverse.query.ReverseIndexEntrySourceBehavior;
 import nu.marginalia.ranking.DomainRankings;
 import nu.marginalia.lexicon.KeywordLexicon;
 import nu.marginalia.lexicon.journal.KeywordLexiconJournal;
-import nu.marginalia.model.idx.EdgePageDocumentsMetadata;
+import nu.marginalia.model.idx.DocumentMetadata;
 import nu.marginalia.test.TestUtil;
 import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Test;

@@ -56,7 +55,7 @@ class ReverseIndexConverterTest {
     public void createEntry(IndexJournalWriter writer, KeywordLexicon keywordLexicon, int id) {
         int[] factors = IntStream.rangeClosed(1, id).filter(v -> (id % v) == 0).toArray();

-        var entryBuilder = IndexJournalEntry.builder(id, EdgePageDocumentsMetadata.defaultValue());
+        var entryBuilder = IndexJournalEntry.builder(id, DocumentMetadata.defaultValue());

         for (int i = 0; i < factors.length; i++) {
             entryBuilder.add(keywordLexicon.getOrInsert(Integer.toString(factors[i])), -factors[i]);

@@ -2,7 +2,6 @@ package nu.marginalia.index.reverse;

 import lombok.SneakyThrows;
 import nu.marginalia.array.buffer.LongQueryBuffer;
-import nu.marginalia.dict.OffHeapDictionaryHashMap;
 import nu.marginalia.index.journal.model.IndexJournalEntryData;
 import nu.marginalia.index.journal.model.IndexJournalEntryHeader;
 import nu.marginalia.index.journal.reader.IndexJournalReaderSingleCompressedFile;

@@ -21,6 +21,7 @@ dependencies {

     implementation libs.prometheus
     implementation libs.guava
+    implementation libs.fastutil

     testImplementation libs.bundles.slf4j.test
     testImplementation libs.bundles.junit
index/lexicon/readme.md (new file, 15 lines)
@@ -0,0 +1,15 @@
# Lexicon

The lexicon contains a mapping for words to identifiers. This lexicon is populated from a journal.
The actual word data isn't mapped, but rather a 64 bit hash.

The lexicon is written by [crawl/loading-process](../../crawl/loading-process) and read when
[services-core/index-service](../../services-core/index-service) interprets queries.

## Central Classes

* [KeywordLexicon](src/main/java/nu/marginalia/lexicon/KeywordLexicon.java)
* [KeywordLexiconJournal](src/main/java/nu/marginalia/lexicon/journal/KeywordLexiconJournal.java)
* [DictionaryMap](src/main/java/nu/marginalia/dict/DictionaryMap.java) comes in two versions
* * [OnHeapDictionaryMap](src/main/java/nu/marginalia/dict/OnHeapDictionaryMap.java) - basically just a fastutil Long2IntOpenHashMap
* * [OffHeapDictionaryHashMap](src/main/java/nu/marginalia/dict/OffHeapDictionaryHashMap.java) - a heavily modified trove TLongIntHashMap that uses off heap memory
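The contract suggested by the tests in this commit is that KeywordLexicon.getOrInsert hashes the incoming word and returns a stable integer id for it, which is what the journal and index store in place of the word itself. A minimal illustration of that contract follows; how the lexicon and its journal are constructed is omitted, and the exact return type is an assumption.

```java
import nu.marginalia.lexicon.KeywordLexicon;

class LexiconSketch {
    // Demonstrates the id-assignment behaviour relied on elsewhere in this
    // commit: repeated insertions of the same word yield the same id.
    static void demo(KeywordLexicon lexicon) {
        long first  = lexicon.getOrInsert("marginalia");
        long second = lexicon.getOrInsert("marginalia");
        long other  = lexicon.getOrInsert("search");

        assert first == second;   // same word, same id
        assert first != other;    // different words get different ids (barring hash collisions)
    }
}
```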
@@ -19,8 +19,6 @@ public interface IntArraySearch extends IntArrayBase {
         return LongArraySearch.encodeSearchMiss(pos - 1);
     }

-
-
     default long binarySearch(int key, long fromIndex, long toIndex) {
         long low = 0;
         long high = (toIndex - fromIndex) - 1;
libraries/big-string/build.gradle (new file, 26 lines)
@@ -0,0 +1,26 @@
plugins {
    id 'java'
}

java {
    toolchain {
        languageVersion.set(JavaLanguageVersion.of(17))
    }
}

dependencies {
    implementation libs.lombok
    annotationProcessor libs.lombok
    implementation libs.bundles.slf4j

    implementation libs.notnull
    implementation libs.lz4

    testImplementation libs.bundles.slf4j.test
    testImplementation libs.bundles.junit
    testImplementation libs.mockito
}

test {
    useJUnitPlatform()
}
libraries/big-string/readme.md (new file, 23 lines)
@@ -0,0 +1,23 @@
# Big String

Microlibrary that offers string compression. This is useful when having to load tens of thousands
of HTML documents in memory during conversion. XML has been described as the opposite of a compression scheme,
and as a result, HTML compresses ridiculously well.

## Demo

```java
List<BigString> manyBigStrings = new ArrayList<>();

for (var file : files) {
    // BigString.encode may or may not compress the string
    // depending on its size
    manyBigStrings.add(BigString.encode(readFile(file)));
}

for (var bs : manyBigStrings) {
    String decompressedString = bs.decompress();
    byte[] bytes = bs.getBytes();
    int len = bs.getLength();
}
```
libraries/easy-lsh/build.gradle (new file, 23 lines)
@@ -0,0 +1,23 @@
plugins {
    id 'java'
}

java {
    toolchain {
        languageVersion.set(JavaLanguageVersion.of(17))
    }
}

dependencies {
    implementation libs.lombok
    annotationProcessor libs.lombok
    implementation libs.bundles.slf4j

    testImplementation libs.bundles.slf4j.test
    testImplementation libs.bundles.junit
    testImplementation libs.mockito
}

test {
    useJUnitPlatform()
}
libraries/easy-lsh/readme.md (new file, 66 lines)
@@ -0,0 +1,66 @@
# Easy LSH

This is a simple [Locality-Sensitive Hash](https://en.wikipedia.org/wiki/Locality-sensitive_hashing)
for document deduplication. Hashes are compared using their Hamming distance.

## Central Classes

* [EasyLSH](src/main/java/nu/marginalia/lsh/EasyLSH.java)

## Demo

Consider statistical distribution only

```java
var lsh1 = new EasyLSH();
lsh1.addUnordered("lorem");
lsh1.addUnordered("ipsum");
lsh1.addUnordered("dolor");
lsh1.addUnordered("sit");
lsh1.addUnordered("amet");

long hash1 = lsh1.get();

var lsh2 = new EasyLSH();
lsh2.addUnordered("amet");
lsh2.addUnordered("ipsum");
lsh2.addUnordered("lorem");
lsh2.addUnordered("dolor");
lsh2.addUnordered("SEAT");

long hash2 = lsh2.get();

System.out.println(EasyLSH.hammingDistance(lsh1, lsh2));
// 1 -- these are similar
```

Consider order as well as distribution

```java
var lsh1 = new EasyLSH();
lsh1.addOrdered("lorem");
lsh1.addOrdered("ipsum");
lsh1.addOrdered("dolor");
lsh1.addOrdered("sit");
lsh1.addOrdered("amet");

long hash1 = lsh1.get();

var lsh2 = new EasyLSH();
lsh2.addOrdered("amet");
lsh2.addOrdered("ipsum");
lsh2.addOrdered("lorem");
lsh2.addOrdered("dolor");
lsh2.addOrdered("SEAT");

long hash2 = lsh2.get();

System.out.println(EasyLSH.hammingDistance(lsh1, lsh2));
// 5 -- these are not very similar

// note the value is relatively low because there are few words
// and there simply can't be very many differences
// it will approach 32 as documents grow larger
```
@@ -18,8 +18,6 @@ public class EasyLSH {
     private static final int SHINGLING = 2;
     static { assert Integer.bitCount(SHINGLING) == 1; }

-
-
     private final int[] fields = new int[64];
     private final int[] prevHashes = new int[SHINGLING];
     private int prevHashIdx = 0;

@@ -37,7 +35,7 @@ public class EasyLSH {
     }

     public void addHashUnordered(int hashCode) {
-        int value = 1- (hashCode & 2);
+        int value = 1 - (hashCode & 2);

         // Try to extract all the remaining entropy
         // into selecting the field to update
// into selecting the field to update
|
@ -6,6 +6,28 @@ import org.junit.jupiter.api.Test;
|
|||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
|
|
||||||
class EasyLSHTest {
|
class EasyLSHTest {
|
||||||
|
@Test
|
||||||
|
public void testDemo() {
|
||||||
|
var lsh1 = new EasyLSH();
|
||||||
|
lsh1.addOrdered("lorem");
|
||||||
|
lsh1.addOrdered("ipsum");
|
||||||
|
lsh1.addOrdered("dolor");
|
||||||
|
lsh1.addOrdered("sit");
|
||||||
|
lsh1.addOrdered("amet");
|
||||||
|
|
||||||
|
long hash1 = lsh1.get();
|
||||||
|
|
||||||
|
var lsh2 = new EasyLSH();
|
||||||
|
lsh2.addOrdered("amet");
|
||||||
|
lsh2.addOrdered("ipsum");
|
||||||
|
lsh2.addOrdered("lorem");
|
||||||
|
lsh2.addOrdered("dolor");
|
||||||
|
lsh2.addOrdered("SEAT");
|
||||||
|
|
||||||
|
long hash2 = lsh2.get();
|
||||||
|
|
||||||
|
System.out.println(EasyLSH.hammingDistance(lsh1, lsh2));
|
||||||
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testEZLSH() {
|
public void testEZLSH() {
|
libraries/guarded-regex/build.gradle (new file, 25 lines)
@@ -0,0 +1,25 @@
plugins {
    id 'java'
}

java {
    toolchain {
        languageVersion.set(JavaLanguageVersion.of(17))
    }
}

dependencies {
    implementation libs.lombok
    annotationProcessor libs.lombok
    implementation libs.bundles.slf4j

    implementation libs.notnull

    testImplementation libs.bundles.slf4j.test
    testImplementation libs.bundles.junit
    testImplementation libs.mockito
}

test {
    useJUnitPlatform()
}
libraries/guarded-regex/readme.md (new file, 37 lines)
@@ -0,0 +1,37 @@
# Guarded Regex

This is a simple library for creating guarded regular expressions. Pattern matching in Java
is pretty slow even with compiled regular expressions.

Guarding them with a `startsWith()`, `endsWith()` or `contains()` can be an order of magnitude
faster, but leads to an unfortunate spreading out of the logic across the pattern and the guard
condition.

Guarded regexes aim to fix this. Instead of code like

```java
Pattern pattern = Pattern.compile("[123]?foo(bar|baz){2,5}");

void ifTheThingDoTheThing(String str) {
    if (str.contains("foo") && pattern.matcher(str).matches()) {
        doTheThing();
    }
}
```

you get the more expressive variant

```java
GuardedRegex thingPredicate =
    GuardedRegexFactory.contains("foo", "[123]?foo(bar|baz){2,5}");

void ifTheThingDoTheThing(String str) {
    if (thingPredicate.test(str)) {
        doTheThing();
    }
}
```

## Central Classes

* [GuardedRegexFactory](src/main/java/nu/marginalia/gregex/GuardedRegexFactory.java)
@@ -18,7 +18,7 @@ dependencies {
     implementation project(':third-party')
     implementation project(':common:model')
     implementation project(':common:config')
-    implementation project(':libraries:misc')
+    implementation project(':libraries:easy-lsh')

     implementation libs.lombok
     annotationProcessor libs.lombok

@@ -1,6 +1,6 @@
 package nu.marginalia.language.model;

-import nu.marginalia.model.idx.EdgePageWordMetadata;
+import nu.marginalia.model.idx.WordMetadata;
 import nu.marginalia.model.crawl.EdgePageWordFlags;

 import java.util.EnumSet;

@@ -44,7 +44,7 @@ public record KeywordMetadata(HashSet<String> titleKeywords,

         int positions = positionMask.getOrDefault(stemmed, 0);

-        return new EdgePageWordMetadata(tfidf.tfIdfNormalized(), positions, tfidf.count(), flags).encode();
+        return new WordMetadata(tfidf.tfIdfNormalized(), positions, tfidf.count(), flags).encode();
     }

 }

@@ -17,7 +17,7 @@ dependencies {
     implementation libs.bundles.slf4j

     implementation libs.notnull
-    implementation libs.lz4
+
     implementation libs.fastutil

     testImplementation libs.bundles.slf4j.test
libraries/random-write-funnel/build.gradle (new file, 24 lines)
@@ -0,0 +1,24 @@
plugins {
    id 'java'
}

java {
    toolchain {
        languageVersion.set(JavaLanguageVersion.of(17))
    }
}

dependencies {

    implementation libs.lombok
    annotationProcessor libs.lombok
    implementation libs.bundles.slf4j

    testImplementation libs.bundles.slf4j.test
    testImplementation libs.bundles.junit
    testImplementation libs.mockito
}

test {
    useJUnitPlatform()
}
libraries/random-write-funnel/readme.md (new file, 31 lines)
@@ -0,0 +1,31 @@
# Random Write Funnel

This micro-library solves the problem of [write amplification](https://en.wikipedia.org/wiki/Write_amplification) when
writing large files out of order to disk. It does this by bucketing the writes into several temporary files,
which are then evaluated to construct the larger file with a more predictable order of writes.

Even though it effectively writes 2.5x as much data to disk as simply attempting to
construct the file directly, it is *much* faster than thrashing an SSD with dozens of gigabytes
of small random writes.

## Demo
```java
try (var rwf = new RandomWriteFunnel(tmpPath, expectedSize);
     var out = Files.newByteChannel(outputFile, StandardOpenOption.WRITE))
{
    rwf.put(addr1, data1);
    rwf.put(addr2, data2);
    // ...
    rwf.put(addr1e33, data1e33);

    rwf.write(out);
}
catch (IOException ex) {
    //
}
```

## Central Classes

* [RandomWriteFunnel](src/main/java/nu/marginalia/rwf/RandomWriteFunnel.java)
@@ -1,4 +1,4 @@
-package nu.marginalia.util;
+package nu.marginalia.rwf;

 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -1,33 +1,43 @@
-package nu.marginalia.index.service.util;
+package nu.marginalia.rwf;

-import nu.marginalia.util.RandomWriteFunnel;
+import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Test;

-import java.io.File;
 import java.io.IOException;
 import java.io.RandomAccessFile;
+import java.nio.file.Files;
 import java.nio.file.Path;
+import java.nio.file.StandardOpenOption;

 import static org.junit.jupiter.api.Assertions.assertEquals;

 class RandomWriteFunnelTest {

+    Path testFile;
+    @BeforeEach
+    public void setUp() throws IOException {
+        testFile = Files.createTempFile(getClass().getSimpleName(), "bin");
+    }
+    @AfterEach
+    public void tearDown() throws IOException {
+        Files.delete(testFile);
+    }
     @Test
     public void test() {
-        new File("/tmp/test.bin").delete();
         try (var funnel = new RandomWriteFunnel(Path.of("/tmp"), 5001);
-             var out = new RandomAccessFile("/tmp/test.bin", "rw")) {
+             var out = Files.newByteChannel(testFile, StandardOpenOption.WRITE)) {
             for (int i = 10_000-1; i >= 0; i--) {
                 System.out.println(i);
                 funnel.put(i, 10_000-i);
             }
-            funnel.write(out.getChannel());
+            funnel.write(out);

         } catch (Exception e) {
             e.printStackTrace();
         }

-        try (var in = new RandomAccessFile("/tmp/test.bin", "r")) {
+        try (var in = new RandomAccessFile(testFile.toFile(), "r")) {
             for (int i = 0; i < 10_000; i++) {
                 assertEquals(10_000-i, in.readLong());
             }

@@ -38,20 +48,19 @@ class RandomWriteFunnelTest {

     @Test
     public void testSparse() {
-        new File("/tmp/test.bin").delete();
         for (int j = 1; j <= 20; j++) {
             try (var funnel = new RandomWriteFunnel(Path.of("/tmp"), j);
-                 var out = new RandomAccessFile("/tmp/test.bin", "rw")) {
+                 var out = Files.newByteChannel(testFile, StandardOpenOption.WRITE)) {
                 for (int i = 10 - 1; i >= 0; i -= 2) {
                     funnel.put(i, 10 - i);
                 }
-                funnel.write(out.getChannel());
+                funnel.write(out);

             } catch (Exception e) {
                 e.printStackTrace();
             }

-            try (var in = new RandomAccessFile("/tmp/test.bin", "r")) {
+            try (var in = new RandomAccessFile(testFile.toFile(), "r")) {
                 assertEquals(0, in.readLong());
                 assertEquals(9, in.readLong());
                 assertEquals(0, in.readLong());

@@ -71,20 +80,19 @@ class RandomWriteFunnelTest {

     @Test
     public void testYuge() {
-        new File("/tmp/test.bin").delete();
         for (int j = 1; j <= 20; j++) {
             try (var funnel = new RandomWriteFunnel(Path.of("/tmp"), j);
-                 var out = new RandomAccessFile("/tmp/test.bin", "rw")) {
+                 var out = Files.newByteChannel(testFile, StandardOpenOption.WRITE)) {
                 for (int i = 10 - 1; i >= 0; i -= 2) {
                     funnel.put(i, Long.MAX_VALUE - i);
                 }
-                funnel.write(out.getChannel());
+                funnel.write(out);

             } catch (Exception e) {
                 e.printStackTrace();
             }

-            try (var in = new RandomAccessFile("/tmp/test.bin", "r")) {
+            try (var in = new RandomAccessFile(testFile.toFile(), "r")) {
                 in.readLong();
                 in.readLong();
                 in.readLong();
@@ -6,4 +6,15 @@ These are libraries that are not strongly coupled to the search engine.
 bad support for. It's designed to be able to easily replaced when *Java's Foreign Function And Memory API* is released.
 * The [btree](btree/) library offers a static BTree implementation based on the array library.
 * [language-processing](language-processing/) contains primitives for sentence extraction and POS-tagging.
+
+## Micro libraries
+
+* [easy-lsh](easy-lsh/) is a simple locality-sensitive hash for document deduplication
+* [guarded-regex](guarded-regex/) makes predicated regular expressions clearer
+* [big-string](big-string/) offers seamless string compression
+* [random-write-funnel](random-write-funnel/) is a tool for reducing write amplification when constructing
+large files out of order.
+
+## The rest
+
 * [misc](misc/) is just random bits and bobs that didn't fit anywhere.
other/readme.md (new file, 5 lines)
@@ -0,0 +1,5 @@
# Other

This code will be moved to a separate repository.

Nothing to see here, move along.
@@ -7,7 +7,7 @@ import it.unimi.dsi.fastutil.ints.IntArrayList;
 import it.unimi.dsi.fastutil.longs.Long2LongOpenHashMap;
 import nu.marginalia.index.svc.SearchTermsService;
 import nu.marginalia.model.crawl.EdgePageWordFlags;
-import nu.marginalia.model.idx.EdgePageWordMetadata;
+import nu.marginalia.model.idx.WordMetadata;
 import nu.marginalia.index.query.limit.QueryStrategy;
 import nu.marginalia.index.client.model.results.EdgeSearchResultItem;
 import nu.marginalia.index.client.model.results.EdgeSearchResultKeywordScore;

@@ -175,17 +175,17 @@ public class IndexResultValuator {
             return 1000;
         }

-        positions = EdgePageWordMetadata.decodePositions(meta);
+        positions = WordMetadata.decodePositions(meta);

         maskDirectRaw &= positions;

-        if (positions != 0 && !EdgePageWordMetadata.hasAnyFlags(meta, flagBitMask)) {
+        if (positions != 0 && !WordMetadata.hasAnyFlags(meta, flagBitMask)) {
             maskAdjacent &= (positions | (positions << 1) | (positions >>> 1));
             maskDirectGenerous &= positions;
         }

         termCount++;
-        tfIdfSum += EdgePageWordMetadata.decodeTfidf(meta);
+        tfIdfSum += WordMetadata.decodeTfidf(meta);
     }

     double avgTfIdf = termCount / tfIdfSum;
DocumentMetadataTest (new file, 77 lines)
@@ -0,0 +1,77 @@
package nu.marginalia.index.model;

import nu.marginalia.model.crawl.EdgePageDocumentFlags;
import nu.marginalia.model.idx.DocumentMetadata;
import org.junit.jupiter.api.Test;

import java.util.EnumSet;

import static org.junit.jupiter.api.Assertions.assertEquals;

class DocumentMetadataTest {

    @Test
    public void codecYear() {
        var meta = new DocumentMetadata(0, 0, 0, 192, 0, 0, (byte) 0);
        long encoded = meta.encode();
        var decoded = new DocumentMetadata(encoded);
        assertEquals(192, decoded.year());
    }

    @Test
    public void codecTopology() {
        var meta = new DocumentMetadata(0, 0, 192, 0, 0, 0, (byte) 0);
        long encoded = meta.encode();
        var decoded = new DocumentMetadata(encoded);
        assertEquals(192, decoded.topology());
    }

    @Test
    public void codecSets() {
        var meta = new DocumentMetadata(0, 0, 0, 0, 14, 0, (byte) 0);
        long encoded = meta.encode();
        var decoded = new DocumentMetadata(encoded);
        assertEquals(14, decoded.sets());
    }

    @Test
    public void codecQuality() {
        var meta = new DocumentMetadata(0, 0, 0, 0, 0, 9, (byte) 0);
        long encoded = meta.encode();
        var decoded = new DocumentMetadata(encoded);
        assertEquals(9, decoded.quality());
    }

    @Test
    public void codecFlags() {
        var meta = new DocumentMetadata(0, 0, 0, 0, 0, 0, (byte) 255);
        long encoded = meta.encode();
        System.out.println(Long.toHexString(encoded));
        var decoded = new DocumentMetadata(encoded);
        System.out.println(decoded);
        assertEquals((byte) 255, decoded.flags());
    }

    @Test
    public void encSize() {
        assertEquals(100, new DocumentMetadata(0).withSize(145).size());
        assertEquals(100, DocumentMetadata.decodeSize(new DocumentMetadata(0).withSize(145).encode()));

        assertEquals(50, new DocumentMetadata(0).withSize(4).size());
        assertEquals(50, DocumentMetadata.decodeSize(new DocumentMetadata(0).withSize(4).encode()));

        assertEquals(50 * 255, DocumentMetadata.decodeSize(new DocumentMetadata(0).withSize(Integer.MAX_VALUE).encode()));
        assertEquals(50 * 255, new DocumentMetadata(0).withSize(Integer.MAX_VALUE).size());
    }

    @Test
    public void encRank() {
        var meta = new DocumentMetadata(5, 22, 3, 8, EnumSet.noneOf(EdgePageDocumentFlags.class))
                .withSize(0xffffffff).encode();
        var enc2 = DocumentMetadata.encodeRank(meta, 83);

        assertEquals(83, DocumentMetadata.decodeRank(enc2));
        assertEquals(5, DocumentMetadata.decodeTopology(enc2));
    }
}
EdgePageDocumentsMetadataTest (deleted file, 77 lines)
@@ -1,77 +0,0 @@
package nu.marginalia.index.model;

import nu.marginalia.model.crawl.EdgePageDocumentFlags;
import nu.marginalia.model.idx.EdgePageDocumentsMetadata;
import org.junit.jupiter.api.Test;

import java.util.EnumSet;

import static org.junit.jupiter.api.Assertions.assertEquals;

class EdgePageDocumentsMetadataTest {

    @Test
    public void codecYear() {
        var meta = new EdgePageDocumentsMetadata(0, 0, 0, 192, 0, 0, (byte) 0);
        long encoded = meta.encode();
        var decoded = new EdgePageDocumentsMetadata(encoded);
        assertEquals(192, decoded.year());
    }

    @Test
    public void codecTopology() {
        var meta = new EdgePageDocumentsMetadata(0, 0, 192, 0, 0, 0, (byte) 0);
        long encoded = meta.encode();
        var decoded = new EdgePageDocumentsMetadata(encoded);
        assertEquals(192, decoded.topology());
    }

    @Test
    public void codecSets() {
        var meta = new EdgePageDocumentsMetadata(0, 0, 0, 0, 14, 0, (byte) 0);
        long encoded = meta.encode();
        var decoded = new EdgePageDocumentsMetadata(encoded);
        assertEquals(14, decoded.sets());
    }

    @Test
    public void codecQuality() {
        var meta = new EdgePageDocumentsMetadata(0, 0, 0, 0, 0, 9, (byte) 0);
        long encoded = meta.encode();
        var decoded = new EdgePageDocumentsMetadata(encoded);
        assertEquals(9, decoded.quality());
    }

    @Test
    public void codecFlags() {
        var meta = new EdgePageDocumentsMetadata(0, 0, 0, 0, 0, 0, (byte) 255);
        long encoded = meta.encode();
        System.out.println(Long.toHexString(encoded));
        var decoded = new EdgePageDocumentsMetadata(encoded);
        System.out.println(decoded);
        assertEquals((byte) 255, decoded.flags());
    }

    @Test
    public void encSize() {
        assertEquals(100, new EdgePageDocumentsMetadata(0).withSize(145).size());
        assertEquals(100, EdgePageDocumentsMetadata.decodeSize(new EdgePageDocumentsMetadata(0).withSize(145).encode()));

        assertEquals(50, new EdgePageDocumentsMetadata(0).withSize(4).size());
        assertEquals(50, EdgePageDocumentsMetadata.decodeSize(new EdgePageDocumentsMetadata(0).withSize(4).encode()));

        assertEquals(50 * 255, EdgePageDocumentsMetadata.decodeSize(new EdgePageDocumentsMetadata(0).withSize(Integer.MAX_VALUE).encode()));
        assertEquals(50 * 255, new EdgePageDocumentsMetadata(0).withSize(Integer.MAX_VALUE).size());
    }

    @Test
    public void encRank() {
        var meta = new EdgePageDocumentsMetadata(5, 22, 3, 8, EnumSet.noneOf(EdgePageDocumentFlags.class))
                .withSize(0xffffffff).encode();
        var enc2 = EdgePageDocumentsMetadata.encodeRank(meta, 83);

        assertEquals(83, EdgePageDocumentsMetadata.decodeRank(enc2));
        assertEquals(5, EdgePageDocumentsMetadata.decodeTopology(enc2));
    }
}
@@ -15,8 +15,8 @@ import nu.marginalia.index.query.limit.QueryStrategy;
 import nu.marginalia.index.query.limit.SpecificationLimit;
 import nu.marginalia.lexicon.KeywordLexicon;
 import nu.marginalia.model.crawl.EdgePageWordFlags;
-import nu.marginalia.model.idx.EdgePageDocumentsMetadata;
-import nu.marginalia.model.idx.EdgePageWordMetadata;
+import nu.marginalia.model.idx.DocumentMetadata;
+import nu.marginalia.model.idx.WordMetadata;
 import nu.marginalia.service.server.Initialization;
 import org.junit.jupiter.api.AfterEach;
 import org.junit.jupiter.api.Assertions;

@@ -164,12 +164,12 @@ public class IndexQueryServiceIntegrationTest {

         long fullId = id | ((long) (32 - (id % 32)) << 32);

-        var header = new IndexJournalEntryHeader(factors.length, fullId, new EdgePageDocumentsMetadata(0, 0, 0, id % 5, id, id % 20, (byte) 0).encode());
+        var header = new IndexJournalEntryHeader(factors.length, fullId, new DocumentMetadata(0, 0, 0, id % 5, id, id % 20, (byte) 0).encode());

         long[] data = new long[factors.length*2];
         for (int i = 0; i < factors.length; i++) {
             data[2*i] = keywordLexicon.getOrInsert(Integer.toString(factors[i]));
-            data[2*i + 1] = new EdgePageWordMetadata(i, i, i, EnumSet.of(EdgePageWordFlags.Title)).encode();
+            data[2*i + 1] = new WordMetadata(i, i, i, EnumSet.of(EdgePageWordFlags.Title)).encode();
         }

         indexJournalWriter.put(header, new IndexJournalEntryData(data));

@@ -177,12 +177,12 @@ public class IndexQueryServiceIntegrationTest {

     public void loadDataWithDomain(int domain, int id) {
         int[] factors = IntStream.rangeClosed(1, id).filter(v -> (id % v) == 0).toArray();
-        var header = new IndexJournalEntryHeader(factors.length, id | ((long) domain << 32), EdgePageDocumentsMetadata.defaultValue());
+        var header = new IndexJournalEntryHeader(factors.length, id | ((long) domain << 32), DocumentMetadata.defaultValue());

         long[] data = new long[factors.length*2];
         for (int i = 0; i < factors.length; i++) {
             data[2*i] = keywordLexicon.getOrInsert(Integer.toString(factors[i]));
-            data[2*i + 1] = new EdgePageWordMetadata(i % 20, i, i, EnumSet.of(EdgePageWordFlags.Title)).encode();
+            data[2*i + 1] = new WordMetadata(i % 20, i, i, EnumSet.of(EdgePageWordFlags.Title)).encode();
         }

         indexJournalWriter.put(header, new IndexJournalEntryData(data));
@@ -26,6 +26,7 @@ dependencies {
     implementation project(':index:index-query')

     implementation project(':libraries:misc')
+    implementation project(':libraries:easy-lsh')
     implementation project(':libraries:language-processing')

     implementation project(':api:assistant-api')

@@ -33,8 +34,8 @@ dependencies {
     implementation project(':api:search-api')
     implementation project(':common:service-discovery')
     implementation project(':common:service-client')
+    implementation project(':common:renderer')

-    implementation project(':features:renderer')
     implementation project(':features:screenshots')
     implementation project(':features:random-websites')
     implementation project(':features:query-parser')
@@ -6,7 +6,6 @@ import gnu.trove.map.hash.TObjectIntHashMap;
 import gnu.trove.set.hash.TIntHashSet;
 import nu.marginalia.search.model.UrlDetails;
 import nu.marginalia.lsh.EasyLSH;
-import nu.marginalia.util.BrailleBlockPunchCards;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
Some files were not shown because too many files have changed in this diff.