diff --git a/.gitignore b/.gitignore index 5e039277..a970eb30 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,6 @@ build/ *~ .gradle/ .idea/ +lombok.config +Dockerfile +run diff --git a/README.md b/README.md index 91f9f67e..ba8bf42b 100644 --- a/README.md +++ b/README.md @@ -1,31 +1,38 @@ # marginalia.nu -This is the source code for marginalia.nu, including the [search engine](https://search.marginalia.nu), -the [MEMEX/gemini server](https://memex.marginalia.nu), the and the [encyclopedia service](https://encyclopedia.marginalia.nu). +This is the source code for [Marginalia Search](https://search.marginalia.nu). The aim of the project is to develop new and alternative discovery methods for the Internet. It's an experimental workshop as much as it is a public service, the overarching goal is to elevate the more human, non-commercial sides of the Internet. A side-goal is to do this without requiring datacenters and expensive enterprise hardware, to run this operation on affordable hardware. -The canonical git server for this project is [https://git.marginalia.nu](https://git.marginalia.nu). -It is fine to mirror it on other hosts, but if you have issues or questions -git.marginalia.nu is where you want to go. +## Set up instructions -## Important note about wmsa.local +For local development, you're strongly encouraged to use docker or podman. +From a fresh to running system, you'll need to do this: -This project has a [sister repository called wmsa.local](https://git.marginalia.nu/marginalia/wmsa.local) -that contains scripts and configuration files for running and developing the code. +``` +$ ./gradlew assemble -Without it, development is very unpleasant. +$ ./gradlew docker -While developing the code, you will want an environment variable WMSA_HOME pointing to -the directory in which wmsa.local is checked out, otherwise the code will not run and -several tests will fail. 
+$ vim run/settings.profile + +(follow instructions in file) + +$ run/setup.sh + +$ run/reconvert.sh + +$ docker-compose up +``` + +Wait a moment and check out [https://localhost:8080](https://localhost:8080). ## Documentation -Documentation is a work in progress. See the [wiki](https://git.marginalia.nu/marginalia/marginalia.nu/wiki). +Documentation is a work in progress. ## Contributing diff --git a/api/assistant-api/build.gradle b/api/assistant-api/build.gradle new file mode 100644 index 00000000..9ec260b1 --- /dev/null +++ b/api/assistant-api/build.gradle @@ -0,0 +1,45 @@ +plugins { + id 'java' + id "io.freefair.lombok" version "5.3.3.3" + + id 'jvm-test-suite' +} + +java { + toolchain { + languageVersion.set(JavaLanguageVersion.of(17)) + } +} +dependencies { + implementation project(':third-party') + implementation project(':protocol') + implementation project(':common:model') + implementation project(':common:config') + implementation project(':common:service-discovery') + implementation project(':common:service-client') + + implementation libs.lombok + annotationProcessor libs.lombok + implementation libs.bundles.slf4j + + implementation libs.prometheus + implementation libs.notnull + implementation libs.guice + implementation libs.rxjava + implementation libs.gson + + testImplementation libs.bundles.slf4j.test + testImplementation libs.bundles.junit + testImplementation libs.mockito + +} + +test { + useJUnitPlatform() +} + +task fastTests(type: Test) { + useJUnitPlatform { + excludeTags "slow" + } +} \ No newline at end of file diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/client/AssistantClient.java b/api/assistant-api/src/main/java/nu/marginalia/assistant/client/AssistantClient.java similarity index 56% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/client/AssistantClient.java rename to api/assistant-api/src/main/java/nu/marginalia/assistant/client/AssistantClient.java index 63f8e255..94677317 100644 
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/client/AssistantClient.java +++ b/api/assistant-api/src/main/java/nu/marginalia/assistant/client/AssistantClient.java @@ -1,28 +1,32 @@ -package nu.marginalia.wmsa.edge.assistant.client; +package nu.marginalia.assistant.client; import com.google.inject.Inject; import com.google.inject.Singleton; import io.reactivex.rxjava3.core.Observable; -import nu.marginalia.wmsa.client.AbstractDynamicClient; -import nu.marginalia.wmsa.client.exception.RouteNotConfiguredException; -import nu.marginalia.wmsa.configuration.ServiceDescriptor; -import nu.marginalia.wmsa.configuration.server.Context; -import nu.marginalia.wmsa.edge.assistant.dict.DictionaryResponse; -import org.eclipse.jetty.util.UrlEncoded; +import nu.marginalia.assistant.client.model.DictionaryResponse; +import nu.marginalia.client.AbstractDynamicClient; +import nu.marginalia.client.exception.RouteNotConfiguredException; +import nu.marginalia.WmsaHome; +import nu.marginalia.model.gson.GsonFactory; +import nu.marginalia.service.descriptor.ServiceDescriptors; +import nu.marginalia.service.id.ServiceId; +import nu.marginalia.client.Context; +import java.net.URLEncoder; +import java.nio.charset.StandardCharsets; import java.util.List; @Singleton public class AssistantClient extends AbstractDynamicClient { @Inject - public AssistantClient() { - super(ServiceDescriptor.EDGE_ASSISTANT); + public AssistantClient(ServiceDescriptors descriptors) { + super(descriptors.forId(ServiceId.Assistant), WmsaHome.getHostsFile(), GsonFactory::get); } public Observable dictionaryLookup(Context ctx, String word) { try { - return super.get(ctx, "/dictionary/" + UrlEncoded.encodeString(word), DictionaryResponse.class); + return super.get(ctx, "/dictionary/" + URLEncoder.encode(word, StandardCharsets.UTF_8), DictionaryResponse.class); } catch (RouteNotConfiguredException ex) { return Observable.empty(); @@ -32,7 +36,7 @@ public class AssistantClient extends 
AbstractDynamicClient { @SuppressWarnings("unchecked") public Observable> spellCheck(Context ctx, String word) { try { - return (Observable>) (Object) super.get(ctx, "/spell-check/" + UrlEncoded.encodeString(word), List.class); + return (Observable>) (Object) super.get(ctx, "/spell-check/" + URLEncoder.encode(word, StandardCharsets.UTF_8), List.class); } catch (RouteNotConfiguredException ex) { return Observable.empty(); @@ -49,7 +53,7 @@ public class AssistantClient extends AbstractDynamicClient { public Observable evalMath(Context ctx, String expression) { try { - return super.get(ctx, "/eval-expression?value=" + UrlEncoded.encodeString(expression)); + return super.get(ctx, "/eval-expression?value=" + URLEncoder.encode(expression, StandardCharsets.UTF_8)); } catch (RouteNotConfiguredException ex) { return Observable.empty(); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/dict/DictionaryEntry.java b/api/assistant-api/src/main/java/nu/marginalia/assistant/client/model/DictionaryEntry.java similarity index 83% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/dict/DictionaryEntry.java rename to api/assistant-api/src/main/java/nu/marginalia/assistant/client/model/DictionaryEntry.java index 88c48986..c40ea97f 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/dict/DictionaryEntry.java +++ b/api/assistant-api/src/main/java/nu/marginalia/assistant/client/model/DictionaryEntry.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.assistant.dict; +package nu.marginalia.assistant.client.model; import lombok.AllArgsConstructor; import lombok.Getter; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/dict/DictionaryResponse.java b/api/assistant-api/src/main/java/nu/marginalia/assistant/client/model/DictionaryResponse.java similarity index 86% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/dict/DictionaryResponse.java rename to 
api/assistant-api/src/main/java/nu/marginalia/assistant/client/model/DictionaryResponse.java index 624782a6..03fbd2e6 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/dict/DictionaryResponse.java +++ b/api/assistant-api/src/main/java/nu/marginalia/assistant/client/model/DictionaryResponse.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.assistant.dict; +package nu.marginalia.assistant.client.model; import lombok.AllArgsConstructor; import lombok.Getter; diff --git a/api/index-api/build.gradle b/api/index-api/build.gradle new file mode 100644 index 00000000..35616eee --- /dev/null +++ b/api/index-api/build.gradle @@ -0,0 +1,48 @@ +plugins { + id 'java' + id "io.freefair.lombok" version "5.3.3.3" + + id 'jvm-test-suite' +} + +java { + toolchain { + languageVersion.set(JavaLanguageVersion.of(17)) + } +} + +dependencies { + implementation project(':third-party') + implementation project(':protocol') + implementation project(':common:model') + implementation project(':common:config') + implementation project(':common:service-discovery') + implementation project(':common:service-client') + + implementation project(':index:index-query') + + implementation libs.lombok + annotationProcessor libs.lombok + implementation libs.bundles.slf4j + + implementation libs.prometheus + implementation libs.notnull + implementation libs.guice + implementation libs.rxjava + implementation libs.protobuf + implementation libs.gson + + testImplementation libs.bundles.slf4j.test + testImplementation libs.bundles.junit + testImplementation libs.mockito +} + +test { + useJUnitPlatform() +} + +task fastTests(type: Test) { + useJUnitPlatform { + excludeTags "slow" + } +} \ No newline at end of file diff --git a/api/index-api/src/main/java/nu/marginalia/index/client/EdgeIndexClient.java b/api/index-api/src/main/java/nu/marginalia/index/client/EdgeIndexClient.java new file mode 100644 index 00000000..5ed45885 --- /dev/null +++ 
b/api/index-api/src/main/java/nu/marginalia/index/client/EdgeIndexClient.java @@ -0,0 +1,45 @@ +package nu.marginalia.index.client; + +import com.google.inject.Inject; +import com.google.inject.Singleton; +import io.prometheus.client.Summary; +import io.reactivex.rxjava3.core.Observable; +import nu.marginalia.WmsaHome; +import nu.marginalia.client.AbstractDynamicClient; +import nu.marginalia.client.Context; +import nu.marginalia.index.client.model.query.EdgeSearchSpecification; +import nu.marginalia.index.client.model.results.EdgeSearchResultItem; +import nu.marginalia.index.client.model.results.EdgeSearchResultSet; +import nu.marginalia.model.gson.GsonFactory; +import nu.marginalia.service.descriptor.ServiceDescriptors; +import nu.marginalia.service.id.ServiceId; + +import javax.annotation.CheckReturnValue; +import java.util.List; + +@Singleton +public class EdgeIndexClient extends AbstractDynamicClient { + + private static final Summary wmsa_search_index_api_time = Summary.build().name("wmsa_search_index_api_time").help("-").register(); + + @Inject + public EdgeIndexClient(ServiceDescriptors descriptors) { + super(descriptors.forId(ServiceId.Index), WmsaHome.getHostsFile(), GsonFactory::get); + + setTimeout(30); + } + + @CheckReturnValue + public List query(Context ctx, EdgeSearchSpecification specs) { + return wmsa_search_index_api_time.time( + () -> this.postGet(ctx, "/search/", specs, EdgeSearchResultSet.class).blockingFirst().getResults() + ); + } + + + @CheckReturnValue + public Observable isBlocked(Context ctx) { + return super.get(ctx, "/is-blocked", Boolean.class); + } + +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/domain/EdgeDomainSearchResults.java b/api/index-api/src/main/java/nu/marginalia/index/client/model/domain/EdgeDomainSearchResults.java similarity index 61% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/domain/EdgeDomainSearchResults.java rename to 
api/index-api/src/main/java/nu/marginalia/index/client/model/domain/EdgeDomainSearchResults.java index e9079bb7..0b994f81 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/domain/EdgeDomainSearchResults.java +++ b/api/index-api/src/main/java/nu/marginalia/index/client/model/domain/EdgeDomainSearchResults.java @@ -1,10 +1,10 @@ -package nu.marginalia.wmsa.edge.model.search.domain; +package nu.marginalia.index.client.model.domain; import lombok.AllArgsConstructor; import lombok.Getter; import lombok.ToString; -import nu.marginalia.wmsa.edge.model.EdgeUrl; -import nu.marginalia.wmsa.edge.model.id.EdgeIdList; +import nu.marginalia.model.EdgeUrl; +import nu.marginalia.model.id.EdgeIdList; @AllArgsConstructor @Getter @ToString public class EdgeDomainSearchResults { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/domain/EdgeDomainSearchSpecification.java b/api/index-api/src/main/java/nu/marginalia/index/client/model/domain/EdgeDomainSearchSpecification.java similarity index 83% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/domain/EdgeDomainSearchSpecification.java rename to api/index-api/src/main/java/nu/marginalia/index/client/model/domain/EdgeDomainSearchSpecification.java index 2c4738f9..29748632 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/domain/EdgeDomainSearchSpecification.java +++ b/api/index-api/src/main/java/nu/marginalia/index/client/model/domain/EdgeDomainSearchSpecification.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.model.search.domain; +package nu.marginalia.index.client.model.domain; import lombok.AllArgsConstructor; import lombok.ToString; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchSpecification.java b/api/index-api/src/main/java/nu/marginalia/index/client/model/query/EdgeSearchSpecification.java similarity index 65% rename from 
marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchSpecification.java rename to api/index-api/src/main/java/nu/marginalia/index/client/model/query/EdgeSearchSpecification.java index 84f133d7..bfafb75b 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchSpecification.java +++ b/api/index-api/src/main/java/nu/marginalia/index/client/model/query/EdgeSearchSpecification.java @@ -1,10 +1,9 @@ -package nu.marginalia.wmsa.edge.model.search; +package nu.marginalia.index.client.model.query; import lombok.*; -import nu.marginalia.wmsa.edge.index.model.QueryLimits; -import nu.marginalia.wmsa.edge.index.model.QueryStrategy; -import nu.marginalia.wmsa.edge.index.svc.searchset.SearchSetIdentifier; -import nu.marginalia.wmsa.edge.model.search.domain.SpecificationLimit; +import nu.marginalia.index.query.limit.QueryLimits; +import nu.marginalia.index.query.limit.QueryStrategy; +import nu.marginalia.index.query.limit.SpecificationLimit; import java.util.List; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchSubquery.java b/api/index-api/src/main/java/nu/marginalia/index/client/model/query/EdgeSearchSubquery.java similarity index 96% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchSubquery.java rename to api/index-api/src/main/java/nu/marginalia/index/client/model/query/EdgeSearchSubquery.java index b171c5a3..33416001 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchSubquery.java +++ b/api/index-api/src/main/java/nu/marginalia/index/client/model/query/EdgeSearchSubquery.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.model.search; +package nu.marginalia.index.client.model.query; import lombok.AllArgsConstructor; import lombok.Getter; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/searchset/SearchSetIdentifier.java 
b/api/index-api/src/main/java/nu/marginalia/index/client/model/query/SearchSetIdentifier.java similarity index 64% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/searchset/SearchSetIdentifier.java rename to api/index-api/src/main/java/nu/marginalia/index/client/model/query/SearchSetIdentifier.java index 040cd32c..aca5c291 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/searchset/SearchSetIdentifier.java +++ b/api/index-api/src/main/java/nu/marginalia/index/client/model/query/SearchSetIdentifier.java @@ -1,6 +1,4 @@ -package nu.marginalia.wmsa.edge.index.svc.searchset; - -import nu.marginalia.wmsa.edge.search.model.EdgeSearchProfile; +package nu.marginalia.index.client.model.query; /** Identifies a RankingSearchSet, associated with an EdgeSearchProfile * diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchResultItem.java b/api/index-api/src/main/java/nu/marginalia/index/client/model/results/EdgeSearchResultItem.java similarity index 83% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchResultItem.java rename to api/index-api/src/main/java/nu/marginalia/index/client/model/results/EdgeSearchResultItem.java index 517c1975..e9f1e1be 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchResultItem.java +++ b/api/index-api/src/main/java/nu/marginalia/index/client/model/results/EdgeSearchResultItem.java @@ -1,9 +1,9 @@ -package nu.marginalia.wmsa.edge.model.search; +package nu.marginalia.index.client.model.results; import lombok.AllArgsConstructor; import lombok.Getter; -import nu.marginalia.wmsa.edge.model.EdgeUrl; -import nu.marginalia.wmsa.edge.model.id.EdgeId; +import nu.marginalia.model.EdgeUrl; +import nu.marginalia.model.id.EdgeId; import java.util.ArrayList; import java.util.List; @@ -41,7 +41,7 @@ public class EdgeSearchResultItem { return scoreValue; } - private transient int domainId = 0; + private 
transient int domainId = Integer.MIN_VALUE; public void setDomainId(int domainId) { this.domainId = domainId; } @@ -69,12 +69,12 @@ public class EdgeSearchResultItem { } public long deduplicationKey() { - final int ranking = getDomainId(); + final int domainId = getDomainId(); - if (ranking == Integer.MAX_VALUE || ranking == Integer.MIN_VALUE) { + if (domainId == Integer.MAX_VALUE || domainId == Integer.MIN_VALUE) { return 0; } - return ranking; + return domainId; } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchResultKeywordScore.java b/api/index-api/src/main/java/nu/marginalia/index/client/model/results/EdgeSearchResultKeywordScore.java similarity index 88% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchResultKeywordScore.java rename to api/index-api/src/main/java/nu/marginalia/index/client/model/results/EdgeSearchResultKeywordScore.java index 6d97192c..0fed4a7f 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchResultKeywordScore.java +++ b/api/index-api/src/main/java/nu/marginalia/index/client/model/results/EdgeSearchResultKeywordScore.java @@ -1,9 +1,9 @@ -package nu.marginalia.wmsa.edge.model.search; +package nu.marginalia.index.client.model.results; -import nu.marginalia.wmsa.edge.index.model.EdgePageDocumentFlags; -import nu.marginalia.wmsa.edge.index.model.EdgePageDocumentsMetadata; -import nu.marginalia.wmsa.edge.index.model.EdgePageWordFlags; -import nu.marginalia.wmsa.edge.index.model.EdgePageWordMetadata; +import nu.marginalia.model.crawl.EdgePageWordFlags; +import nu.marginalia.model.idx.EdgePageWordMetadata; +import nu.marginalia.model.crawl.EdgePageDocumentFlags; +import nu.marginalia.model.idx.EdgePageDocumentsMetadata; import static java.lang.Integer.lowestOneBit; import static java.lang.Integer.numberOfTrailingZeros; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchResultSet.java 
b/api/index-api/src/main/java/nu/marginalia/index/client/model/results/EdgeSearchResultSet.java similarity index 85% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchResultSet.java rename to api/index-api/src/main/java/nu/marginalia/index/client/model/results/EdgeSearchResultSet.java index be12ab31..e69fd34d 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchResultSet.java +++ b/api/index-api/src/main/java/nu/marginalia/index/client/model/results/EdgeSearchResultSet.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.model.search; +package nu.marginalia.index.client.model.results; import lombok.AllArgsConstructor; import lombok.Getter; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchResults.java b/api/index-api/src/main/java/nu/marginalia/index/client/model/results/EdgeSearchResults.java similarity index 91% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchResults.java rename to api/index-api/src/main/java/nu/marginalia/index/client/model/results/EdgeSearchResults.java index 00b51df4..2e54a25c 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchResults.java +++ b/api/index-api/src/main/java/nu/marginalia/index/client/model/results/EdgeSearchResults.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.model.search; +package nu.marginalia.index.client.model.results; import lombok.AllArgsConstructor; import lombok.Getter; diff --git a/api/readme.md b/api/readme.md new file mode 100644 index 00000000..a49ddd63 --- /dev/null +++ b/api/readme.md @@ -0,0 +1,7 @@ +# Core Service Clients + +These are clients for the [core services](../services-core/), along with what models +are necessary for speaking to them. + +All that is necessary is to `@Inject` them into the constructor and then +requests can be sent. 
\ No newline at end of file diff --git a/api/search-api/build.gradle b/api/search-api/build.gradle new file mode 100644 index 00000000..613cea82 --- /dev/null +++ b/api/search-api/build.gradle @@ -0,0 +1,46 @@ +plugins { + id 'java' + id "io.freefair.lombok" version "5.3.3.3" + + id 'jvm-test-suite' +} + +java { + toolchain { + languageVersion.set(JavaLanguageVersion.of(17)) + } +} + +dependencies { + implementation project(':third-party') + implementation project(':protocol') + implementation project(':common:model') + implementation project(':common:config') + implementation project(':common:service-discovery') + implementation project(':common:service-client') + + implementation libs.lombok + annotationProcessor libs.lombok + implementation libs.bundles.slf4j + + implementation libs.prometheus + implementation libs.notnull + implementation libs.guice + implementation libs.rxjava + implementation libs.gson + + testImplementation libs.bundles.slf4j.test + testImplementation libs.bundles.junit + testImplementation libs.mockito + +} + +test { + useJUnitPlatform() +} + +task fastTests(type: Test) { + useJUnitPlatform { + excludeTags "slow" + } +} \ No newline at end of file diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/client/EdgeSearchClient.java b/api/search-api/src/main/java/nu/marginalia/search/client/EdgeSearchClient.java similarity index 56% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/client/EdgeSearchClient.java rename to api/search-api/src/main/java/nu/marginalia/search/client/EdgeSearchClient.java index 3e38c3f6..c643cbe5 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/client/EdgeSearchClient.java +++ b/api/search-api/src/main/java/nu/marginalia/search/client/EdgeSearchClient.java @@ -1,11 +1,15 @@ -package nu.marginalia.wmsa.edge.search.client; +package nu.marginalia.search.client; +import com.google.inject.Inject; import com.google.inject.Singleton; import 
io.reactivex.rxjava3.core.Observable; -import nu.marginalia.wmsa.api.model.ApiSearchResults; -import nu.marginalia.wmsa.client.AbstractDynamicClient; -import nu.marginalia.wmsa.configuration.ServiceDescriptor; -import nu.marginalia.wmsa.configuration.server.Context; +import nu.marginalia.client.AbstractDynamicClient; +import nu.marginalia.model.gson.GsonFactory; +import nu.marginalia.search.client.model.ApiSearchResults; +import nu.marginalia.service.descriptor.ServiceDescriptors; +import nu.marginalia.service.id.ServiceId; +import nu.marginalia.WmsaHome; +import nu.marginalia.client.Context; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -17,8 +21,9 @@ import java.nio.charset.StandardCharsets; public class EdgeSearchClient extends AbstractDynamicClient { private final Logger logger = LoggerFactory.getLogger(getClass()); - public EdgeSearchClient() { - super(ServiceDescriptor.EDGE_SEARCH); + @Inject + public EdgeSearchClient(ServiceDescriptors descriptors) { + super(descriptors.forId(ServiceId.Search), WmsaHome.getHostsFile(), GsonFactory::get); } @CheckReturnValue diff --git a/api/search-api/src/main/java/nu/marginalia/search/client/model/ApiSearchResult.java b/api/search-api/src/main/java/nu/marginalia/search/client/model/ApiSearchResult.java new file mode 100644 index 00000000..bedc3046 --- /dev/null +++ b/api/search-api/src/main/java/nu/marginalia/search/client/model/ApiSearchResult.java @@ -0,0 +1,18 @@ +package nu.marginalia.search.client.model; + +import lombok.AllArgsConstructor; +import lombok.Getter; + +import java.util.ArrayList; +import java.util.List; + +@AllArgsConstructor @Getter +public class ApiSearchResult { + public String url; + public String title; + public String description; + public double quality; + + public List> details = new ArrayList<>(); + +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/api/model/ApiSearchResultQueryDetails.java 
b/api/search-api/src/main/java/nu/marginalia/search/client/model/ApiSearchResultQueryDetails.java similarity index 84% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/api/model/ApiSearchResultQueryDetails.java rename to api/search-api/src/main/java/nu/marginalia/search/client/model/ApiSearchResultQueryDetails.java index ad146ca8..ee13c219 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/api/model/ApiSearchResultQueryDetails.java +++ b/api/search-api/src/main/java/nu/marginalia/search/client/model/ApiSearchResultQueryDetails.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.api.model; +package nu.marginalia.search.client.model; import lombok.AllArgsConstructor; import lombok.Getter; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/api/model/ApiSearchResults.java b/api/search-api/src/main/java/nu/marginalia/search/client/model/ApiSearchResults.java similarity index 86% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/api/model/ApiSearchResults.java rename to api/search-api/src/main/java/nu/marginalia/search/client/model/ApiSearchResults.java index db696c73..688e9e91 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/api/model/ApiSearchResults.java +++ b/api/search-api/src/main/java/nu/marginalia/search/client/model/ApiSearchResults.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.api.model; +package nu.marginalia.search.client.model; import lombok.AllArgsConstructor; import lombok.Getter; diff --git a/build.gradle b/build.gradle index ffe47e69..d6740cf8 100644 --- a/build.gradle +++ b/build.gradle @@ -1,42 +1,16 @@ plugins { id 'java' - - id 'com.github.johnrengelman.shadow' version '6.0.0' } -group 'nu.marginalia' +group 'marginalia' version 'SNAPSHOT' + compileJava.options.encoding = "UTF-8" compileTestJava.options.encoding = "UTF-8" -repositories { - mavenLocal() - maven { url "https://artifactory.cronapp.io/public-release/" } - maven { url "https://repo1.maven.org/maven2/" } - maven { url 
"https://www2.ph.ed.ac.uk/maven2/" } - maven { url "https://jitpack.io/" } - exclusiveContent { - forRepository { - maven { - url = uri("https://jitpack.io") - } - } - filter { - // Only use JitPack for the `gson-record-type-adapter-factory` library - includeModule("com.github.Marcono1234", "gson-record-type-adapter-factory") - } - } -} -shadowJar { - zip64 true -} -jar { - manifest { - attributes 'Main-Class': "nu.marginalia.wmsa.configuration.ServiceDescriptor" - } - from { - configurations.shadow.collect { it.isDirectory() ? it : zipTree(it) } - } +task dist(type: Copy) { + from subprojects.collect { it.tasks.withType(Tar) } + into "$buildDir/dist" } java { @@ -44,19 +18,3 @@ java { languageVersion.set(JavaLanguageVersion.of(17)) } } - -dependencies { - implementation project(':marginalia_nu') -} -task version() { // -} - -test { - maxParallelForks = 16 - forkEvery = 1 - maxHeapSize = "8G" - useJUnitPlatform { - excludeTags "nobuild" - } -} - diff --git a/common/config/build.gradle b/common/config/build.gradle new file mode 100644 index 00000000..211a3465 --- /dev/null +++ b/common/config/build.gradle @@ -0,0 +1,32 @@ +plugins { + id 'java' + id "io.freefair.lombok" version "5.3.3.3" + + id 'jvm-test-suite' +} + +java { + toolchain { + languageVersion.set(JavaLanguageVersion.of(17)) + } +} + +dependencies { + implementation project(':common:service-discovery') + implementation project(':common:service-client') + implementation project(':libraries:misc') +} + +test { + maxParallelForks = Runtime.runtime.availableProcessors().intdiv(2) ?: 1 + maxHeapSize = "8G" + useJUnitPlatform() +} + +task fastTests(type: Test) { + maxParallelForks = Runtime.runtime.availableProcessors().intdiv(2) ?: 1 + maxHeapSize = "8G" + useJUnitPlatform { + excludeTags "slow" + } +} diff --git a/common/config/src/main/java/nu/marginalia/LanguageModels.java b/common/config/src/main/java/nu/marginalia/LanguageModels.java new file mode 100644 index 00000000..0220e7a2 --- /dev/null +++ 
b/common/config/src/main/java/nu/marginalia/LanguageModels.java @@ -0,0 +1,22 @@ +package nu.marginalia; + +import java.nio.file.Path; + +public class LanguageModels { + public final Path ngramBloomFilter; + public final Path termFrequencies; + + public final Path openNLPSentenceDetectionData; + public final Path posRules; + public final Path posDict; + public final Path openNLPTokenData; + + public LanguageModels(Path ngramBloomFilter, Path termFrequencies, Path openNLPSentenceDetectionData, Path posRules, Path posDict, Path openNLPTokenData) { + this.ngramBloomFilter = ngramBloomFilter; + this.termFrequencies = termFrequencies; + this.openNLPSentenceDetectionData = openNLPSentenceDetectionData; + this.posRules = posRules; + this.posDict = posDict; + this.openNLPTokenData = openNLPTokenData; + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/UserAgent.java b/common/config/src/main/java/nu/marginalia/UserAgent.java similarity index 52% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/UserAgent.java rename to common/config/src/main/java/nu/marginalia/UserAgent.java index d4dfd54c..75c1aa7d 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/UserAgent.java +++ b/common/config/src/main/java/nu/marginalia/UserAgent.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.configuration; +package nu.marginalia; public record UserAgent(String uaString) { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/WebsiteUrl.java b/common/config/src/main/java/nu/marginalia/WebsiteUrl.java similarity index 73% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/WebsiteUrl.java rename to common/config/src/main/java/nu/marginalia/WebsiteUrl.java index 8e3f8c4c..5772cc56 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/WebsiteUrl.java +++ b/common/config/src/main/java/nu/marginalia/WebsiteUrl.java @@ -1,4 +1,4 @@ -package 
nu.marginalia.wmsa.configuration; +package nu.marginalia; public record WebsiteUrl(String url) { public String withPath(String path) { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/WmsaHome.java b/common/config/src/main/java/nu/marginalia/WmsaHome.java similarity index 97% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/WmsaHome.java rename to common/config/src/main/java/nu/marginalia/WmsaHome.java index bbf7ccbc..6c9eb537 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/WmsaHome.java +++ b/common/config/src/main/java/nu/marginalia/WmsaHome.java @@ -1,6 +1,7 @@ -package nu.marginalia.wmsa.configuration; +package nu.marginalia; -import nu.marginalia.util.language.conf.LanguageModels; + +import nu.marginalia.service.descriptor.HostsFile; import java.io.FileNotFoundException; import java.io.IOException; diff --git a/common/model/build.gradle b/common/model/build.gradle new file mode 100644 index 00000000..072227e1 --- /dev/null +++ b/common/model/build.gradle @@ -0,0 +1,53 @@ +plugins { + id 'java' + id "io.freefair.lombok" version "5.3.3.3" + id 'jvm-test-suite' +} + +java { + toolchain { + languageVersion.set(JavaLanguageVersion.of(17)) + } +} + +dependencies { + implementation project(':common:service-discovery') + implementation project(':common:service-client') + implementation project(':libraries:misc') + + implementation libs.lombok + annotationProcessor libs.lombok + implementation libs.bundles.slf4j + + implementation libs.guice + implementation libs.bundles.gson + + implementation libs.notnull + + implementation libs.commons.lang3 + + implementation libs.trove + implementation libs.fastutil + + implementation libs.rxjava + implementation libs.bundles.mariadb + + testImplementation libs.bundles.slf4j.test + testImplementation libs.bundles.junit + testImplementation libs.mockito +} + +test { + maxParallelForks = Runtime.runtime.availableProcessors().intdiv(2) ?: 1 + maxHeapSize = 
"8G" + useJUnitPlatform() +} + +task fastTests(type: Test) { + maxParallelForks = Runtime.runtime.availableProcessors().intdiv(2) ?: 1 + maxHeapSize = "8G" + useJUnitPlatform { + excludeTags "slow" + } +} + diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeDomain.java b/common/model/src/main/java/nu/marginalia/model/EdgeDomain.java similarity index 99% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeDomain.java rename to common/model/src/main/java/nu/marginalia/model/EdgeDomain.java index 79e65476..96a44718 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeDomain.java +++ b/common/model/src/main/java/nu/marginalia/model/EdgeDomain.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.model; +package nu.marginalia.model; import lombok.*; @@ -10,8 +10,6 @@ import java.util.regex.Pattern; @AllArgsConstructor @Getter @Setter @Builder public class EdgeDomain { - - @Nonnull public final String subDomain; @Nonnull @@ -109,6 +107,7 @@ public class EdgeDomain { } return domain.substring(0, cutPoint).toLowerCase(); } + public String getLongDomainKey() { StringBuilder ret = new StringBuilder(); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeUrl.java b/common/model/src/main/java/nu/marginalia/model/EdgeUrl.java similarity index 94% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeUrl.java rename to common/model/src/main/java/nu/marginalia/model/EdgeUrl.java index 84246c4e..7c5f3df0 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeUrl.java +++ b/common/model/src/main/java/nu/marginalia/model/EdgeUrl.java @@ -1,9 +1,9 @@ -package nu.marginalia.wmsa.edge.model; +package nu.marginalia.model; import lombok.Builder; import lombok.Getter; import lombok.Setter; -import nu.marginalia.wmsa.edge.converting.processor.logic.QueryParams; +import nu.marginalia.util.QueryParams; import java.net.URI; import java.net.URISyntaxException; @@ 
-12,7 +12,7 @@ import java.util.Optional; import java.util.regex.Pattern; @Getter @Setter @Builder -public class EdgeUrl implements WideHashable { +public class EdgeUrl { public final String proto; public final EdgeDomain domain; public final Integer port; @@ -158,12 +158,6 @@ public class EdgeUrl implements WideHashable { return path.replaceAll(".*/", ""); } - public long wideHash() { - long domainHash = domain.hashCode(); - long thisHash = hashCode(); - return (domainHash << 32) | thisHash; - } - public int depth() { return (int) path.chars().filter(c -> c=='/').count(); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/interpreter/instruction/DocumentKeywords.java b/common/model/src/main/java/nu/marginalia/model/crawl/DocumentKeywords.java similarity index 85% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/interpreter/instruction/DocumentKeywords.java rename to common/model/src/main/java/nu/marginalia/model/crawl/DocumentKeywords.java index 7faefb2e..ce154aa5 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/interpreter/instruction/DocumentKeywords.java +++ b/common/model/src/main/java/nu/marginalia/model/crawl/DocumentKeywords.java @@ -1,7 +1,7 @@ -package nu.marginalia.wmsa.edge.converting.interpreter.instruction; +package nu.marginalia.model.crawl; -import nu.marginalia.wmsa.edge.index.model.EdgePageWordMetadata; -import nu.marginalia.wmsa.edge.model.crawl.EdgePageWords; + +import nu.marginalia.model.idx.EdgePageWordMetadata; import java.util.Arrays; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/crawl/EdgeContentType.java b/common/model/src/main/java/nu/marginalia/model/crawl/EdgeContentType.java similarity index 81% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/crawl/EdgeContentType.java rename to common/model/src/main/java/nu/marginalia/model/crawl/EdgeContentType.java index 70978166..4d447038 100644 --- 
a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/crawl/EdgeContentType.java +++ b/common/model/src/main/java/nu/marginalia/model/crawl/EdgeContentType.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.model.crawl; +package nu.marginalia.model.crawl; import lombok.*; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/crawl/EdgeDomainIndexingState.java b/common/model/src/main/java/nu/marginalia/model/crawl/EdgeDomainIndexingState.java similarity index 90% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/crawl/EdgeDomainIndexingState.java rename to common/model/src/main/java/nu/marginalia/model/crawl/EdgeDomainIndexingState.java index 6a57a871..448641fb 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/crawl/EdgeDomainIndexingState.java +++ b/common/model/src/main/java/nu/marginalia/model/crawl/EdgeDomainIndexingState.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.model.crawl; +package nu.marginalia.model.crawl; public enum EdgeDomainIndexingState { ACTIVE("Active"), diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/crawl/EdgeDomainLink.java b/common/model/src/main/java/nu/marginalia/model/crawl/EdgeDomainLink.java similarity index 68% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/crawl/EdgeDomainLink.java rename to common/model/src/main/java/nu/marginalia/model/crawl/EdgeDomainLink.java index 7486fec3..b66064bf 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/crawl/EdgeDomainLink.java +++ b/common/model/src/main/java/nu/marginalia/model/crawl/EdgeDomainLink.java @@ -1,7 +1,7 @@ -package nu.marginalia.wmsa.edge.model.crawl; +package nu.marginalia.model.crawl; import lombok.*; -import nu.marginalia.wmsa.edge.model.EdgeDomain; +import nu.marginalia.model.EdgeDomain; @AllArgsConstructor @EqualsAndHashCode @Getter @Setter @Builder @ToString public class EdgeDomainLink { diff --git 
a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/crawl/EdgeHtmlStandard.java b/common/model/src/main/java/nu/marginalia/model/crawl/EdgeHtmlStandard.java similarity index 71% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/crawl/EdgeHtmlStandard.java rename to common/model/src/main/java/nu/marginalia/model/crawl/EdgeHtmlStandard.java index dd3d0cec..17788cca 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/crawl/EdgeHtmlStandard.java +++ b/common/model/src/main/java/nu/marginalia/model/crawl/EdgeHtmlStandard.java @@ -1,4 +1,5 @@ -package nu.marginalia.wmsa.edge.model.crawl; +package nu.marginalia.model.crawl; + public enum EdgeHtmlStandard { PLAIN(0, 1, 1993), @@ -8,9 +9,13 @@ public enum EdgeHtmlStandard { XHTML(-0.1, 1.05, 2006), HTML5(0.5, 1.1, 2018); + /** Used to tune quality score */ public final double offset; + /** Used to tune quality score */ public final double scale; + /** This parameter is used to bias publish date heuristics + * */ public final int yearGuess; EdgeHtmlStandard(double offset, double scale, int yearGuess) { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/EdgePageDocumentFlags.java b/common/model/src/main/java/nu/marginalia/model/crawl/EdgePageDocumentFlags.java similarity index 92% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/EdgePageDocumentFlags.java rename to common/model/src/main/java/nu/marginalia/model/crawl/EdgePageDocumentFlags.java index 0f7a68fa..04f55edf 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/EdgePageDocumentFlags.java +++ b/common/model/src/main/java/nu/marginalia/model/crawl/EdgePageDocumentFlags.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.index.model; +package nu.marginalia.model.crawl; import java.util.EnumSet; @@ -6,7 +6,7 @@ public enum EdgePageDocumentFlags { /** Simple processing was done, this document should be de-prioritized as a search result */ Simple, - 
UnusedBit1, + PlainText, UnusedBit2, UnusedBit3, UnusedBit4, diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/EdgePageWordFlags.java b/common/model/src/main/java/nu/marginalia/model/crawl/EdgePageWordFlags.java similarity index 78% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/EdgePageWordFlags.java rename to common/model/src/main/java/nu/marginalia/model/crawl/EdgePageWordFlags.java index a3f443c5..089e02af 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/EdgePageWordFlags.java +++ b/common/model/src/main/java/nu/marginalia/model/crawl/EdgePageWordFlags.java @@ -1,9 +1,5 @@ -package nu.marginalia.wmsa.edge.index.model; +package nu.marginalia.model.crawl; -import nu.marginalia.util.language.processing.KeywordCounter; -import nu.marginalia.util.language.processing.NameCounter; -import nu.marginalia.util.language.processing.SubjectCounter; -import nu.marginalia.wmsa.edge.converting.processor.SiteWords; import java.util.EnumSet; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/crawl/EdgePageWords.java b/common/model/src/main/java/nu/marginalia/model/crawl/EdgePageWords.java similarity index 92% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/crawl/EdgePageWords.java rename to common/model/src/main/java/nu/marginalia/model/crawl/EdgePageWords.java index 0db772da..60b02c56 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/crawl/EdgePageWords.java +++ b/common/model/src/main/java/nu/marginalia/model/crawl/EdgePageWords.java @@ -1,10 +1,8 @@ -package nu.marginalia.wmsa.edge.model.crawl; +package nu.marginalia.model.crawl; import gnu.trove.list.array.TLongArrayList; import lombok.Getter; import lombok.ToString; -import nu.marginalia.wmsa.edge.index.model.EdgePageWordFlags; -import nu.marginalia.wmsa.edge.index.model.EdgePageWordMetadata; import java.util.ArrayList; import java.util.Collection; @@ -39,7 +37,7 @@ public class 
EdgePageWords { List emptyMeta = new ArrayList<>(entries.size()); for (int i = 0; i < entries.size(); i++) { - emptyMeta.add(EdgePageWordMetadata.emptyValue()); + emptyMeta.add(0L); } return new EdgePageWords(entries, emptyMeta); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/crawl/EdgeUrlState.java b/common/model/src/main/java/nu/marginalia/model/crawl/EdgeUrlState.java similarity index 73% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/crawl/EdgeUrlState.java rename to common/model/src/main/java/nu/marginalia/model/crawl/EdgeUrlState.java index f9b57be3..07802e5c 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/crawl/EdgeUrlState.java +++ b/common/model/src/main/java/nu/marginalia/model/crawl/EdgeUrlState.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.model.crawl; +package nu.marginalia.model.crawl; /** This should correspond to EC_URL.STATE */ public enum EdgeUrlState { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/HtmlFeature.java b/common/model/src/main/java/nu/marginalia/model/crawl/HtmlFeature.java similarity index 94% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/HtmlFeature.java rename to common/model/src/main/java/nu/marginalia/model/crawl/HtmlFeature.java index c4d0181c..4bdb5ca1 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/HtmlFeature.java +++ b/common/model/src/main/java/nu/marginalia/model/crawl/HtmlFeature.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.converting.processor.logic; +package nu.marginalia.model.crawl; import java.util.Collection; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/PubDate.java b/common/model/src/main/java/nu/marginalia/model/crawl/PubDate.java similarity index 95% rename from 
marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/PubDate.java rename to common/model/src/main/java/nu/marginalia/model/crawl/PubDate.java index 7913e710..67f67105 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/PubDate.java +++ b/common/model/src/main/java/nu/marginalia/model/crawl/PubDate.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.converting.processor.logic.pubdate; +package nu.marginalia.model.crawl; import java.time.LocalDate; import java.time.format.DateTimeFormatter; diff --git a/common/model/src/main/java/nu/marginalia/model/dbcommon/DbDomainQueries.java b/common/model/src/main/java/nu/marginalia/model/dbcommon/DbDomainQueries.java new file mode 100644 index 00000000..dc1f015c --- /dev/null +++ b/common/model/src/main/java/nu/marginalia/model/dbcommon/DbDomainQueries.java @@ -0,0 +1,64 @@ +package nu.marginalia.model.dbcommon; + + +import com.google.common.cache.Cache; +import com.google.common.cache.CacheBuilder; +import com.google.common.util.concurrent.UncheckedExecutionException; +import com.google.inject.Inject; +import com.google.inject.Singleton; +import com.zaxxer.hikari.HikariDataSource; +import lombok.SneakyThrows; +import nu.marginalia.model.EdgeDomain; +import nu.marginalia.model.id.EdgeId; + +import java.util.NoSuchElementException; +import java.util.Optional; + +@Singleton +public class DbDomainQueries { + private final HikariDataSource dataSource; + + private final Cache> domainIdCache = CacheBuilder.newBuilder().maximumSize(10_000).build(); + + @Inject + public DbDomainQueries(HikariDataSource dataSource) + { + this.dataSource = dataSource; + } + + + @SneakyThrows + public EdgeId getDomainId(EdgeDomain domain) { + try (var connection = dataSource.getConnection()) { + + return domainIdCache.get(domain, () -> { + try (var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME=?")) { + stmt.setString(1, domain.toString()); 
+ var rsp = stmt.executeQuery(); + if (rsp.next()) { + return new EdgeId<>(rsp.getInt(1)); + } + } + throw new NoSuchElementException(); + }); + } + catch (UncheckedExecutionException ex) { + throw ex.getCause(); + } + } + + @SneakyThrows + public Optional getDomain(EdgeId id) { + try (var connection = dataSource.getConnection()) { + + try (var stmt = connection.prepareStatement("SELECT DOMAIN_NAME FROM EC_DOMAIN WHERE ID=?")) { + stmt.setInt(1, id.id()); + var rsp = stmt.executeQuery(); + if (rsp.next()) { + return Optional.of(new EdgeDomain(rsp.getString(1))); + } + return Optional.empty(); + } + } + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/dbcommon/EdgeDomainBlacklist.java b/common/model/src/main/java/nu/marginalia/model/dbcommon/EdgeDomainBlacklist.java similarity index 74% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/dbcommon/EdgeDomainBlacklist.java rename to common/model/src/main/java/nu/marginalia/model/dbcommon/EdgeDomainBlacklist.java index 5fa6f193..f659a57a 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/dbcommon/EdgeDomainBlacklist.java +++ b/common/model/src/main/java/nu/marginalia/model/dbcommon/EdgeDomainBlacklist.java @@ -1,9 +1,9 @@ -package nu.marginalia.wmsa.edge.dbcommon; +package nu.marginalia.model.dbcommon; import com.google.inject.ImplementedBy; import gnu.trove.set.hash.TIntHashSet; -import nu.marginalia.wmsa.edge.model.EdgeDomain; -import nu.marginalia.wmsa.edge.model.id.EdgeId; +import nu.marginalia.model.EdgeDomain; +import nu.marginalia.model.id.EdgeId; @ImplementedBy(EdgeDomainBlacklistImpl.class) public interface EdgeDomainBlacklist { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/dbcommon/EdgeDomainBlacklistImpl.java b/common/model/src/main/java/nu/marginalia/model/dbcommon/EdgeDomainBlacklistImpl.java similarity index 96% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/dbcommon/EdgeDomainBlacklistImpl.java rename to 
common/model/src/main/java/nu/marginalia/model/dbcommon/EdgeDomainBlacklistImpl.java index 13d7080f..053ced8e 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/dbcommon/EdgeDomainBlacklistImpl.java +++ b/common/model/src/main/java/nu/marginalia/model/dbcommon/EdgeDomainBlacklistImpl.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.dbcommon; +package nu.marginalia.model.dbcommon; import com.google.inject.Inject; import com.google.inject.Singleton; @@ -6,6 +6,7 @@ import com.zaxxer.hikari.HikariDataSource; import gnu.trove.set.hash.TIntHashSet; import io.reactivex.rxjava3.schedulers.Schedulers; import lombok.SneakyThrows; +import nu.marginalia.model.EdgeDomain; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -67,5 +68,4 @@ public class EdgeDomainBlacklistImpl implements EdgeDomainBlacklist { return false; } - } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/client/GsonFactory.java b/common/model/src/main/java/nu/marginalia/model/gson/GsonFactory.java similarity index 88% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/client/GsonFactory.java rename to common/model/src/main/java/nu/marginalia/model/gson/GsonFactory.java index 393b2ea5..04496ff8 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/client/GsonFactory.java +++ b/common/model/src/main/java/nu/marginalia/model/gson/GsonFactory.java @@ -1,11 +1,11 @@ -package nu.marginalia.wmsa.client; +package nu.marginalia.model.gson; import com.google.gson.*; import marcono1234.gson.recordadapter.RecordTypeAdapterFactory; -import nu.marginalia.util.bigstring.BigString; -import nu.marginalia.wmsa.edge.model.EdgeDomain; -import nu.marginalia.wmsa.edge.model.EdgeUrl; -import nu.marginalia.wmsa.edge.model.id.EdgeId; +import nu.marginalia.bigstring.BigString; +import nu.marginalia.model.EdgeDomain; +import nu.marginalia.model.EdgeUrl; +import nu.marginalia.model.id.EdgeId; import java.net.URISyntaxException; diff --git 
a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/id/EdgeId.java b/common/model/src/main/java/nu/marginalia/model/id/EdgeId.java similarity index 74% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/id/EdgeId.java rename to common/model/src/main/java/nu/marginalia/model/id/EdgeId.java index 90f978ed..9e45c78f 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/id/EdgeId.java +++ b/common/model/src/main/java/nu/marginalia/model/id/EdgeId.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.model.id; +package nu.marginalia.model.id; /** diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/id/EdgeIdArray.java b/common/model/src/main/java/nu/marginalia/model/id/EdgeIdArray.java similarity index 93% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/id/EdgeIdArray.java rename to common/model/src/main/java/nu/marginalia/model/id/EdgeIdArray.java index d7e1f0f1..078dcdb6 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/id/EdgeIdArray.java +++ b/common/model/src/main/java/nu/marginalia/model/id/EdgeIdArray.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.model.id; +package nu.marginalia.model.id; import java.util.Arrays; import java.util.stream.IntStream; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/id/EdgeIdCollection.java b/common/model/src/main/java/nu/marginalia/model/id/EdgeIdCollection.java similarity index 92% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/id/EdgeIdCollection.java rename to common/model/src/main/java/nu/marginalia/model/id/EdgeIdCollection.java index ef30b78d..a8403cc6 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/id/EdgeIdCollection.java +++ b/common/model/src/main/java/nu/marginalia/model/id/EdgeIdCollection.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.model.id; +package nu.marginalia.model.id; import java.util.Arrays; import java.util.stream.IntStream; 
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/id/EdgeIdCollectionMutable.java b/common/model/src/main/java/nu/marginalia/model/id/EdgeIdCollectionMutable.java similarity index 90% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/id/EdgeIdCollectionMutable.java rename to common/model/src/main/java/nu/marginalia/model/id/EdgeIdCollectionMutable.java index 24da0714..0056cb28 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/id/EdgeIdCollectionMutable.java +++ b/common/model/src/main/java/nu/marginalia/model/id/EdgeIdCollectionMutable.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.model.id; +package nu.marginalia.model.id; import gnu.trove.TIntCollection; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/id/EdgeIdList.java b/common/model/src/main/java/nu/marginalia/model/id/EdgeIdList.java similarity index 95% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/id/EdgeIdList.java rename to common/model/src/main/java/nu/marginalia/model/id/EdgeIdList.java index e61076a6..295854ec 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/id/EdgeIdList.java +++ b/common/model/src/main/java/nu/marginalia/model/id/EdgeIdList.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.model.id; +package nu.marginalia.model.id; import gnu.trove.TIntCollection; import gnu.trove.list.array.TIntArrayList; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/id/EdgeIdSet.java b/common/model/src/main/java/nu/marginalia/model/id/EdgeIdSet.java similarity index 96% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/id/EdgeIdSet.java rename to common/model/src/main/java/nu/marginalia/model/id/EdgeIdSet.java index 98ffd2ac..5119b5c7 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/id/EdgeIdSet.java +++ b/common/model/src/main/java/nu/marginalia/model/id/EdgeIdSet.java @@ -1,4 +1,4 @@ -package 
nu.marginalia.wmsa.edge.model.id; +package nu.marginalia.model.id; import gnu.trove.TIntCollection; import gnu.trove.set.hash.TIntHashSet; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/EdgePageDocumentsMetadata.java b/common/model/src/main/java/nu/marginalia/model/idx/EdgePageDocumentsMetadata.java similarity index 96% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/EdgePageDocumentsMetadata.java rename to common/model/src/main/java/nu/marginalia/model/idx/EdgePageDocumentsMetadata.java index 4331131f..1100a49f 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/EdgePageDocumentsMetadata.java +++ b/common/model/src/main/java/nu/marginalia/model/idx/EdgePageDocumentsMetadata.java @@ -1,6 +1,7 @@ -package nu.marginalia.wmsa.edge.index.model; +package nu.marginalia.model.idx; -import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDate; +import nu.marginalia.model.crawl.EdgePageDocumentFlags; +import nu.marginalia.model.crawl.PubDate; import java.util.EnumSet; import java.util.Set; @@ -107,7 +108,6 @@ public record EdgePageDocumentsMetadata(int rank, return (int) ((encoded >>> TOPOLOGY_SHIFT) & TOPOLOGY_MASK); } public static int decodeYear(long encoded) { - return PubDate.fromYearByte((int) ((encoded >>> YEAR_SHIFT) & YEAR_MASK)); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/EdgePageWordMetadata.java b/common/model/src/main/java/nu/marginalia/model/idx/EdgePageWordMetadata.java similarity index 97% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/EdgePageWordMetadata.java rename to common/model/src/main/java/nu/marginalia/model/idx/EdgePageWordMetadata.java index 579dda92..7f8aff2a 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/EdgePageWordMetadata.java +++ b/common/model/src/main/java/nu/marginalia/model/idx/EdgePageWordMetadata.java @@ -1,5 +1,6 @@ -package 
nu.marginalia.wmsa.edge.index.model; +package nu.marginalia.model.idx; +import nu.marginalia.model.crawl.EdgePageWordFlags; import nu.marginalia.util.BrailleBlockPunchCards; import java.util.EnumSet; diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/DenseBitMap.java b/common/model/src/main/java/nu/marginalia/util/DenseBitMap.java similarity index 100% rename from marginalia_nu/src/main/java/nu/marginalia/util/DenseBitMap.java rename to common/model/src/main/java/nu/marginalia/util/DenseBitMap.java diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/interpreter/instruction/KeywordListChunker.java b/common/model/src/main/java/nu/marginalia/util/KeywordListChunker.java similarity index 91% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/interpreter/instruction/KeywordListChunker.java rename to common/model/src/main/java/nu/marginalia/util/KeywordListChunker.java index 1e30055f..1f7af342 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/interpreter/instruction/KeywordListChunker.java +++ b/common/model/src/main/java/nu/marginalia/util/KeywordListChunker.java @@ -1,4 +1,6 @@ -package nu.marginalia.wmsa.edge.converting.interpreter.instruction; +package nu.marginalia.util; + +import nu.marginalia.model.crawl.DocumentKeywords; import java.util.ArrayList; import java.util.Collections; diff --git a/common/model/src/main/java/nu/marginalia/util/LineUtils.java b/common/model/src/main/java/nu/marginalia/util/LineUtils.java new file mode 100644 index 00000000..0bb785a0 --- /dev/null +++ b/common/model/src/main/java/nu/marginalia/util/LineUtils.java @@ -0,0 +1,47 @@ +package nu.marginalia.util; + +import java.util.ArrayList; +import java.util.List; + +public class LineUtils { + + /** LF, CR, CRLF, LFCR-agnostic string-line splitter that preserves empty lines + * and, unlike String#split(regex, n), does not leave the unsplit remainder as one huge last item. + * Text after the final line terminator is not returned. + */ + public static List firstNLines(String
documentBody, int numLines) { + List lines = new ArrayList<>(numLines); + + boolean eatCr = false; + boolean eatLf = false; + int startPos = 0; + + for (int pos = 0; pos < documentBody.length() && lines.size() < numLines; pos++) { + int cp = documentBody.charAt(pos); + if (cp == '\r') { + if (eatCr) { + eatCr = false; + } + else { + eatLf = true; + lines.add(documentBody.substring(startPos, pos)); + } + startPos = pos + 1; + } else if (cp == '\n') { + if (eatLf) { + eatLf = false; + } + else { + eatCr = true; + lines.add(documentBody.substring(startPos, pos)); + } + startPos = pos + 1; + } else { + eatCr = eatLf = false; + } + } + + return lines; + } + +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/ParallelPipe.java b/common/model/src/main/java/nu/marginalia/util/ParallelPipe.java similarity index 87% rename from marginalia_nu/src/main/java/nu/marginalia/util/ParallelPipe.java rename to common/model/src/main/java/nu/marginalia/util/ParallelPipe.java index 853af8fb..fc95debe 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/ParallelPipe.java +++ b/common/model/src/main/java/nu/marginalia/util/ParallelPipe.java @@ -9,6 +9,12 @@ import java.util.List; import java.util.concurrent.LinkedBlockingQueue; import java.util.concurrent.TimeUnit; +/** Generalization of the workflow
+ * -- single provider thread reading sequentially from disk
+ * -> multiple independent CPU-bound processing tasks
+ * -> single consumer thread writing to network/disk
+ *

+ */ public abstract class ParallelPipe { private final LinkedBlockingQueue inputs; private final LinkedBlockingQueue intermediates; @@ -61,8 +67,9 @@ public abstract class ParallelPipe { } } - logger.debug("Terminating {}", Thread.currentThread().getName()); + logger.info("Terminating {}", Thread.currentThread().getName()); } + @SneakyThrows private void runReceiverThread() { while (expectingOutput || !inputs.isEmpty() || !intermediates.isEmpty()) { @@ -80,12 +87,16 @@ public abstract class ParallelPipe { logger.info("Terminating {}", Thread.currentThread().getName()); } + /** Begin processing an item */ @SneakyThrows public void accept(INPUT input) { inputs.put(input); } + /** The meat of the processor thread runtime */ protected abstract INTERMEDIATE onProcess(INPUT input) throws Exception; + + /** The meat of the consumer thread runtime */ protected abstract void onReceive(INTERMEDIATE intermediate) throws Exception; public void join() throws InterruptedException { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/QueryParams.java b/common/model/src/main/java/nu/marginalia/util/QueryParams.java similarity index 97% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/QueryParams.java rename to common/model/src/main/java/nu/marginalia/util/QueryParams.java index 8b37e4c9..430758fd 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/QueryParams.java +++ b/common/model/src/main/java/nu/marginalia/util/QueryParams.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.converting.processor.logic; +package nu.marginalia.util; import org.apache.commons.lang3.StringUtils; diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/StringPool.java b/common/model/src/main/java/nu/marginalia/util/StringPool.java similarity index 100% rename from marginalia_nu/src/main/java/nu/marginalia/util/StringPool.java rename to 
common/model/src/main/java/nu/marginalia/util/StringPool.java diff --git a/marginalia_nu/src/main/resources/sql/edge-crawler-cache.sql b/common/model/src/main/resources/sql/edge-crawler-cache.sql similarity index 98% rename from marginalia_nu/src/main/resources/sql/edge-crawler-cache.sql rename to common/model/src/main/resources/sql/edge-crawler-cache.sql index f0dc851c..3f93a4df 100644 --- a/marginalia_nu/src/main/resources/sql/edge-crawler-cache.sql +++ b/common/model/src/main/resources/sql/edge-crawler-cache.sql @@ -74,7 +74,7 @@ CREATE TABLE IF NOT EXISTS EC_PAGE_DATA ( FORMAT ENUM('PLAIN', 'UNKNOWN', 'HTML123', 'HTML4', 'XHTML', 'HTML5', 'MARKDOWN') NOT NULL, FEATURES INT COMMENT "Bit-encoded feature set of document, @see HtmlFeature" NOT NULL, - DATA_HASH INTEGER NOT NULL, + DATA_HASH BIGINT NOT NULL, QUALITY DOUBLE NOT NULL, PUB_YEAR SMALLINT, @@ -212,6 +212,13 @@ CREATE INDEX IF NOT EXISTS EC_DOMAIN_TOP_DOMAIN ON EC_DOMAIN (DOMAIN_TOP); ---; +CREATE TABLE IF NOT EXISTS EC_RANDOM_DOMAINS ( + DOMAIN_ID INT PRIMARY KEY, + DOMAIN_SET INT NOT NULL +); + +---; + DROP TABLE IF EXISTS REF_DICTIONARY; CREATE TABLE IF NOT EXISTS REF_DICTIONARY ( @@ -287,3 +294,4 @@ CREATE TABLE WMSA_PROCESS( MUTEX VARCHAR(255), TIMEOUT INT NOT NULL DEFAULT 60 ); + diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/model/EdgeDomainTest.java b/common/model/src/test/java/nu/marginalia/model/EdgeDomainTest.java similarity index 98% rename from marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/model/EdgeDomainTest.java rename to common/model/src/test/java/nu/marginalia/model/EdgeDomainTest.java index 13686997..9fbf6890 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/model/EdgeDomainTest.java +++ b/common/model/src/test/java/nu/marginalia/model/EdgeDomainTest.java @@ -1,5 +1,6 @@ -package nu.marginalia.wmsa.edge.model; +package nu.marginalia.model; +import nu.marginalia.model.EdgeUrl; import org.junit.jupiter.api.Test; import java.net.URISyntaxException; 
diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/model/crawl/EdgePageWordMetadataTest.java b/common/model/src/test/java/nu/marginalia/model/EdgePageWordMetadataTest.java similarity index 93% rename from marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/model/crawl/EdgePageWordMetadataTest.java rename to common/model/src/test/java/nu/marginalia/model/EdgePageWordMetadataTest.java index f3cbfa77..dcbc83e5 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/model/crawl/EdgePageWordMetadataTest.java +++ b/common/model/src/test/java/nu/marginalia/model/EdgePageWordMetadataTest.java @@ -1,7 +1,7 @@ -package nu.marginalia.wmsa.edge.model.crawl; +package nu.marginalia.model; -import nu.marginalia.wmsa.edge.index.model.EdgePageWordFlags; -import nu.marginalia.wmsa.edge.index.model.EdgePageWordMetadata; +import nu.marginalia.model.crawl.EdgePageWordFlags; +import nu.marginalia.model.idx.EdgePageWordMetadata; import org.junit.jupiter.api.Test; import java.util.EnumSet; diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/model/EdgeUrlTest.java b/common/model/src/test/java/nu/marginalia/model/EdgeUrlTest.java similarity index 96% rename from marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/model/EdgeUrlTest.java rename to common/model/src/test/java/nu/marginalia/model/EdgeUrlTest.java index 61444c69..d746ec3b 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/model/EdgeUrlTest.java +++ b/common/model/src/test/java/nu/marginalia/model/EdgeUrlTest.java @@ -1,5 +1,6 @@ -package nu.marginalia.wmsa.edge.model; +package nu.marginalia.model; +import nu.marginalia.model.EdgeUrl; import org.junit.jupiter.api.Test; import java.net.URISyntaxException; diff --git a/marginalia_nu/src/test/java/nu/marginalia/util/BrailleBlockPunchCardsTest.java b/common/model/src/test/java/nu/marginalia/util/BrailleBlockPunchCardsTest.java similarity index 100% rename from 
marginalia_nu/src/test/java/nu/marginalia/util/BrailleBlockPunchCardsTest.java rename to common/model/src/test/java/nu/marginalia/util/BrailleBlockPunchCardsTest.java diff --git a/marginalia_nu/src/test/java/nu/marginalia/util/DenseBitMapTest.java b/common/model/src/test/java/nu/marginalia/util/DenseBitMapTest.java similarity index 97% rename from marginalia_nu/src/test/java/nu/marginalia/util/DenseBitMapTest.java rename to common/model/src/test/java/nu/marginalia/util/DenseBitMapTest.java index 20857947..5f6d6aec 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/util/DenseBitMapTest.java +++ b/common/model/src/test/java/nu/marginalia/util/DenseBitMapTest.java @@ -1,5 +1,6 @@ package nu.marginalia.util; +import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Test; import static org.junit.jupiter.api.Assertions.*; diff --git a/common/model/src/test/java/nu/marginalia/util/LineUtilsTest.java b/common/model/src/test/java/nu/marginalia/util/LineUtilsTest.java new file mode 100644 index 00000000..e63ca38f --- /dev/null +++ b/common/model/src/test/java/nu/marginalia/util/LineUtilsTest.java @@ -0,0 +1,16 @@ +package nu.marginalia.util; + +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; + +import java.util.List; + +class LineUtilsTest { + + @Test + void firstNLines() { + String text = "a\nb\r\ncd\r\re\n\rffgg\n\n"; + List expected = List.of("a", "b", "cd", "", "e", "ffgg", ""); + Assertions.assertEquals(expected, LineUtils.firstNLines(text, 10)); + } +} \ No newline at end of file diff --git a/common/readme.md b/common/readme.md new file mode 100644 index 00000000..3860c029 --- /dev/null +++ b/common/readme.md @@ -0,0 +1,9 @@ +# Common + +These are packages containing the basic building blocks for running a service as well +as shared models. + +* [config](config/) contains some `@Inject`ables. +* [service](service/) is the shared base classes for main methods and web services. 
+* [service-client](service-client/) is the shared base class for RPC. +* [service-discovery](service-discovery) contains tools that lets the services find each other. \ No newline at end of file diff --git a/common/service-client/build.gradle b/common/service-client/build.gradle new file mode 100644 index 00000000..916eeaeb --- /dev/null +++ b/common/service-client/build.gradle @@ -0,0 +1,56 @@ +plugins { + id 'java' + id "io.freefair.lombok" version "5.3.3.3" + id "de.undercouch.download" version "5.1.0" + id 'jvm-test-suite' +} + +java { + toolchain { + languageVersion.set(JavaLanguageVersion.of(17)) + } +} + +dependencies { + implementation project(':third-party') + implementation project(':protocol') + implementation project(':common:service-discovery') + + + implementation libs.lombok + annotationProcessor libs.lombok + implementation libs.bundles.slf4j + + implementation libs.commons.lang3 + implementation libs.spark + implementation libs.guice + implementation libs.rxjava + + implementation libs.okhttp3 + implementation libs.bundles.httpcomponents + + implementation libs.bundles.gson + implementation libs.protobuf + + implementation libs.bundles.prometheus + + implementation libs.bundles.mariadb + + testImplementation libs.bundles.slf4j.test + testImplementation libs.bundles.junit + testImplementation libs.mockito +} + +test { + maxParallelForks = Runtime.runtime.availableProcessors().intdiv(2) ?: 1 + maxHeapSize = "8G" + useJUnitPlatform() +} + +task fastTests(type: Test) { + maxParallelForks = Runtime.runtime.availableProcessors().intdiv(2) ?: 1 + maxHeapSize = "8G" + useJUnitPlatform { + excludeTags "slow" + } +} \ No newline at end of file diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/client/AbortingScheduler.java b/common/service-client/src/main/java/nu/marginalia/client/AbortingScheduler.java similarity index 94% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/client/AbortingScheduler.java rename to 
common/service-client/src/main/java/nu/marginalia/client/AbortingScheduler.java index fc06bd26..2ec196e6 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/client/AbortingScheduler.java +++ b/common/service-client/src/main/java/nu/marginalia/client/AbortingScheduler.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.client; +package nu.marginalia.client; import com.google.common.util.concurrent.ThreadFactoryBuilder; import io.reactivex.rxjava3.core.Scheduler; @@ -12,7 +12,7 @@ import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.ThreadFactory; -public class AbortingScheduler implements AutoCloseable { +public class AbortingScheduler { private final String name; private final ThreadFactory threadFactory; @@ -54,7 +54,6 @@ public class AbortingScheduler implements AutoCloseable { return executorService; } - @Override public synchronized void close() { if (null != executorService) { executorService.shutdown(); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/client/AbstractClient.java b/common/service-client/src/main/java/nu/marginalia/client/AbstractClient.java similarity index 64% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/client/AbstractClient.java rename to common/service-client/src/main/java/nu/marginalia/client/AbstractClient.java index eac18abf..5be58520 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/client/AbstractClient.java +++ b/common/service-client/src/main/java/nu/marginalia/client/AbstractClient.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.client; +package nu.marginalia.client; import com.google.gson.Gson; import com.google.protobuf.GeneratedMessageV3; @@ -6,11 +6,10 @@ import io.reactivex.rxjava3.core.Observable; import io.reactivex.rxjava3.core.ObservableSource; import io.reactivex.rxjava3.plugins.RxJavaPlugins; import lombok.SneakyThrows; -import nu.marginalia.wmsa.client.exception.LocalException; -import 
nu.marginalia.wmsa.client.exception.NetworkException; -import nu.marginalia.wmsa.client.exception.RemoteException; -import nu.marginalia.wmsa.client.exception.RouteNotConfiguredException; -import nu.marginalia.wmsa.configuration.server.Context; +import nu.marginalia.client.exception.LocalException; +import nu.marginalia.client.exception.NetworkException; +import nu.marginalia.client.exception.RemoteException; +import nu.marginalia.client.exception.RouteNotConfiguredException; import okhttp3.*; import org.apache.http.HttpHost; import org.apache.logging.log4j.ThreadContext; @@ -24,32 +23,33 @@ import java.net.ConnectException; import java.util.Arrays; import java.util.List; import java.util.concurrent.TimeUnit; +import java.util.function.Supplier; import java.util.zip.GZIPOutputStream; public abstract class AbstractClient implements AutoCloseable { - public static final String CONTEXT_OUTBOUND_REQUEST = "outbound-request"; - - private final Gson gson = GsonFactory.get(); - private final Logger logger = LoggerFactory.getLogger(getClass()); + public static final String CONTEXT_OUTBOUND_REQUEST = "outbound-request"; + private final Gson gson; private final OkHttpClient client; private boolean quiet; - private String url; + private String serviceRoute; + private int timeout; + + + private volatile boolean alive; + private final Thread livenessMonitor; public void setTimeout(int timeout) { this.timeout = timeout; } - private int timeout; - private volatile boolean alive; - - private final Thread livenessMonitor; - - public AbstractClient(String host, int port, int timeout) { + public AbstractClient(String host, int port, int timeout, Supplier gsonProvider) { logger.info("Creating client for {}[{}:{}]", getClass().getSimpleName(), host, port); + this.gson = gsonProvider.get(); + this.timeout = timeout; client = new OkHttpClient.Builder() .connectTimeout(100, TimeUnit.MILLISECONDS) @@ -57,7 +57,7 @@ public abstract class AbstractClient implements AutoCloseable { 
.retryOnConnectionFailure(true) .followRedirects(true) .build(); - url = new HttpHost(host, port).toURI(); + serviceRoute = new HttpHost(host, port).toURI(); RxJavaPlugins.setErrorHandler(e -> { if (e.getMessage() == null) { @@ -76,28 +76,37 @@ public abstract class AbstractClient implements AutoCloseable { public void setServiceRoute(String hostname, int port) { scheduler().abort(); - url = new HttpHost(hostname, port).toURI(); + serviceRoute = new HttpHost(hostname, port).toURI(); + } + + protected String getServiceRoute() { + return serviceRoute; } @SneakyThrows private void monitorLiveness() { Thread.sleep(100); // Wait for initialization - for (;;) { - try { - alive = isResponsive(); - } - // - catch (Exception ex) { - logger.warn("Oops", ex); - } - synchronized (livenessMonitor) { - if (alive) { - livenessMonitor.wait(1000); + try { + for (; ; ) { + try { + alive = isResponsive(); + } + // + catch (Exception ex) { + logger.warn("Oops", ex); + } + synchronized (livenessMonitor) { + if (alive) { + livenessMonitor.wait(1000); + } + } + if (!alive) { + Thread.sleep(100); } } - if (!alive) { - Thread.sleep(100); - } + } + catch (InterruptedException ex) { + // nothing to see here } } @@ -117,11 +126,9 @@ public abstract class AbstractClient implements AutoCloseable { public synchronized boolean isResponsive() { Context ctx = Context.internal("ping"); - var req = ctx.paint(new Request.Builder()).url(url + "/internal/ping").get().build(); + var req = ctx.paint(new Request.Builder()).url(serviceRoute + "/internal/ping").get().build(); - var call = client.newCall(req); - - return Observable.just(call) + return Observable.just(client.newCall(req)) .subscribeOn(scheduler().get()) .map(Call::execute) .map(this::getResponseStatus) @@ -135,11 +142,9 @@ public abstract class AbstractClient implements AutoCloseable { public synchronized boolean isAccepting() { Context ctx = Context.internal("ready"); - var req = ctx.paint(new Request.Builder()).url(url + 
"/internal/ready").get().build(); + var req = ctx.paint(new Request.Builder()).url(serviceRoute + "/internal/ready").get().build(); - var call = client.newCall(req); - - return Observable.just(call) + return Observable.just(client.newCall(req)) .subscribeOn(scheduler().get()) .map(Call::execute) .map(this::getResponseStatus) @@ -156,19 +161,12 @@ public abstract class AbstractClient implements AutoCloseable { ensureAlive(); - RequestBody body = RequestBody.create( - MediaType.parse("application/json; charset=utf-8"), - json(data)); + RequestBody body = RequestBody.create(json(data), MediaType.parse("application/json; charset=utf-8")); - var req = ctx.paint(new Request.Builder()).url(url + endpoint).post(body).build(); - var call = client.newCall(req); + var req = ctx.paint(new Request.Builder()).url(serviceRoute + endpoint).post(body).build(); return Observable - .just(call) - .map((c) -> { - ThreadContext.put("outbound-request", url + endpoint); - return c; - }) + .just(client.newCall(req)) .subscribeOn(scheduler().get()) .map(this::logInbound) .map(Call::execute) @@ -186,15 +184,13 @@ public abstract class AbstractClient implements AutoCloseable { ensureAlive(); - RequestBody body = RequestBody.create( - MediaType.parse("application/protobuf"), - data.toByteArray()); + RequestBody body = RequestBody.create(data.toByteArray(), MediaType.parse("application/protobuf")); - var req = ctx.paint(new Request.Builder()).url(url + endpoint).post(body).build(); + var req = ctx.paint(new Request.Builder()).url(serviceRoute + endpoint).post(body).build(); var call = client.newCall(req); logInbound(call); - ThreadContext.put("outbound-request", url + endpoint); + ThreadContext.put("outbound-request", serviceRoute + endpoint); try (var rsp = call.execute()) { logOutbound(rsp); int code = rsp.code(); @@ -212,18 +208,10 @@ public abstract class AbstractClient implements AutoCloseable { ensureAlive(); - RequestBody body = RequestBody.create( - MediaType.parse("application/json"), 
- json(data)); + RequestBody body = RequestBody.create(json(data), MediaType.parse("application/json")); + var req = ctx.paint(new Request.Builder()).url(serviceRoute + endpoint).post(body).build(); - var req = ctx.paint(new Request.Builder()).url(url + endpoint).post(body).build(); - var call = client.newCall(req); - - return Observable.just(call) - .map((c) -> { - ThreadContext.put("outbound-request", url + endpoint); - return c; - }) + return Observable.just(client.newCall(req)) .subscribeOn(scheduler().get()) .map(this::logInbound) .map(Call::execute) @@ -238,15 +226,15 @@ public abstract class AbstractClient implements AutoCloseable { protected synchronized Observable post(Context ctx, String endpoint, String data, MediaType mediaType) { ensureAlive(); - var body = RequestBody.create(mediaType, data); + var body = RequestBody.create(data, mediaType); - var req = ctx.paint(new Request.Builder()).url(url + endpoint).post(body).build(); + var req = ctx.paint(new Request.Builder()).url(serviceRoute + endpoint).post(body).build(); var call = client.newCall(req); return Observable.just(call) .map((c) -> { - ThreadContext.put(CONTEXT_OUTBOUND_REQUEST, url + endpoint); + ThreadContext.put(CONTEXT_OUTBOUND_REQUEST, serviceRoute + endpoint); return c; }) .subscribeOn(scheduler().get()) @@ -264,14 +252,9 @@ public abstract class AbstractClient implements AutoCloseable { protected synchronized Observable get(Context ctx, String endpoint, Class type) { ensureAlive(); - var req = ctx.paint(new Request.Builder()).url(url + endpoint).get().build(); - var call = client.newCall(req); + var req = ctx.paint(new Request.Builder()).url(serviceRoute + endpoint).get().build(); - return Observable.just(call) - .map((c) -> { - ThreadContext.put("outbound-request", url + endpoint); - return c; - }) + return Observable.just(client.newCall(req)) .subscribeOn(scheduler().get()) .map(this::logInbound) .map(Call::execute) @@ -284,62 +267,12 @@ public abstract class AbstractClient implements 
AutoCloseable { } @SuppressWarnings("unchecked") - protected synchronized Observable> getList(Context ctx, String endpoint, Class type) { - ensureAlive(); - - var req = ctx.paint(new Request.Builder()).url(url + endpoint).get().build(); - var call = client.newCall(req); - - return Observable.just(call) - .map((c) -> { - ThreadContext.put("outbound-request", url + endpoint); - return c; - }) - .subscribeOn(scheduler().get()) - .map(this::logInbound) - .map(Call::execute) - .map(this::logOutbound) - .map(rsp -> validateResponseStatus(rsp, req, 200)) - .map(rsp -> Arrays.asList((T[])getEntity(rsp, type.arrayType()))) - .retryWhen(this::retryHandler) - .timeout(timeout, TimeUnit.SECONDS) - .doFinally(() -> ThreadContext.remove("outbound-request")); - } - - - protected synchronized Observable getBinary(Context ctx, String endpoint) { - ensureAlive(); - - var req = ctx.paint(new Request.Builder()).url(url + endpoint).get().build(); - var call = client.newCall(req); - - return Observable.just(call) - .map((c) -> { - ThreadContext.put("outbound-request", url + endpoint); - return c; - }) - .subscribeOn(scheduler().get()) - .map(this::logInbound) - .map(Call::execute) - .map(this::logOutbound) - .map(rsp -> validateResponseStatus(rsp, req, 200)) - .map(this::getBinaryEntity) - .retryWhen(this::retryHandler) - .timeout(timeout, TimeUnit.SECONDS) - .doFinally(() -> ThreadContext.remove("outbound-request")); - } - protected synchronized Observable get(Context ctx, String endpoint) { ensureAlive(); - var req = ctx.paint(new Request.Builder()).url(url + endpoint).get().build(); - var call = client.newCall(req); + var req = ctx.paint(new Request.Builder()).url(serviceRoute + endpoint).get().build(); - return Observable.just(call) - .map((c) -> { - ThreadContext.put("outbound-request", url + endpoint); - return c; - }) + return Observable.just(client.newCall(req)) .subscribeOn(scheduler().get()) .map(this::logInbound) .map(Call::execute) @@ -354,14 +287,9 @@ public abstract class 
AbstractClient implements AutoCloseable { protected synchronized Observable delete(Context ctx, String endpoint) { ensureAlive(); - var req = ctx.paint(new Request.Builder()).url(url + endpoint).delete().build(); - var call = client.newCall(req); + var req = ctx.paint(new Request.Builder()).url(serviceRoute + endpoint).delete().build(); - return Observable.just(call) - .map((c) -> { - ThreadContext.put("outbound-request", url + endpoint); - return c; - }) + return Observable.just(client.newCall(req)) .subscribeOn(scheduler().get()) .map(this::logInbound) .map(Call::execute) @@ -390,26 +318,12 @@ public abstract class AbstractClient implements AutoCloseable { if (!isAlive()) { wait(2000); if (!isAlive()) { - throw new RouteNotConfiguredException("Route not configured for " + name()); + throw new RouteNotConfiguredException("Route not configured for " + name() + " -- tried " + serviceRoute); } } } - @SneakyThrows - public void waitReady() { - boolean accepting = isAccepting(); - if (accepting) { - return; - } - - logger.info("Waiting for " + name()); - do { - Thread.sleep(1000); - } while (!isAccepting()); - } - - private ObservableSource retryHandler(Observable error) { return error.flatMap(this::filterRetryableExceptions); } @@ -488,12 +402,6 @@ public abstract class AbstractClient implements AutoCloseable { } - @SneakyThrows - private byte[] getBinaryEntity(Response response) { - try (response) { - return response.body().bytes(); - } - } public boolean isAlive() { return alive; } @@ -507,17 +415,4 @@ public abstract class AbstractClient implements AutoCloseable { } } - private byte[] compressedJson(Object o) throws IOException { - ByteArrayOutputStream baos = new ByteArrayOutputStream(); - GZIPOutputStream gos = new GZIPOutputStream(baos); - try { - gson.toJson(o, new OutputStreamWriter(gos)); - gos.finish(); - return baos.toByteArray(); - } - catch (Exception ex) { - throw new LocalException(ex); - } - } - } diff --git 
a/marginalia_nu/src/main/java/nu/marginalia/wmsa/client/AbstractDynamicClient.java b/common/service-client/src/main/java/nu/marginalia/client/AbstractDynamicClient.java similarity index 66% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/client/AbstractDynamicClient.java rename to common/service-client/src/main/java/nu/marginalia/client/AbstractDynamicClient.java index 974a939e..f85c2898 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/client/AbstractDynamicClient.java +++ b/common/service-client/src/main/java/nu/marginalia/client/AbstractDynamicClient.java @@ -1,21 +1,23 @@ -package nu.marginalia.wmsa.client; +package nu.marginalia.client; +import com.google.gson.Gson; import io.reactivex.rxjava3.core.Observable; import lombok.SneakyThrows; -import nu.marginalia.wmsa.configuration.ServiceDescriptor; -import nu.marginalia.wmsa.configuration.server.Context; +import nu.marginalia.service.descriptor.ServiceDescriptor; +import nu.marginalia.service.descriptor.HostsFile; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import javax.annotation.Nonnull; +import java.util.function.Supplier; public class AbstractDynamicClient extends AbstractClient { private final ServiceDescriptor service; private final Logger logger = LoggerFactory.getLogger(getClass()); private final AbortingScheduler scheduler; - public AbstractDynamicClient(@Nonnull ServiceDescriptor service) { - super(service.getHost(), service.port, 10); + public AbstractDynamicClient(@Nonnull ServiceDescriptor service, HostsFile hosts, Supplier gsonProvider) { + super(hosts.getHost(service), service.port, 10, gsonProvider); this.service = service; this.scheduler = new AbortingScheduler(name()); @@ -32,7 +34,7 @@ public class AbstractDynamicClient extends AbstractClient { @SneakyThrows public void blockingWait() { - logger.info("Waiting for route to {}", service); + logger.info("Waiting for route to {} ({})", service, getServiceRoute()); while (!isAlive()) { Thread.sleep(1000); } @@ 
-43,10 +45,4 @@ public class AbstractDynamicClient extends AbstractClient { return scheduler; } - public Observable who(Context ctx) { - return get(ctx, "/public/who"); - } - public Observable ping(Context ctx) { - return get(ctx, "/internal/ping"); - } } diff --git a/common/service-client/src/main/java/nu/marginalia/client/Context.java b/common/service-client/src/main/java/nu/marginalia/client/Context.java new file mode 100644 index 00000000..9449d20f --- /dev/null +++ b/common/service-client/src/main/java/nu/marginalia/client/Context.java @@ -0,0 +1,89 @@ +package nu.marginalia.client; + +import io.reactivex.rxjava3.schedulers.Schedulers; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import spark.Request; + +import java.util.*; +import java.util.concurrent.TimeUnit; + +public class Context { + public static final String CONTEXT_HEADER = "X-Context"; + public static final String SESSION_HEADER = "Cookie"; + public static final String PUBLIC_HEADER = "X-Public"; + + private final String id; + private final String session; + private boolean treatAsPublic; + + private Context(String id, String session) { + this.id = Objects.requireNonNull(id, "Context missing"); + this.session = session; + } + + public Context treatAsPublic() { + this.treatAsPublic = true; + return this; + } + + public static Context internal() { + return new Context(UUID.randomUUID().toString(), null); + } + public static Context internal(String why) { + return new Context(why + ":" + System.nanoTime(), null); + } + + public static Context fromRequest(Request request) { + + if (Boolean.getBoolean("unit-test")) { + return Context.internal(); + } + + final var ctxHeader = anonymizeContext(request); + final var sessHeader = request.headers(SESSION_HEADER); + + return new Context(ctxHeader, sessHeader); + } + + private static String anonymizeContext(Request request) { + String header = request.headers(CONTEXT_HEADER); + if (header != null && header.contains("-")) { + // The public 
X-Context header contains info that traces to the + // external user's IP. Anonymize this by running it through a + // hash code blender with rotating salt + + return ContextScrambler.anonymize(header); + } + else if (header != null) { + return header; + } + else { + // When no X-Context is provided, synthesize one from path + return request.pathInfo() + ":" + Thread.currentThread().getId(); + } + } + + public okhttp3.Request.Builder paint(okhttp3.Request.Builder requestBuilder) { + requestBuilder.addHeader(CONTEXT_HEADER, id); + + if (session != null) { + requestBuilder.addHeader(SESSION_HEADER, session); + } + + if (treatAsPublic) { + requestBuilder.header(PUBLIC_HEADER, "1"); + } + + return requestBuilder; + } + + public String getContextId() { + return id; + } + + public boolean isPublic() { + return id.startsWith("#"); + } + +} \ No newline at end of file diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/client/HttpStatusCode.java b/common/service-client/src/main/java/nu/marginalia/client/HttpStatusCode.java similarity index 92% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/client/HttpStatusCode.java rename to common/service-client/src/main/java/nu/marginalia/client/HttpStatusCode.java index 3b39ae84..aa23e71d 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/client/HttpStatusCode.java +++ b/common/service-client/src/main/java/nu/marginalia/client/HttpStatusCode.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.client; +package nu.marginalia.client; public final class HttpStatusCode { public final int code; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/client/exception/LocalException.java b/common/service-client/src/main/java/nu/marginalia/client/exception/LocalException.java similarity index 88% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/client/exception/LocalException.java rename to common/service-client/src/main/java/nu/marginalia/client/exception/LocalException.java index f721de69..bcaa2982 
100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/client/exception/LocalException.java +++ b/common/service-client/src/main/java/nu/marginalia/client/exception/LocalException.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.client.exception; +package nu.marginalia.client.exception; public class LocalException extends MessagingException { public LocalException() { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/client/exception/MessagingException.java b/common/service-client/src/main/java/nu/marginalia/client/exception/MessagingException.java similarity index 90% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/client/exception/MessagingException.java rename to common/service-client/src/main/java/nu/marginalia/client/exception/MessagingException.java index f08b47b7..6151381e 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/client/exception/MessagingException.java +++ b/common/service-client/src/main/java/nu/marginalia/client/exception/MessagingException.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.client.exception; +package nu.marginalia.client.exception; public class MessagingException extends RuntimeException { public MessagingException() { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/client/exception/NetworkException.java b/common/service-client/src/main/java/nu/marginalia/client/exception/NetworkException.java similarity index 88% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/client/exception/NetworkException.java rename to common/service-client/src/main/java/nu/marginalia/client/exception/NetworkException.java index e39028fb..593333ad 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/client/exception/NetworkException.java +++ b/common/service-client/src/main/java/nu/marginalia/client/exception/NetworkException.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.client.exception; +package nu.marginalia.client.exception; public class NetworkException extends MessagingException { 
public NetworkException() { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/client/exception/RemoteException.java b/common/service-client/src/main/java/nu/marginalia/client/exception/RemoteException.java similarity index 88% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/client/exception/RemoteException.java rename to common/service-client/src/main/java/nu/marginalia/client/exception/RemoteException.java index ed2c8645..d26df9b3 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/client/exception/RemoteException.java +++ b/common/service-client/src/main/java/nu/marginalia/client/exception/RemoteException.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.client.exception; +package nu.marginalia.client.exception; public class RemoteException extends MessagingException { public RemoteException() { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/client/exception/RouteNotConfiguredException.java b/common/service-client/src/main/java/nu/marginalia/client/exception/RouteNotConfiguredException.java similarity index 90% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/client/exception/RouteNotConfiguredException.java rename to common/service-client/src/main/java/nu/marginalia/client/exception/RouteNotConfiguredException.java index 7f2a4c40..c3155dcf 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/client/exception/RouteNotConfiguredException.java +++ b/common/service-client/src/main/java/nu/marginalia/client/exception/RouteNotConfiguredException.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.client.exception; +package nu.marginalia.client.exception; public class RouteNotConfiguredException extends MessagingException { public RouteNotConfiguredException() { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/client/exception/TimeoutException.java b/common/service-client/src/main/java/nu/marginalia/client/exception/TimeoutException.java similarity index 88% rename from 
marginalia_nu/src/main/java/nu/marginalia/wmsa/client/exception/TimeoutException.java rename to common/service-client/src/main/java/nu/marginalia/client/exception/TimeoutException.java index 35adc152..17632758 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/client/exception/TimeoutException.java +++ b/common/service-client/src/main/java/nu/marginalia/client/exception/TimeoutException.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.client.exception; +package nu.marginalia.client.exception; public class TimeoutException extends MessagingException { public TimeoutException() { diff --git a/common/service-client/src/test/java/nu/marginalia/client/AbstractClientTest.java b/common/service-client/src/test/java/nu/marginalia/client/AbstractClientTest.java new file mode 100644 index 00000000..db590adf --- /dev/null +++ b/common/service-client/src/test/java/nu/marginalia/client/AbstractClientTest.java @@ -0,0 +1,180 @@ +package nu.marginalia.client; + +import com.google.gson.Gson; +import io.reactivex.rxjava3.core.Observable; +import lombok.AllArgsConstructor; +import lombok.Data; +import lombok.SneakyThrows; +import org.junit.jupiter.api.*; +import spark.Request; +import spark.Response; +import spark.Spark; + +import java.util.ArrayList; +import java.util.List; +import java.util.Random; + +import static org.junit.jupiter.api.Assertions.*; + +public class AbstractClientTest { + + static TestServer testServer; + static AbstractClient client; + Gson gson = new Gson(); + + @Data @AllArgsConstructor + private static class DummyObject { + public int num; + public String str; + } + + @BeforeAll + public static void setUp() { + int port = new Random().nextInt(6000, 10000); + testServer = new TestServer(port); + + client = new AbstractClient("localhost", port, 1, Gson::new) { + @Override + public AbortingScheduler scheduler() { + return new AbortingScheduler(name()); + } + + @Override + public String name() { + return "test"; + } + }; + client.setTimeout(1); + } + + + 
@AfterAll + public static void tearDown() { + testServer.close(); + client.close(); + } + + private void assertError(Observable observable) { + try { + observable.blockingSubscribe(); + } + catch (RuntimeException ex) { + System.out.println("Got exception " + ex.getClass().getSimpleName() + " -- as expected!" ); + return; + } + Assertions.fail("Expected exception"); + } + @SneakyThrows + private Object timeout(Request request, Response response) { + Thread.sleep(5000); + return "yawn"; + } + @SneakyThrows + private Object error404(Request request, Response response) { + Spark.halt(404); + return ""; + } + + @Test + public void testGetTimeout() { + testServer.get(this::timeout); + + assertError(client.get(Context.internal(), "/get")); + } + + @Test + public void testPostTimeout() { + testServer.post(this::timeout); + + assertError(client.post(Context.internal(), "/post", "test")); + } + + @Test + public void testDeleteTimeout() { + testServer.delete(this::timeout); + + assertError(client.delete(Context.internal(), "/delete")); /* must hit the path TestServer routes DELETE on; any other path never reaches the sleeping handler */ + } + + @Test + public void testPost404() { + testServer.post(this::error404); + + assertError(client.post(Context.internal(), "/post", "test")); + } + @Test + public void testGet404() { + testServer.get(this::error404); + + assertError(client.get(Context.internal(), "/get")); + } + @Test + public void testDelete404() { + testServer.delete(this::error404); + + assertError(client.delete(Context.internal(), "/delete")); + } + + @Test + public void testGet() { + testServer.get((req, rsp) -> "Hello World"); + + assertEquals("Hello World", client.get(Context.internal(), "/get").blockingFirst()); + } + + @Test + public void testAcceptingUp() { + testServer.setReady(true); + assertTrue(client.isAccepting()); + } + + @Test + public void testAcceptingDown() { + testServer.setReady(false); + assertFalse(client.isAccepting()); + } + + @Test + public void testGetJson() { + testServer.get((req, rsp) -> new DummyObject(5, "23"), new Gson()::toJson); + + 
assertEquals(new DummyObject(5, "23"), + client.get(Context.internal(), "/get", DummyObject.class).blockingFirst()); + } + + + @Test + public void testDelete() { + testServer.delete((req, rsp) -> "Hello World"); + + assertTrue(client.delete(Context.internal(), "/delete").blockingFirst().isGood()); + } + + + @Test + public void testPost() { + List<DummyObject> inbox = new ArrayList<>(); + testServer.post((req, rsp) -> { + inbox.add(gson.fromJson(req.body(), DummyObject.class)); + return "ok"; + }); + + client.post(Context.internal(), "/post", new DummyObject(5, "23")).blockingSubscribe(); + assertEquals(1, inbox.size()); + assertEquals(new DummyObject(5, "23"), inbox.get(0)); + } + + @Test + public void testPostGet() { + List<DummyObject> inbox = new ArrayList<>(); + testServer.post((req, rsp) -> { + inbox.add(gson.fromJson(req.body(), DummyObject.class)); + return new DummyObject(1, "ret"); + }, gson::toJson); + + var ret = client.postGet(Context.internal(), "/post", new DummyObject(5, "23"), DummyObject.class).blockingFirst(); + assertEquals(1, inbox.size()); + assertEquals(new DummyObject(5, "23"), inbox.get(0)); + assertEquals(new DummyObject(1, "ret"), ret); + } +} diff --git a/common/service-client/src/test/java/nu/marginalia/client/TestServer.java b/common/service-client/src/test/java/nu/marginalia/client/TestServer.java new file mode 100644 index 00000000..cbfe1075 --- /dev/null +++ b/common/service-client/src/test/java/nu/marginalia/client/TestServer.java @@ -0,0 +1,59 @@ +package nu.marginalia.client; + +import spark.Request; +import spark.Response; +import spark.Spark; + +import java.util.function.BiFunction; +import java.util.function.Function; + +public class TestServer { + BiFunction<Request, Response, Object> onGet; /* handler behind GET /get, swapped per test */ + BiFunction<Request, Response, Object> onPost; /* handler behind POST /post */ + BiFunction<Request, Response, Object> onDelete; /* handler behind DELETE /delete */ + + + boolean isReady; + + public TestServer(int port) { + Spark.port(port); + Spark.get("/internal/ping", (r,q) -> "pong"); + Spark.get("/internal/ready", this::ready); + Spark.get("/get", (request, response) -> onGet.apply(request, response)); + 
Spark.post("/post", (request, response) -> onPost.apply(request, response)); + Spark.delete("/delete", (request, response) -> onDelete.apply(request, response)); + } + + private Object ready(Request request, Response response) { + if (isReady) { + return ""; + } + else { + response.status(401); + return "bad"; + } + } + + public void close() { + Spark.stop(); + } + + public boolean isReady() { + return isReady; + } + + public void setReady(boolean ready) { + isReady = ready; + } + + public TestServer get(BiFunction<Request, Response, Object> onGet) { this.onGet = onGet; return this; } + public TestServer get(BiFunction<Request, Response, Object> onGet, Function<Object, Object> transform) { + this.onGet = onGet.andThen(transform); + return this; + } + public TestServer delete(BiFunction<Request, Response, Object> onDelete) { this.onDelete = onDelete; return this; } + public TestServer post(BiFunction<Request, Response, Object> onPost) { this.onPost = onPost; return this; } + public TestServer post(BiFunction<Request, Response, Object> onPost, Function<Object, Object> transform) { + this.onPost = onPost.andThen(transform); return this; + } +} diff --git a/common/service-discovery/build.gradle b/common/service-discovery/build.gradle new file mode 100644 index 00000000..b870a937 --- /dev/null +++ b/common/service-discovery/build.gradle @@ -0,0 +1,22 @@ +plugins { + id 'java' + +} + +repositories { + mavenLocal() + mavenCentral() +} + +java { + toolchain { + languageVersion.set(JavaLanguageVersion.of(17)) + } +} +dependencies { + implementation libs.bundles.slf4j + + testImplementation libs.bundles.slf4j.test + testImplementation libs.bundles.junit + testImplementation libs.mockito +} diff --git a/common/service-discovery/src/main/java/nu/marginalia/service/SearchServiceDescriptors.java b/common/service-discovery/src/main/java/nu/marginalia/service/SearchServiceDescriptors.java new file mode 100644 index 00000000..a1f2bf13 --- /dev/null +++ b/common/service-discovery/src/main/java/nu/marginalia/service/SearchServiceDescriptors.java @@ -0,0 +1,17 @@ +package nu.marginalia.service; + +import 
nu.marginalia.service.descriptor.ServiceDescriptor; +import nu.marginalia.service.descriptor.ServiceDescriptors; +import nu.marginalia.service.id.ServiceId; + +import java.util.List; + +public class SearchServiceDescriptors { + public static ServiceDescriptors descriptors = new ServiceDescriptors( + List.of(new ServiceDescriptor(ServiceId.Api, 5004), + new ServiceDescriptor(ServiceId.Index, 5021), + new ServiceDescriptor(ServiceId.Search, 5023), + new ServiceDescriptor(ServiceId.Assistant, 5025), + new ServiceDescriptor(ServiceId.Dating, 5070), + new ServiceDescriptor(ServiceId.Explorer, 5071))); +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/HostsFile.java b/common/service-discovery/src/main/java/nu/marginalia/service/descriptor/HostsFile.java similarity index 75% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/HostsFile.java rename to common/service-discovery/src/main/java/nu/marginalia/service/descriptor/HostsFile.java index 1a0341d7..ef46749b 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/HostsFile.java +++ b/common/service-discovery/src/main/java/nu/marginalia/service/descriptor/HostsFile.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.configuration; +package nu.marginalia.service.descriptor; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -13,7 +13,7 @@ import java.util.Map; * * */ public class HostsFile { - private final Map<ServiceDescriptor, String> hostsMap = new HashMap<>(ServiceDescriptor.values().length); + private final Map<String, String> hostsMap = new HashMap<>(); private static final Logger logger = LoggerFactory.getLogger(HostsFile.class); public HostsFile(Path fileName) throws IOException { var lines = Files.readAllLines(fileName); @@ -27,7 +27,7 @@ public class HostsFile { String hostName = parts[1]; try { - hostsMap.put(ServiceDescriptor.byName(descriptorName), hostName); + hostsMap.put(descriptorName, hostName); } catch (IllegalArgumentException ex) { logger.warn("Hosts file contains entry 
for unknown service {}", descriptorName); @@ -36,13 +36,10 @@ public class HostsFile { } public HostsFile() { - for (var sd : ServiceDescriptor.values()) { - hostsMap.put(sd, "localhost"); - } } public String getHost(ServiceDescriptor sd) { - return hostsMap.get(sd); + return hostsMap.getOrDefault(sd.name, sd.name); } } diff --git a/common/service-discovery/src/main/java/nu/marginalia/service/descriptor/ServiceDescriptor.java b/common/service-discovery/src/main/java/nu/marginalia/service/descriptor/ServiceDescriptor.java new file mode 100644 index 00000000..89f3fe22 --- /dev/null +++ b/common/service-discovery/src/main/java/nu/marginalia/service/descriptor/ServiceDescriptor.java @@ -0,0 +1,23 @@ +package nu.marginalia.service.descriptor; + +import nu.marginalia.service.id.ServiceId; + +public class ServiceDescriptor { + public final ServiceId id; + public final String name; + public final int port; + + public ServiceDescriptor(ServiceId id, int port) { + this.id = id; + this.name = id.name; + this.port = port; + } + + public String toString() { + return name; + } + + public String describeService() { + return String.format("%s", name); + } +} diff --git a/common/service-discovery/src/main/java/nu/marginalia/service/descriptor/ServiceDescriptors.java b/common/service-discovery/src/main/java/nu/marginalia/service/descriptor/ServiceDescriptors.java new file mode 100644 index 00000000..7fdbce0d --- /dev/null +++ b/common/service-discovery/src/main/java/nu/marginalia/service/descriptor/ServiceDescriptors.java @@ -0,0 +1,31 @@ +package nu.marginalia.service.descriptor; + +import nu.marginalia.service.id.ServiceId; + +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.Objects; + +public class ServiceDescriptors { + private final Map<ServiceId, ServiceDescriptor> descriptorsAll = new LinkedHashMap<>(); /* keyed by service id; LinkedHashMap keeps registration order for values() */ + + public ServiceDescriptors() { + + } + + public ServiceDescriptors(List<ServiceDescriptor> descriptors) { + descriptors.forEach(d -> descriptorsAll.put(d.id, d)); + } + + 
public ServiceDescriptor[] values() { + return descriptorsAll.values().toArray(ServiceDescriptor[]::new); + } + + public ServiceDescriptor forId(ServiceId id) { + return Objects.requireNonNull(descriptorsAll.get(id), + "No service descriptor defined for " + id + " -- did you forget to " + + "bind(ServiceDescriptors.class).toInstance(SearchServiceDescriptors.descriptors); ?"); + } + +} diff --git a/common/service-discovery/src/main/java/nu/marginalia/service/id/ServiceId.java b/common/service-discovery/src/main/java/nu/marginalia/service/id/ServiceId.java new file mode 100644 index 00000000..92ffb4a7 --- /dev/null +++ b/common/service-discovery/src/main/java/nu/marginalia/service/id/ServiceId.java @@ -0,0 +1,25 @@ +package nu.marginalia.service.id; + +public enum ServiceId { + + Assistant("assistant-service"), + Api("api-service"), + Search("search-service"), + Index("index-service"), + + Dating("dating-service"), + Explorer("explorer-service"), + + Other_Auth("auth"), + Other_Memex("memex"), + + + Other_ResourceStore("resource-store"), + Other_Renderer("renderer"), + Other_PodcastScraper("podcast-scraper"); + + public final String name; + ServiceId(String name) { + this.name = name; + } +} diff --git a/common/service/build.gradle b/common/service/build.gradle new file mode 100644 index 00000000..45f9a740 --- /dev/null +++ b/common/service/build.gradle @@ -0,0 +1,35 @@ +plugins { + id 'java' +} + + +java { + toolchain { + languageVersion.set(JavaLanguageVersion.of(17)) + } +} + +dependencies { + implementation project(':common:service-client') + implementation project(':common:service-discovery') + implementation project(':libraries:misc') + + implementation libs.lombok + annotationProcessor libs.lombok + + implementation libs.spark + implementation libs.guice + implementation libs.rxjava + + implementation libs.bundles.prometheus + implementation libs.bundles.slf4j + implementation libs.bucket4j + + testImplementation libs.bundles.slf4j.test + implementation 
libs.bundles.mariadb + + testImplementation libs.bundles.slf4j.test + testImplementation libs.bundles.junit + testImplementation libs.mockito + +} \ No newline at end of file diff --git a/common/service/src/main/java/nu/marginalia/service/MainClass.java b/common/service/src/main/java/nu/marginalia/service/MainClass.java new file mode 100644 index 00000000..26343581 --- /dev/null +++ b/common/service/src/main/java/nu/marginalia/service/MainClass.java @@ -0,0 +1,62 @@ +package nu.marginalia.service; + +import io.prometheus.client.hotspot.DefaultExports; +import io.reactivex.rxjava3.exceptions.UndeliverableException; +import io.reactivex.rxjava3.plugins.RxJavaPlugins; +import nu.marginalia.service.id.ServiceId; +import nu.marginalia.client.exception.NetworkException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.net.SocketTimeoutException; +import java.net.UnknownHostException; + +public abstract class MainClass { + private final Logger logger = LoggerFactory.getLogger(getClass()); + + public MainClass() { + RxJavaPlugins.setErrorHandler(this::handleError); + } + + protected void handleError(Throwable ex) { + if (ex instanceof UndeliverableException) { + ex = ex.getCause(); + } + + if (ex instanceof SocketTimeoutException) { + logger.warn("SocketTimeoutException"); + } + else if (ex instanceof UnknownHostException) { + logger.warn("UnknownHostException"); + } + else if (ex instanceof NetworkException) { + logger.warn("NetworkException", ex); + } + else { + logger.error("Uncaught exception", ex); + } + } + + + protected static void init(ServiceId id, String... 
args) { + + System.setProperty("log4j2.isThreadContextMapInheritable", "true"); + System.setProperty("isThreadContextMapInheritable", "true"); + System.setProperty("service-name", id.name); + + initJdbc(); + initPrometheus(); + } + + private static void initJdbc() { + // This looks weird, but it's just for running the static block + // in the driver class so that it registers itself + + new org.mariadb.jdbc.Driver(); + } + + private static void initPrometheus() { + DefaultExports.initialize(); + } + +} diff --git a/common/service/src/main/java/nu/marginalia/service/module/ConfigurationModule.java b/common/service/src/main/java/nu/marginalia/service/module/ConfigurationModule.java new file mode 100644 index 00000000..a0d763d0 --- /dev/null +++ b/common/service/src/main/java/nu/marginalia/service/module/ConfigurationModule.java @@ -0,0 +1,35 @@ +package nu.marginalia.service.module; + +import com.google.inject.AbstractModule; +import com.google.inject.Provides; +import com.google.inject.name.Named; +import com.google.inject.name.Names; +import nu.marginalia.service.descriptor.ServiceDescriptors; +import nu.marginalia.service.id.ServiceId; + +import java.util.Objects; + +public class ConfigurationModule extends AbstractModule { + private static final String SERVICE_NAME = System.getProperty("service-name"); + private final ServiceDescriptors descriptors; + private final ServiceId id; + + public ConfigurationModule(ServiceDescriptors descriptors, ServiceId id) { + this.descriptors = descriptors; + this.id = id; + } + + public void configure() { + bind(ServiceDescriptors.class).toInstance(descriptors); + bind(String.class).annotatedWith(Names.named("service-name")).toInstance(Objects.requireNonNull(SERVICE_NAME)); + bind(String.class).annotatedWith(Names.named("service-host")).toInstance(System.getProperty("service-host", "127.0.0.1")); + bind(Integer.class).annotatedWith(Names.named("service-port")).toInstance(descriptors.forId(id).port); + } + + @Provides + 
@Named("metrics-server-port") + public Integer provideMetricsServerPort(@Named("service-port") Integer servicePort) { + return servicePort + 1000; + } + +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/module/DatabaseModule.java b/common/service/src/main/java/nu/marginalia/service/module/DatabaseModule.java similarity index 70% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/module/DatabaseModule.java rename to common/service/src/main/java/nu/marginalia/service/module/DatabaseModule.java index a61a4f4b..d189a708 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/module/DatabaseModule.java +++ b/common/service/src/main/java/nu/marginalia/service/module/DatabaseModule.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.configuration.module; +package nu.marginalia.service.module; import com.google.inject.AbstractModule; import com.google.inject.Provides; @@ -6,17 +6,15 @@ import com.google.inject.Singleton; import com.zaxxer.hikari.HikariConfig; import com.zaxxer.hikari.HikariDataSource; import lombok.SneakyThrows; -import nu.marginalia.wmsa.configuration.WmsaHome; -import org.h2.tools.RunScript; import org.mariadb.jdbc.Driver; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.FileInputStream; import java.io.IOException; -import java.io.InputStreamReader; import java.nio.file.Files; import java.nio.file.Path; +import java.util.Optional; import java.util.Properties; public class DatabaseModule extends AbstractModule { @@ -35,7 +33,7 @@ public class DatabaseModule extends AbstractModule { } private Properties loadDbProperties() { - Path propDir = WmsaHome.getHomePath().resolve("conf/db.properties"); + Path propDir = getHomePath().resolve("conf/db.properties"); if (!Files.isRegularFile(propDir)) { throw new IllegalStateException("Database properties file " + propDir + " does not exist"); } @@ -56,22 +54,27 @@ public class DatabaseModule extends AbstractModule { } + public 
static Path getHomePath() { + var retStr = Optional.ofNullable(System.getenv("WMSA_HOME")).orElse("/var/lib/wmsa"); + + var ret = Path.of(retStr); + if (!Files.isDirectory(ret)) { + throw new IllegalStateException("Could not find WMSA_HOME, either set environment variable or ensure /var/lib/wmsa exists"); + } + return ret; + } + + @SneakyThrows @Singleton @Provides public HikariDataSource provideConnection() { - if (Boolean.getBoolean("data-store-h2")) { - return getH2(); - } - else { - return getMariaDB(); - } - + return getMariaDB(); } @SneakyThrows private HikariDataSource getMariaDB() { - var connStr = dbProperties.getProperty(DB_CONN_KEY); + var connStr = System.getProperty("db.overrideJdbc", dbProperties.getProperty(DB_CONN_KEY)); try { HikariConfig config = new HikariConfig(); @@ -94,22 +97,4 @@ public class DatabaseModule extends AbstractModule { } } - - @SneakyThrows - private HikariDataSource getH2() { - HikariConfig config = new HikariConfig(); - config.setJdbcUrl("jdbc:h2:~/wmsa-db"); - config.setUsername("wmsa"); - config.setPassword(""); - - var ds = new HikariDataSource(config); - - try (var stream = ClassLoader.getSystemResourceAsStream("sql/data-store-init.sql")) { - RunScript.execute(ds.getConnection(), new InputStreamReader(stream)); - } - try (var stream = ClassLoader.getSystemResourceAsStream("sql/edge-crawler-cache.sql")) { - RunScript.execute(ds.getConnection(), new InputStreamReader(stream)); - } - return ds; - } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/module/LoggerConfiguration.java b/common/service/src/main/java/nu/marginalia/service/module/LoggerConfiguration.java similarity index 83% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/module/LoggerConfiguration.java rename to common/service/src/main/java/nu/marginalia/service/module/LoggerConfiguration.java index 9557b433..8f651da4 100644 --- 
a/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/module/LoggerConfiguration.java +++ b/common/service/src/main/java/nu/marginalia/service/module/LoggerConfiguration.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.configuration.module; +package nu.marginalia.service.module; import com.google.inject.name.Named; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/module/MetricsPortProvider.java b/common/service/src/main/java/nu/marginalia/service/module/MetricsPortProvider.java similarity index 89% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/module/MetricsPortProvider.java rename to common/service/src/main/java/nu/marginalia/service/module/MetricsPortProvider.java index cca5bbfc..1f75fa9f 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/module/MetricsPortProvider.java +++ b/common/service/src/main/java/nu/marginalia/service/module/MetricsPortProvider.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.configuration.module; +package nu.marginalia.service.module; import com.google.inject.name.Named; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/server/Initialization.java b/common/service/src/main/java/nu/marginalia/service/server/Initialization.java similarity index 80% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/server/Initialization.java rename to common/service/src/main/java/nu/marginalia/service/server/Initialization.java index 6b146672..c7a857ea 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/server/Initialization.java +++ b/common/service/src/main/java/nu/marginalia/service/server/Initialization.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.configuration.server; +package nu.marginalia.service.server; import com.google.inject.Singleton; import lombok.SneakyThrows; @@ -22,11 +22,6 @@ public class Initialization { initialized = true; notifyAll(); } - - if (Boolean.getBoolean("go-no-go")) { - 
logger.info("Self-test OK"); - System.exit(0); - } } public boolean isReady() { @@ -41,7 +36,7 @@ public class Initialization { while (!initialized) { wait(); } - return initialized; + return true; } } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/server/MetricsServer.java b/common/service/src/main/java/nu/marginalia/service/server/MetricsServer.java similarity index 93% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/server/MetricsServer.java rename to common/service/src/main/java/nu/marginalia/service/server/MetricsServer.java index c8da5e97..1822b465 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/server/MetricsServer.java +++ b/common/service/src/main/java/nu/marginalia/service/server/MetricsServer.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.configuration.server; +package nu.marginalia.service.server; import com.google.inject.Inject; import com.google.inject.name.Named; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/server/RateLimiter.java b/common/service/src/main/java/nu/marginalia/service/server/RateLimiter.java similarity index 86% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/server/RateLimiter.java rename to common/service/src/main/java/nu/marginalia/service/server/RateLimiter.java index 06a6131a..f9de1cb5 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/server/RateLimiter.java +++ b/common/service/src/main/java/nu/marginalia/service/server/RateLimiter.java @@ -1,10 +1,10 @@ -package nu.marginalia.wmsa.configuration.server; +package nu.marginalia.service.server; import io.github.bucket4j.Bandwidth; import io.github.bucket4j.Bucket; -import io.github.bucket4j.Bucket4j; import io.github.bucket4j.Refill; import io.reactivex.rxjava3.schedulers.Schedulers; +import nu.marginalia.client.Context; import java.time.Duration; import java.util.Map; @@ -49,13 +49,11 @@ public class RateLimiter { } public 
boolean isAllowed(Context ctx) { - final Optional maybeIp = ctx.getIpHash(); - - if (maybeIp.isEmpty()) { // Internal server->server request + if (!ctx.isPublic()) { // Internal server->server request return true; } - return bucketMap.computeIfAbsent(maybeIp.get(), + return bucketMap.computeIfAbsent(ctx.getContextId(), (ip) -> createBucket()).tryConsume(1); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/server/Service.java b/common/service/src/main/java/nu/marginalia/service/server/Service.java similarity index 88% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/server/Service.java rename to common/service/src/main/java/nu/marginalia/service/server/Service.java index 9674611f..e5c04877 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/server/Service.java +++ b/common/service/src/main/java/nu/marginalia/service/server/Service.java @@ -1,10 +1,9 @@ -package nu.marginalia.wmsa.configuration.server; +package nu.marginalia.service.server; import com.google.common.base.Strings; import io.prometheus.client.Counter; -import nu.marginalia.wmsa.client.exception.MessagingException; -import org.apache.http.HttpStatus; -import org.apache.logging.log4j.ThreadContext; +import nu.marginalia.client.Context; +import nu.marginalia.client.exception.MessagingException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.slf4j.Marker; @@ -17,6 +16,8 @@ import java.util.Optional; public class Service { private final Logger logger = LoggerFactory.getLogger(getClass()); + + // Marker for filtering out sensitive content from the persistent logs private final Marker httpMarker = MarkerFactory.getMarker("HTTP"); private final Initialization initialization; @@ -53,8 +54,8 @@ public class Service { configureStaticFiles.run(); - Spark.before(this::filterPublicRequests); Spark.before(this::auditRequestIn); + Spark.before(this::filterPublicRequests); Spark.after(this::auditRequestOut); 
Spark.exception(MessagingException.class, this::handleException); @@ -86,14 +87,14 @@ public class Service { if (!request.pathInfo().startsWith("/public/")) { logger.warn(httpMarker, "External connection to internal API: {} -> {} {}", context, request.requestMethod(), request.pathInfo()); - Spark.halt(HttpStatus.SC_FORBIDDEN); + Spark.halt(403); } String url = request.pathInfo(); if (request.queryString() != null) { url = url + "?" + request.queryString(); } - logger.info(httpMarker, "PUBLIC {}: {} {}", Context.fromRequest(request).getIpHash().orElse("?"), request.requestMethod(), url); + logger.info(httpMarker, "PUBLIC {}: {} {}", Context.fromRequest(request).getContextId(), request.requestMethod(), url); } private Object isInitialized(Request request, Response response) { @@ -101,7 +102,7 @@ public class Service { return "ok"; } else { - response.status(HttpStatus.SC_FAILED_DEPENDENCY); + response.status(424); return "bad"; } } @@ -115,21 +116,21 @@ public class Service { return "ok"; } else { - response.status(HttpStatus.SC_FAILED_DEPENDENCY); + response.status(424); return "bad"; } } private void auditRequestIn(Request request, Response response) { - request_counter.labels(serviceName).inc(); - // Paint context - if (!Strings.isNullOrEmpty(request.headers(Context.CONTEXT_HEADER))) { - Context.fromRequest(request); - } + paintThreadName(request, "req:"); + + request_counter.labels(serviceName).inc(); } + private void auditRequestOut(Request request, Response response) { - ThreadContext.clearMap(); + + paintThreadName(request, "rsp:"); if (response.status() < 400) { request_counter_good.labels(serviceName).inc(); @@ -143,6 +144,11 @@ public class Service { } } + private void paintThreadName(Request request, String prefix) { + var ctx = Context.fromRequest(request); + Thread.currentThread().setName(prefix + ctx.getContextId()); + } + private void handleException(Exception ex, Request request, Response response) { request_counter_err.labels(serviceName).inc(); if 
(ex instanceof MessagingException) { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/resource_store/StaticResources.java b/common/service/src/main/java/nu/marginalia/service/server/StaticResources.java similarity index 97% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/resource_store/StaticResources.java rename to common/service/src/main/java/nu/marginalia/service/server/StaticResources.java index a3c2f756..332b9a55 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/resource_store/StaticResources.java +++ b/common/service/src/main/java/nu/marginalia/service/server/StaticResources.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.resource_store; +package nu.marginalia.service.server; import lombok.SneakyThrows; import spark.Request; diff --git a/common/service/src/main/resources/log4j2.properties b/common/service/src/main/resources/log4j2.properties new file mode 100644 index 00000000..66d688b0 --- /dev/null +++ b/common/service/src/main/resources/log4j2.properties @@ -0,0 +1,28 @@ +log4j2.isThreadContextMapInheritable=true +status = info +appender.console.type = Console +appender.console.name = LogToConsole +appender.console.layout.type = PatternLayout +appender.console.layout.pattern = %d{HH:mm:ss,SSS} %style{%-8markerSimpleName}{FG_Cyan} %highlight{%-5level}{FATAL=red, ERROR=red, WARN=yellow} %-24t %-20c{1} -- %msg{nolookups}%n +appender.rolling.type = RollingFile +appender.rolling.name = RollingFile +appender.rolling.fileName = /var/log/wmsa/wmsa-${sys:service-name}.log +appender.rolling.filePattern = /var/log/wmsa/wmsa-${sys:service-name}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz +appender.rolling.layout.pattern = %-5level %d{yyyy-MM-dd HH:mm:ss,SSS} %-20t %-20c{1}: %msg{nolookups}%n +appender.rolling.layout.type = PatternLayout +appender.rolling.policies.type = Policies +appender.rolling.policies.size.type = SizeBasedTriggeringPolicy +appender.rolling.policies.size.size=10MB +appender.rolling.strategy.type = DefaultRolloverStrategy 
+appender.rolling.strategy.max = 10 +appender.rolling.filter.query.type = MarkerFilter +appender.rolling.filter.query.onMismatch=ACCEPT +appender.rolling.filter.query.onMatch=DENY +appender.rolling.filter.query.marker=QUERY +appender.rolling.filter.http.type = MarkerFilter +appender.rolling.filter.http.onMismatch=ACCEPT +appender.rolling.filter.http.onMatch=DENY +appender.rolling.filter.http.marker=HTTP +rootLogger.level = info +rootLogger.appenderRef.console.ref = LogToConsole +rootLogger.appenderRef.rolling.ref = RollingFile \ No newline at end of file diff --git a/crawl/common/build.gradle b/crawl/common/build.gradle new file mode 100644 index 00000000..90b18eab --- /dev/null +++ b/crawl/common/build.gradle @@ -0,0 +1,55 @@ +plugins { + id 'java' + id "io.freefair.lombok" version "5.3.3.3" + + id 'jvm-test-suite' +} + +java { + toolchain { + languageVersion.set(JavaLanguageVersion.of(17)) + } +} + +dependencies { + implementation project(':third-party') + implementation project(':protocol') + implementation project(':common:model') + implementation project(':common:config') + implementation project(':libraries:misc') + implementation project(':crawl:crawling-model') + + implementation libs.notnull + implementation libs.lombok + annotationProcessor libs.lombok + + implementation libs.bundles.gson + implementation libs.rxjava + implementation libs.bundles.slf4j + testImplementation libs.bundles.slf4j.test + + implementation libs.guava + implementation libs.guice + + implementation libs.snakeyaml + implementation libs.jsoup + implementation libs.zstd + + implementation libs.commons.net + + implementation libs.opencsv + + testImplementation libs.bundles.slf4j.test + testImplementation libs.bundles.junit + testImplementation libs.mockito +} + +test { + useJUnitPlatform() +} + +task fastTests(type: Test) { + useJUnitPlatform { + excludeTags "slow" + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/AbortMonitor.java 
b/crawl/common/src/main/java/nu/marginalia/crawling/common/AbortMonitor.java similarity index 96% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/AbortMonitor.java rename to crawl/common/src/main/java/nu/marginalia/crawling/common/AbortMonitor.java index e76ed65d..c23ab5db 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/AbortMonitor.java +++ b/crawl/common/src/main/java/nu/marginalia/crawling/common/AbortMonitor.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.crawling; +package nu.marginalia.crawling.common; import lombok.SneakyThrows; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/TaskStats.java b/crawl/common/src/main/java/nu/marginalia/crawling/common/TaskStats.java similarity index 94% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/TaskStats.java rename to crawl/common/src/main/java/nu/marginalia/crawling/common/TaskStats.java index 7c0384bb..4a75bcd4 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/TaskStats.java +++ b/crawl/common/src/main/java/nu/marginalia/crawling/common/TaskStats.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.converting; +package nu.marginalia.crawling.common; public class TaskStats { private final long[] taskTimes; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/WorkLog.java b/crawl/common/src/main/java/nu/marginalia/crawling/common/WorkLog.java similarity index 91% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/WorkLog.java rename to crawl/common/src/main/java/nu/marginalia/crawling/common/WorkLog.java index fb5bf5b2..092ca66a 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/WorkLog.java +++ b/crawl/common/src/main/java/nu/marginalia/crawling/common/WorkLog.java @@ -1,9 +1,10 @@ -package nu.marginalia.wmsa.edge.crawling; +package nu.marginalia.crawling.common; import com.google.errorprone.annotations.MustBeClosed; 
-import nu.marginalia.wmsa.edge.crawling.model.CrawlLogEntry; +import nu.marginalia.crawling.model.CrawlLogEntry; import org.apache.logging.log4j.util.Strings; +import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.IOException; import java.nio.charset.StandardCharsets; @@ -27,9 +28,9 @@ public class WorkLog implements AutoCloseable { writeLogEntry("# Starting WorkLog @ " + LocalDateTime.now()); } - public static void readLog(Path logFile, Consumer entryConsumer) { + public static void readLog(Path logFile, Consumer entryConsumer) throws FileNotFoundException { if (!Files.exists(logFile)) { - return; + throw new FileNotFoundException("Log file not found " + logFile); } try (var entries = streamLog(logFile)) { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/blocklist/GeoIpBlocklist.java b/crawl/common/src/main/java/nu/marginalia/crawling/common/blocklist/GeoIpBlocklist.java similarity index 94% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/blocklist/GeoIpBlocklist.java rename to crawl/common/src/main/java/nu/marginalia/crawling/common/blocklist/GeoIpBlocklist.java index eca3b73e..7fe3e9cc 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/blocklist/GeoIpBlocklist.java +++ b/crawl/common/src/main/java/nu/marginalia/crawling/common/blocklist/GeoIpBlocklist.java @@ -1,11 +1,11 @@ -package nu.marginalia.wmsa.edge.crawling.blocklist; +package nu.marginalia.crawling.common.blocklist; import com.google.inject.Singleton; import com.opencsv.CSVReader; import com.opencsv.exceptions.CsvValidationException; import lombok.AllArgsConstructor; -import nu.marginalia.wmsa.configuration.WmsaHome; -import nu.marginalia.wmsa.edge.model.EdgeDomain; +import nu.marginalia.WmsaHome; +import nu.marginalia.model.EdgeDomain; import org.slf4j.Logger; import org.slf4j.LoggerFactory; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/blocklist/InetAddressCache.java 
b/crawl/common/src/main/java/nu/marginalia/crawling/common/blocklist/InetAddressCache.java similarity index 86% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/blocklist/InetAddressCache.java rename to crawl/common/src/main/java/nu/marginalia/crawling/common/blocklist/InetAddressCache.java index 0bef701d..b1722c42 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/blocklist/InetAddressCache.java +++ b/crawl/common/src/main/java/nu/marginalia/crawling/common/blocklist/InetAddressCache.java @@ -1,8 +1,8 @@ -package nu.marginalia.wmsa.edge.crawling.blocklist; +package nu.marginalia.crawling.common.blocklist; import com.google.common.cache.Cache; import com.google.common.cache.CacheBuilder; -import nu.marginalia.wmsa.edge.model.EdgeDomain; +import nu.marginalia.model.EdgeDomain; import java.net.InetAddress; import java.util.concurrent.ExecutionException; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/blocklist/IpBlockList.java b/crawl/common/src/main/java/nu/marginalia/crawling/common/blocklist/IpBlockList.java similarity index 96% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/blocklist/IpBlockList.java rename to crawl/common/src/main/java/nu/marginalia/crawling/common/blocklist/IpBlockList.java index 27c8f6dd..c5a42137 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/blocklist/IpBlockList.java +++ b/crawl/common/src/main/java/nu/marginalia/crawling/common/blocklist/IpBlockList.java @@ -1,8 +1,8 @@ -package nu.marginalia.wmsa.edge.crawling.blocklist; +package nu.marginalia.crawling.common.blocklist; import com.google.inject.Inject; import com.google.inject.Singleton; -import nu.marginalia.wmsa.edge.model.EdgeDomain; +import nu.marginalia.model.EdgeDomain; import org.apache.commons.net.util.SubnetUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; diff --git 
a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/blocklist/UrlBlocklist.java b/crawl/common/src/main/java/nu/marginalia/crawling/common/blocklist/UrlBlocklist.java similarity index 95% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/blocklist/UrlBlocklist.java rename to crawl/common/src/main/java/nu/marginalia/crawling/common/blocklist/UrlBlocklist.java index 1a00e161..b849b246 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/blocklist/UrlBlocklist.java +++ b/crawl/common/src/main/java/nu/marginalia/crawling/common/blocklist/UrlBlocklist.java @@ -1,7 +1,7 @@ -package nu.marginalia.wmsa.edge.crawling.blocklist; +package nu.marginalia.crawling.common.blocklist; -import nu.marginalia.util.gregex.GuardedRegexFactory; -import nu.marginalia.wmsa.edge.model.EdgeUrl; +import nu.marginalia.model.EdgeUrl; +import nu.marginalia.gregex.GuardedRegexFactory; import java.util.ArrayList; import java.util.List; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/LinkParser.java b/crawl/common/src/main/java/nu/marginalia/crawling/common/link/LinkParser.java similarity index 98% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/LinkParser.java rename to crawl/common/src/main/java/nu/marginalia/crawling/common/link/LinkParser.java index 45611e08..024a47b3 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/LinkParser.java +++ b/crawl/common/src/main/java/nu/marginalia/crawling/common/link/LinkParser.java @@ -1,9 +1,10 @@ -package nu.marginalia.wmsa.edge.converting.processor.logic; +package nu.marginalia.crawling.common.link; import com.google.common.base.CharMatcher; import com.google.common.base.Strings; import lombok.SneakyThrows; -import nu.marginalia.wmsa.edge.model.EdgeUrl; +import nu.marginalia.model.EdgeUrl; +import nu.marginalia.util.QueryParams; import org.jetbrains.annotations.Contract; import 
org.jsoup.nodes.Document; import org.jsoup.nodes.Element; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlPlanLoader.java b/crawl/common/src/main/java/nu/marginalia/crawling/common/plan/CrawlPlanLoader.java similarity index 85% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlPlanLoader.java rename to crawl/common/src/main/java/nu/marginalia/crawling/common/plan/CrawlPlanLoader.java index f4060aff..239ed7ce 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlPlanLoader.java +++ b/crawl/common/src/main/java/nu/marginalia/crawling/common/plan/CrawlPlanLoader.java @@ -1,6 +1,5 @@ -package nu.marginalia.wmsa.edge.crawling; +package nu.marginalia.crawling.common.plan; -import nu.marginalia.wmsa.edge.model.EdgeCrawlPlan; import org.yaml.snakeyaml.Yaml; import java.io.FileReader; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlerSpecificationLoader.java b/crawl/common/src/main/java/nu/marginalia/crawling/common/plan/CrawlerSpecificationLoader.java similarity index 60% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlerSpecificationLoader.java rename to crawl/common/src/main/java/nu/marginalia/crawling/common/plan/CrawlerSpecificationLoader.java index 59ad4155..619a52eb 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlerSpecificationLoader.java +++ b/crawl/common/src/main/java/nu/marginalia/crawling/common/plan/CrawlerSpecificationLoader.java @@ -1,9 +1,12 @@ -package nu.marginalia.wmsa.edge.crawling; +package nu.marginalia.crawling.common.plan; import com.github.luben.zstd.ZstdInputStream; import com.google.gson.Gson; -import nu.marginalia.wmsa.client.GsonFactory; -import nu.marginalia.wmsa.edge.crawling.model.CrawlingSpecification; +import com.google.gson.JsonStreamParser; +import com.google.gson.stream.JsonReader; +import nu.marginalia.crawling.common.AbortMonitor; +import 
nu.marginalia.crawling.model.CrawlingSpecification; +import nu.marginalia.model.gson.GsonFactory; import org.apache.logging.log4j.util.Strings; import java.io.BufferedReader; @@ -18,15 +21,9 @@ public class CrawlerSpecificationLoader { public static void readInputSpec(Path inputSpec, Consumer consumer) { try (var inputStream = new BufferedReader(new InputStreamReader(new ZstdInputStream(new FileInputStream(inputSpec.toFile()))))) { - - for (;;) { - var line = inputStream.readLine(); - if (line == null || !AbortMonitor.getInstance().isAlive()) - break; - - if (Strings.isNotBlank(line)) { - consumer.accept(gson.fromJson(line, CrawlingSpecification.class)); - } + var parser = new JsonStreamParser(inputStream); + while (parser.hasNext()) { + consumer.accept(gson.fromJson(parser.next(), CrawlingSpecification.class)); } } catch (IOException e) { e.printStackTrace(); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeCrawlPlan.java b/crawl/common/src/main/java/nu/marginalia/crawling/common/plan/EdgeCrawlPlan.java similarity index 78% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeCrawlPlan.java rename to crawl/common/src/main/java/nu/marginalia/crawling/common/plan/EdgeCrawlPlan.java index 037c12f3..4b025a40 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeCrawlPlan.java +++ b/crawl/common/src/main/java/nu/marginalia/crawling/common/plan/EdgeCrawlPlan.java @@ -1,17 +1,19 @@ -package nu.marginalia.wmsa.edge.model; +package nu.marginalia.crawling.common.plan; import com.google.errorprone.annotations.MustBeClosed; import lombok.AllArgsConstructor; import lombok.NoArgsConstructor; import lombok.ToString; -import nu.marginalia.wmsa.edge.crawling.CrawledDomainReader; -import nu.marginalia.wmsa.edge.crawling.CrawlerSpecificationLoader; -import nu.marginalia.wmsa.edge.crawling.WorkLog; -import nu.marginalia.wmsa.edge.crawling.model.CrawlLogEntry; -import 
nu.marginalia.wmsa.edge.crawling.model.CrawledDomain; -import nu.marginalia.wmsa.edge.crawling.model.CrawlingSpecification; +import nu.marginalia.crawling.common.WorkLog; +import nu.marginalia.crawling.io.CrawledDomainReader; +import nu.marginalia.crawling.model.CrawlLogEntry; +import nu.marginalia.crawling.model.CrawledDomain; +import nu.marginalia.crawling.model.CrawlingSpecification; import org.jetbrains.annotations.NotNull; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import java.io.FileNotFoundException; import java.io.IOException; import java.nio.file.Path; import java.util.Iterator; @@ -21,10 +23,13 @@ import java.util.stream.Stream; @AllArgsConstructor @NoArgsConstructor @ToString public class EdgeCrawlPlan { + private final Logger logger = LoggerFactory.getLogger(getClass()); public String jobSpec; public WorkDir crawl; public WorkDir process; + private static String rootDirRewrite = System.getProperty("crawl.rootDirRewrite"); + public Path getJobSpec() { return Path.of(jobSpec); } @@ -35,13 +40,22 @@ public class EdgeCrawlPlan { public String logName; public Path getDir() { - return Path.of(dir); + return Path.of(rewrite(dir)); } public Path getLogFile() { - return Path.of(dir).resolve(logName); + return Path.of(rewrite(dir)).resolve(logName); } } + private static String rewrite(String dir) { + if (rootDirRewrite == null) { + return dir; + } + String[] parts = rootDirRewrite.split(":"); + + return dir.replace(parts[0], parts[1]); + } + public Path getCrawledFilePath(String fileName) { String sp1 = fileName.substring(0, 2); String sp2 = fileName.substring(2, 4); @@ -66,10 +80,10 @@ public class EdgeCrawlPlan { CrawlerSpecificationLoader.readInputSpec(getJobSpec(), consumer); } - public void forEachCrawlingLogEntry(Consumer consumer) { + public void forEachCrawlingLogEntry(Consumer consumer) throws FileNotFoundException { WorkLog.readLog(this.crawl.getLogFile(), consumer); } - public void forEachProcessingLogEntry(Consumer consumer) { + 
public void forEachProcessingLogEntry(Consumer consumer) throws FileNotFoundException { WorkLog.readLog(this.process.getLogFile(), consumer); } @@ -84,6 +98,8 @@ public class EdgeCrawlPlan { .forEach(consumer); } catch (IOException ex) { + logger.warn("Failed to read domains", ex); + throw new RuntimeException(ex); } } @@ -99,6 +115,8 @@ public class EdgeCrawlPlan { .forEach(consumer); } catch (IOException ex) { + logger.warn("Failed to read domains", ex); + throw new RuntimeException(ex); } } diff --git a/marginalia_nu/src/e2e/resources/log4j2.properties b/crawl/common/src/main/resources/log4j2.properties similarity index 57% rename from marginalia_nu/src/e2e/resources/log4j2.properties rename to crawl/common/src/main/resources/log4j2.properties index 9c2dbefd..f6768d3e 100644 --- a/marginalia_nu/src/e2e/resources/log4j2.properties +++ b/crawl/common/src/main/resources/log4j2.properties @@ -1,15 +1,10 @@ - +log4j2.isThreadContextMapInheritable=true status = info - appender.console.type = Console appender.console.name = LogToConsole appender.console.layout.type = PatternLayout -appender.console.layout.pattern = %highlight{%-5level}{FATAL=red, ERROR=red, WARN=yellow} %c{1}- %msg%n - -logger.console.name = nu.marginalia -logger.console.level = debug -logger.console.additivity = false -logger.console.appenderRef.rolling.ref = LogToConsole - +appender.console.layout.pattern = %highlight{%-5level}{FATAL=red, ERROR=red, WARN=yellow} %c{1}- %msg{nolookups}%n +appender.console.filter.http.type = MarkerFilter rootLogger.level = info rootLogger.appenderRef.console.ref = LogToConsole +#rootLogger.appenderRef.http.ref = LogHttpTraffic \ No newline at end of file diff --git a/crawl/converting-model/build.gradle b/crawl/converting-model/build.gradle new file mode 100644 index 00000000..4b323894 --- /dev/null +++ b/crawl/converting-model/build.gradle @@ -0,0 +1,41 @@ +plugins { + id 'java' + id "io.freefair.lombok" version "5.3.3.3" + + id 'jvm-test-suite' +} + +java { + 
toolchain { + languageVersion.set(JavaLanguageVersion.of(17)) + } +} +dependencies { + implementation project(':third-party') + implementation project(':protocol') + implementation project(':common:model') + implementation project(':api:index-api') + implementation project(':common:service-discovery') + implementation project(':common:service-client') + implementation project(':libraries:language-processing') + + implementation libs.lombok + annotationProcessor libs.lombok + implementation libs.bundles.slf4j + + implementation libs.notnull + + testImplementation libs.bundles.slf4j.test + testImplementation libs.bundles.junit + testImplementation libs.mockito +} + +test { + useJUnitPlatform() +} + +task fastTests(type: Test) { + useJUnitPlatform { + excludeTags "slow" + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/interpreter/Instruction.java b/crawl/converting-model/src/main/java/nu/marginalia/converting/instruction/Instruction.java similarity index 68% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/interpreter/Instruction.java rename to crawl/converting-model/src/main/java/nu/marginalia/converting/instruction/Instruction.java index 7f40edf6..4964c9b1 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/interpreter/Instruction.java +++ b/crawl/converting-model/src/main/java/nu/marginalia/converting/instruction/Instruction.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.converting.interpreter; +package nu.marginalia.converting.instruction; public interface Instruction { void apply(Interpreter interpreter); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/interpreter/InstructionTag.java b/crawl/converting-model/src/main/java/nu/marginalia/converting/instruction/InstructionTag.java similarity index 80% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/interpreter/InstructionTag.java rename to 
crawl/converting-model/src/main/java/nu/marginalia/converting/instruction/InstructionTag.java index 398ad430..9b03794b 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/interpreter/InstructionTag.java +++ b/crawl/converting-model/src/main/java/nu/marginalia/converting/instruction/InstructionTag.java @@ -1,6 +1,6 @@ -package nu.marginalia.wmsa.edge.converting.interpreter; +package nu.marginalia.converting.instruction; -import nu.marginalia.wmsa.edge.converting.interpreter.instruction.*; +import nu.marginalia.converting.instruction.instructions.*; public enum InstructionTag { diff --git a/crawl/converting-model/src/main/java/nu/marginalia/converting/instruction/Interpreter.java b/crawl/converting-model/src/main/java/nu/marginalia/converting/instruction/Interpreter.java new file mode 100644 index 00000000..e4efa9b9 --- /dev/null +++ b/crawl/converting-model/src/main/java/nu/marginalia/converting/instruction/Interpreter.java @@ -0,0 +1,25 @@ +package nu.marginalia.converting.instruction; + +import nu.marginalia.model.EdgeDomain; +import nu.marginalia.model.EdgeUrl; +import nu.marginalia.model.crawl.EdgeDomainIndexingState; +import nu.marginalia.model.idx.EdgePageDocumentsMetadata; +import nu.marginalia.model.crawl.DocumentKeywords; +import nu.marginalia.converting.instruction.instructions.DomainLink; +import nu.marginalia.converting.instruction.instructions.LoadProcessedDocument; +import nu.marginalia.converting.instruction.instructions.LoadProcessedDocumentWithError; + +public interface Interpreter { + void loadUrl(EdgeUrl[] url); + void loadDomain(EdgeDomain[] domain); + void loadRssFeed(EdgeUrl[] rssFeed); + void loadDomainLink(DomainLink[] links); + + void loadProcessedDomain(EdgeDomain domain, EdgeDomainIndexingState state, String ip); + void loadProcessedDocument(LoadProcessedDocument loadProcessedDocument); + void loadProcessedDocumentWithError(LoadProcessedDocumentWithError loadProcessedDocumentWithError); + + void loadKeywords(EdgeUrl 
url, EdgePageDocumentsMetadata metadata, DocumentKeywords words); + + void loadDomainRedirect(DomainLink link); +} diff --git a/crawl/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/DomainLink.java b/crawl/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/DomainLink.java new file mode 100644 index 00000000..c33f9892 --- /dev/null +++ b/crawl/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/DomainLink.java @@ -0,0 +1,6 @@ +package nu.marginalia.converting.instruction.instructions; + +import nu.marginalia.model.EdgeDomain; + +public record DomainLink(EdgeDomain from, EdgeDomain to) { +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/interpreter/instruction/LoadDomain.java b/crawl/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/LoadDomain.java similarity index 61% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/interpreter/instruction/LoadDomain.java rename to crawl/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/LoadDomain.java index 7cf88b06..f1f361a1 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/interpreter/instruction/LoadDomain.java +++ b/crawl/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/LoadDomain.java @@ -1,9 +1,9 @@ -package nu.marginalia.wmsa.edge.converting.interpreter.instruction; +package nu.marginalia.converting.instruction.instructions; -import nu.marginalia.wmsa.edge.converting.interpreter.Instruction; -import nu.marginalia.wmsa.edge.converting.interpreter.InstructionTag; -import nu.marginalia.wmsa.edge.converting.interpreter.Interpreter; -import nu.marginalia.wmsa.edge.model.EdgeDomain; +import nu.marginalia.converting.instruction.Instruction; +import nu.marginalia.converting.instruction.InstructionTag; +import nu.marginalia.converting.instruction.Interpreter; +import 
nu.marginalia.model.EdgeDomain; import java.util.Arrays; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/interpreter/instruction/LoadDomainLink.java b/crawl/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/LoadDomainLink.java similarity index 65% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/interpreter/instruction/LoadDomainLink.java rename to crawl/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/LoadDomainLink.java index 2d302ddf..9a5b85f8 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/interpreter/instruction/LoadDomainLink.java +++ b/crawl/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/LoadDomainLink.java @@ -1,8 +1,8 @@ -package nu.marginalia.wmsa.edge.converting.interpreter.instruction; +package nu.marginalia.converting.instruction.instructions; -import nu.marginalia.wmsa.edge.converting.interpreter.Instruction; -import nu.marginalia.wmsa.edge.converting.interpreter.InstructionTag; -import nu.marginalia.wmsa.edge.converting.interpreter.Interpreter; +import nu.marginalia.converting.instruction.Instruction; +import nu.marginalia.converting.instruction.InstructionTag; +import nu.marginalia.converting.instruction.Interpreter; import java.util.Arrays; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/interpreter/instruction/LoadDomainRedirect.java b/crawl/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/LoadDomainRedirect.java similarity index 63% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/interpreter/instruction/LoadDomainRedirect.java rename to crawl/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/LoadDomainRedirect.java index 452d990e..5bd357ab 100644 --- 
a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/interpreter/instruction/LoadDomainRedirect.java +++ b/crawl/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/LoadDomainRedirect.java @@ -1,8 +1,8 @@ -package nu.marginalia.wmsa.edge.converting.interpreter.instruction; +package nu.marginalia.converting.instruction.instructions; -import nu.marginalia.wmsa.edge.converting.interpreter.Instruction; -import nu.marginalia.wmsa.edge.converting.interpreter.InstructionTag; -import nu.marginalia.wmsa.edge.converting.interpreter.Interpreter; +import nu.marginalia.converting.instruction.Instruction; +import nu.marginalia.converting.instruction.InstructionTag; +import nu.marginalia.converting.instruction.Interpreter; public record LoadDomainRedirect(DomainLink links) implements Instruction { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/interpreter/instruction/LoadKeywords.java b/crawl/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/LoadKeywords.java similarity index 57% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/interpreter/instruction/LoadKeywords.java rename to crawl/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/LoadKeywords.java index 106f02b7..9ff24a5a 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/interpreter/instruction/LoadKeywords.java +++ b/crawl/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/LoadKeywords.java @@ -1,10 +1,11 @@ -package nu.marginalia.wmsa.edge.converting.interpreter.instruction; +package nu.marginalia.converting.instruction.instructions; -import nu.marginalia.wmsa.edge.converting.interpreter.Instruction; -import nu.marginalia.wmsa.edge.converting.interpreter.InstructionTag; -import nu.marginalia.wmsa.edge.converting.interpreter.Interpreter; -import nu.marginalia.wmsa.edge.index.model.EdgePageDocumentsMetadata; -import 
nu.marginalia.wmsa.edge.model.EdgeUrl; +import nu.marginalia.model.crawl.DocumentKeywords; +import nu.marginalia.model.idx.EdgePageDocumentsMetadata; +import nu.marginalia.converting.instruction.Instruction; +import nu.marginalia.converting.instruction.InstructionTag; +import nu.marginalia.converting.instruction.Interpreter; +import nu.marginalia.model.EdgeUrl; public record LoadKeywords(EdgeUrl url, EdgePageDocumentsMetadata metadata, DocumentKeywords words) implements Instruction { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/interpreter/instruction/LoadProcessedDocument.java b/crawl/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/LoadProcessedDocument.java similarity index 64% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/interpreter/instruction/LoadProcessedDocument.java rename to crawl/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/LoadProcessedDocument.java index 3f65f7af..6c56a100 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/interpreter/instruction/LoadProcessedDocument.java +++ b/crawl/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/LoadProcessedDocument.java @@ -1,13 +1,12 @@ -package nu.marginalia.wmsa.edge.converting.interpreter.instruction; +package nu.marginalia.converting.instruction.instructions; -import nu.marginalia.wmsa.edge.converting.interpreter.Instruction; -import nu.marginalia.wmsa.edge.converting.interpreter.InstructionTag; -import nu.marginalia.wmsa.edge.converting.interpreter.Interpreter; -import nu.marginalia.wmsa.edge.model.EdgeUrl; -import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard; -import nu.marginalia.wmsa.edge.model.crawl.EdgeUrlState; - -import javax.annotation.Nullable; +import nu.marginalia.model.crawl.EdgeHtmlStandard; +import nu.marginalia.model.crawl.EdgeUrlState; +import nu.marginalia.converting.instruction.Instruction; +import 
nu.marginalia.converting.instruction.InstructionTag; +import nu.marginalia.converting.instruction.Interpreter; +import nu.marginalia.model.EdgeUrl; +import org.jetbrains.annotations.Nullable; public record LoadProcessedDocument(EdgeUrl url, diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/interpreter/instruction/LoadProcessedDocumentWithError.java b/crawl/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/LoadProcessedDocumentWithError.java similarity index 57% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/interpreter/instruction/LoadProcessedDocumentWithError.java rename to crawl/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/LoadProcessedDocumentWithError.java index 8d37cb64..b798ac49 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/interpreter/instruction/LoadProcessedDocumentWithError.java +++ b/crawl/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/LoadProcessedDocumentWithError.java @@ -1,10 +1,10 @@ -package nu.marginalia.wmsa.edge.converting.interpreter.instruction; +package nu.marginalia.converting.instruction.instructions; -import nu.marginalia.wmsa.edge.converting.interpreter.Instruction; -import nu.marginalia.wmsa.edge.converting.interpreter.InstructionTag; -import nu.marginalia.wmsa.edge.converting.interpreter.Interpreter; -import nu.marginalia.wmsa.edge.model.EdgeUrl; -import nu.marginalia.wmsa.edge.model.crawl.EdgeUrlState; +import nu.marginalia.model.crawl.EdgeUrlState; +import nu.marginalia.converting.instruction.Instruction; +import nu.marginalia.converting.instruction.InstructionTag; +import nu.marginalia.converting.instruction.Interpreter; +import nu.marginalia.model.EdgeUrl; public record LoadProcessedDocumentWithError(EdgeUrl url, diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/interpreter/instruction/LoadProcessedDomain.java 
b/crawl/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/LoadProcessedDomain.java similarity index 52% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/interpreter/instruction/LoadProcessedDomain.java rename to crawl/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/LoadProcessedDomain.java index 2b1fd631..b7784a2b 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/interpreter/instruction/LoadProcessedDomain.java +++ b/crawl/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/LoadProcessedDomain.java @@ -1,10 +1,10 @@ -package nu.marginalia.wmsa.edge.converting.interpreter.instruction; +package nu.marginalia.converting.instruction.instructions; -import nu.marginalia.wmsa.edge.converting.interpreter.Instruction; -import nu.marginalia.wmsa.edge.converting.interpreter.InstructionTag; -import nu.marginalia.wmsa.edge.converting.interpreter.Interpreter; -import nu.marginalia.wmsa.edge.model.EdgeDomain; -import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState; +import nu.marginalia.model.crawl.EdgeDomainIndexingState; +import nu.marginalia.converting.instruction.Instruction; +import nu.marginalia.converting.instruction.InstructionTag; +import nu.marginalia.converting.instruction.Interpreter; +import nu.marginalia.model.EdgeDomain; public record LoadProcessedDomain(EdgeDomain domain, EdgeDomainIndexingState state, String ip) implements Instruction { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/interpreter/instruction/LoadRssFeed.java b/crawl/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/LoadRssFeed.java similarity index 61% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/interpreter/instruction/LoadRssFeed.java rename to crawl/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/LoadRssFeed.java index 
d4dbe0eb..f6c8d7b5 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/interpreter/instruction/LoadRssFeed.java +++ b/crawl/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/LoadRssFeed.java @@ -1,9 +1,9 @@ -package nu.marginalia.wmsa.edge.converting.interpreter.instruction; +package nu.marginalia.converting.instruction.instructions; -import nu.marginalia.wmsa.edge.converting.interpreter.Instruction; -import nu.marginalia.wmsa.edge.converting.interpreter.InstructionTag; -import nu.marginalia.wmsa.edge.converting.interpreter.Interpreter; -import nu.marginalia.wmsa.edge.model.EdgeUrl; +import nu.marginalia.converting.instruction.Instruction; +import nu.marginalia.converting.instruction.InstructionTag; +import nu.marginalia.converting.instruction.Interpreter; +import nu.marginalia.model.EdgeUrl; import java.util.Arrays; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/interpreter/instruction/LoadUrl.java b/crawl/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/LoadUrl.java similarity index 60% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/interpreter/instruction/LoadUrl.java rename to crawl/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/LoadUrl.java index 50c2b34c..d126a515 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/interpreter/instruction/LoadUrl.java +++ b/crawl/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/LoadUrl.java @@ -1,9 +1,9 @@ -package nu.marginalia.wmsa.edge.converting.interpreter.instruction; +package nu.marginalia.converting.instruction.instructions; -import nu.marginalia.wmsa.edge.converting.interpreter.Instruction; -import nu.marginalia.wmsa.edge.converting.interpreter.InstructionTag; -import nu.marginalia.wmsa.edge.converting.interpreter.Interpreter; -import nu.marginalia.wmsa.edge.model.EdgeUrl; +import 
nu.marginalia.converting.instruction.Instruction; +import nu.marginalia.converting.instruction.InstructionTag; +import nu.marginalia.converting.instruction.Interpreter; +import nu.marginalia.model.EdgeUrl; import java.util.Arrays; diff --git a/crawl/converting-process/build.gradle b/crawl/converting-process/build.gradle new file mode 100644 index 00000000..5c016842 --- /dev/null +++ b/crawl/converting-process/build.gradle @@ -0,0 +1,70 @@ +plugins { + id 'java' + id "io.freefair.lombok" version "5.3.3.3" + id 'application' + id 'jvm-test-suite' +} + +java { + toolchain { + languageVersion.set(JavaLanguageVersion.of(17)) + } +} + +application { + mainClass = 'nu.marginalia.converting.ConverterMain' + applicationName = 'converter-process' +} + +dependencies { + implementation project(':third-party') + implementation project(':protocol') + implementation project(':common:model') + implementation project(':common:service') + implementation project(':common:config') + implementation project(':libraries:misc') + implementation project(':api:index-api') + implementation project(':common:service-discovery') + implementation project(':common:service-client') + implementation project(':libraries:language-processing') + implementation project(':crawl:common') + implementation project(':crawl:converting-model') + implementation project(':crawl:crawling-model') + + implementation libs.lombok + annotationProcessor libs.lombok + implementation libs.bundles.slf4j + + implementation libs.notnull + + implementation libs.jsoup + + implementation libs.guice + implementation libs.gson + + implementation libs.zstd + + implementation libs.bundles.mariadb + + implementation libs.trove + implementation libs.fastutil + + implementation libs.crawlercommons + + implementation libs.commons.lang3 + + testImplementation libs.bundles.slf4j.test + testImplementation libs.bundles.junit + testImplementation libs.mockito +} + +test { + maxHeapSize = "8G" + useJUnitPlatform() +} + +task 
fastTests(type: Test) { + useJUnitPlatform { + excludeTags "slow" + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ConversionLog.java b/crawl/converting-process/src/main/java/nu/marginalia/converting/ConversionLog.java similarity index 72% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ConversionLog.java rename to crawl/converting-process/src/main/java/nu/marginalia/converting/ConversionLog.java index b05ec3ad..564798f8 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ConversionLog.java +++ b/crawl/converting-process/src/main/java/nu/marginalia/converting/ConversionLog.java @@ -1,15 +1,15 @@ -package nu.marginalia.wmsa.edge.converting; +package nu.marginalia.converting; import com.github.luben.zstd.ZstdOutputStream; -import nu.marginalia.wmsa.edge.converting.interpreter.Interpreter; -import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DocumentKeywords; -import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DomainLink; -import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadProcessedDocument; -import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadProcessedDocumentWithError; -import nu.marginalia.wmsa.edge.index.model.EdgePageDocumentsMetadata; -import nu.marginalia.wmsa.edge.model.EdgeDomain; -import nu.marginalia.wmsa.edge.model.EdgeUrl; -import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState; +import nu.marginalia.model.crawl.EdgeDomainIndexingState; +import nu.marginalia.model.idx.EdgePageDocumentsMetadata; +import nu.marginalia.converting.instruction.Interpreter; +import nu.marginalia.model.crawl.DocumentKeywords; +import nu.marginalia.converting.instruction.instructions.DomainLink; +import nu.marginalia.converting.instruction.instructions.LoadProcessedDocument; +import nu.marginalia.converting.instruction.instructions.LoadProcessedDocumentWithError; +import nu.marginalia.model.EdgeDomain; +import 
nu.marginalia.model.EdgeUrl; import java.io.BufferedOutputStream; import java.io.IOException; @@ -21,9 +21,6 @@ import java.time.LocalDateTime; import java.time.ZoneOffset; public class ConversionLog implements AutoCloseable, Interpreter { - - - private final PrintWriter writer; public ConversionLog(Path rootDir) throws IOException { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ConverterMain.java b/crawl/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java similarity index 51% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ConverterMain.java rename to crawl/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java index 8b941d92..1c3d9776 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ConverterMain.java +++ b/crawl/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java @@ -1,17 +1,17 @@ -package nu.marginalia.wmsa.edge.converting; +package nu.marginalia.converting; import com.google.gson.Gson; import com.google.inject.Guice; import com.google.inject.Inject; import com.google.inject.Injector; +import nu.marginalia.crawling.common.WorkLog; +import nu.marginalia.crawling.common.plan.CrawlPlanLoader; +import nu.marginalia.crawling.common.plan.EdgeCrawlPlan; +import nu.marginalia.converting.compiler.InstructionsCompiler; +import nu.marginalia.converting.instruction.Instruction; +import nu.marginalia.converting.processor.DomainProcessor; +import nu.marginalia.crawling.model.CrawledDomain; import nu.marginalia.util.ParallelPipe; -import nu.marginalia.wmsa.edge.converting.compiler.InstructionsCompiler; -import nu.marginalia.wmsa.edge.converting.interpreter.Instruction; -import nu.marginalia.wmsa.edge.converting.processor.DomainProcessor; -import nu.marginalia.wmsa.edge.crawling.CrawlPlanLoader; -import nu.marginalia.wmsa.edge.crawling.WorkLog; -import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain; -import 
nu.marginalia.wmsa.edge.model.EdgeCrawlPlan; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -22,7 +22,7 @@ import java.util.List; public class ConverterMain { private final Logger logger = LoggerFactory.getLogger(getClass()); - private final LoadInstructionWriter instructionWriter; + private final InstructionWriter instructionWriter; public static void main(String... args) throws IOException { @@ -50,24 +50,36 @@ public class ConverterMain { try (WorkLog processLog = plan.createProcessWorkLog(); ConversionLog log = new ConversionLog(plan.process.getDir())) { - instructionWriter = new LoadInstructionWriter(log, plan.process.getDir(), gson); - var pipe = new ParallelPipe("Crawler", 16, 4, 2) { + instructionWriter = new InstructionWriter(log, plan.process.getDir(), gson); + var pipe = new ParallelPipe("Converter", 16, 4, 2) { @Override protected ProcessingInstructions onProcess(CrawledDomain domainData) { - var processed = processor.process(domainData); - var compiled = compiler.compile(processed); + Thread.currentThread().setName("Converter:Processor["+domainData.domain+"] - " + domainData.size()); + try { + var processed = processor.process(domainData); + var compiled = compiler.compile(processed); - return new ProcessingInstructions(domainData.id, compiled); + return new ProcessingInstructions(domainData.id, compiled); + } + finally { + Thread.currentThread().setName("Converter:Processor[IDLE]"); + } } @Override protected void onReceive(ProcessingInstructions processedInstructions) throws IOException { - var instructions = processedInstructions.instructions; - instructions.removeIf(Instruction::isNoOp); + Thread.currentThread().setName("Converter:Receiver["+processedInstructions.id+"]"); + try { + var instructions = processedInstructions.instructions; + instructions.removeIf(Instruction::isNoOp); - String where = instructionWriter.accept(processedInstructions.id, instructions); - processLog.setJobToFinished(processedInstructions.id, where, 
instructions.size()); + String where = instructionWriter.accept(processedInstructions.id, instructions); + processLog.setJobToFinished(processedInstructions.id, where, instructions.size()); + } + finally { + Thread.currentThread().setName("Converter:Receiver[IDLE]"); + } } }; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ConverterModule.java b/crawl/converting-process/src/main/java/nu/marginalia/converting/ConverterModule.java similarity index 52% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ConverterModule.java rename to crawl/converting-process/src/main/java/nu/marginalia/converting/ConverterModule.java index 745452be..b7f95683 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ConverterModule.java +++ b/crawl/converting-process/src/main/java/nu/marginalia/converting/ConverterModule.java @@ -1,17 +1,12 @@ -package nu.marginalia.wmsa.edge.converting; +package nu.marginalia.converting; import com.google.gson.Gson; import com.google.inject.AbstractModule; import com.google.inject.name.Names; -import nu.marginalia.util.language.conf.LanguageModels; -import nu.marginalia.wmsa.client.GsonFactory; -import nu.marginalia.wmsa.configuration.WmsaHome; -import nu.marginalia.wmsa.edge.index.client.EdgeIndexClient; -import nu.marginalia.wmsa.edge.index.client.EdgeIndexLocalService; -import nu.marginalia.wmsa.edge.index.client.EdgeIndexWriterClient; -import nu.marginalia.wmsa.edge.model.EdgeCrawlPlan; - -import java.nio.file.Path; +import nu.marginalia.LanguageModels; +import nu.marginalia.WmsaHome; +import nu.marginalia.crawling.common.plan.EdgeCrawlPlan; +import nu.marginalia.model.gson.GsonFactory; public class ConverterModule extends AbstractModule { @@ -31,15 +26,6 @@ public class ConverterModule extends AbstractModule { bind(Integer.class).annotatedWith(Names.named("max-title-length")).toInstance(128); bind(Integer.class).annotatedWith(Names.named("max-summary-length")).toInstance(255); - 
if (null != System.getProperty("local-index-path")) { - bind(Path.class).annotatedWith(Names.named("local-index-path")).toInstance(Path.of(System.getProperty("local-index-path"))); - bind(EdgeIndexWriterClient.class).to(EdgeIndexLocalService.class); - } - else { - bind(EdgeIndexWriterClient.class).to(EdgeIndexClient.class); - } - - bind(LanguageModels.class).toInstance(WmsaHome.getLanguageModels()); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LoadInstructionWriter.java b/crawl/converting-process/src/main/java/nu/marginalia/converting/InstructionWriter.java similarity index 79% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LoadInstructionWriter.java rename to crawl/converting-process/src/main/java/nu/marginalia/converting/InstructionWriter.java index b21fa138..b9a1e8cc 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LoadInstructionWriter.java +++ b/crawl/converting-process/src/main/java/nu/marginalia/converting/InstructionWriter.java @@ -1,17 +1,17 @@ -package nu.marginalia.wmsa.edge.converting; +package nu.marginalia.converting; import com.github.luben.zstd.ZstdOutputStream; import com.google.gson.Gson; -import nu.marginalia.wmsa.edge.converting.interpreter.Instruction; -import nu.marginalia.wmsa.edge.converting.interpreter.Interpreter; -import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DocumentKeywords; -import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DomainLink; -import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadProcessedDocument; -import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadProcessedDocumentWithError; -import nu.marginalia.wmsa.edge.index.model.EdgePageDocumentsMetadata; -import nu.marginalia.wmsa.edge.model.EdgeDomain; -import nu.marginalia.wmsa.edge.model.EdgeUrl; -import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState; +import nu.marginalia.model.crawl.EdgeDomainIndexingState; 
+import nu.marginalia.model.idx.EdgePageDocumentsMetadata; +import nu.marginalia.converting.instruction.Instruction; +import nu.marginalia.converting.instruction.Interpreter; +import nu.marginalia.model.crawl.DocumentKeywords; +import nu.marginalia.converting.instruction.instructions.DomainLink; +import nu.marginalia.converting.instruction.instructions.LoadProcessedDocument; +import nu.marginalia.converting.instruction.instructions.LoadProcessedDocumentWithError; +import nu.marginalia.model.EdgeDomain; +import nu.marginalia.model.EdgeUrl; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -23,14 +23,14 @@ import java.nio.file.Files; import java.nio.file.Path; import java.util.List; -public class LoadInstructionWriter { +public class InstructionWriter { private ConversionLog log; private final Path outputDir; private final Gson gson; - private static final Logger logger = LoggerFactory.getLogger(LoadInstructionWriter.class); + private static final Logger logger = LoggerFactory.getLogger(InstructionWriter.class); - public LoadInstructionWriter(ConversionLog log, Path outputDir, Gson gson) { + public InstructionWriter(ConversionLog log, Path outputDir, Gson gson) { this.log = log; this.outputDir = outputDir; this.gson = gson; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/UpdateDomainStatistics.java b/crawl/converting-process/src/main/java/nu/marginalia/converting/UpdateDomainStatistics.java similarity index 96% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/UpdateDomainStatistics.java rename to crawl/converting-process/src/main/java/nu/marginalia/converting/UpdateDomainStatistics.java index 428bd902..a59c7426 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/UpdateDomainStatistics.java +++ b/crawl/converting-process/src/main/java/nu/marginalia/converting/UpdateDomainStatistics.java @@ -1,8 +1,8 @@ -package nu.marginalia.wmsa.edge.converting; +package nu.marginalia.converting; 
import com.zaxxer.hikari.HikariDataSource; import gnu.trove.map.hash.TIntIntHashMap; -import nu.marginalia.wmsa.configuration.module.DatabaseModule; +import nu.marginalia.service.module.DatabaseModule; import java.sql.SQLException; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/compiler/DocumentsCompiler.java b/crawl/converting-process/src/main/java/nu/marginalia/converting/compiler/DocumentsCompiler.java similarity index 64% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/compiler/DocumentsCompiler.java rename to crawl/converting-process/src/main/java/nu/marginalia/converting/compiler/DocumentsCompiler.java index 9f35a557..923e48f5 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/compiler/DocumentsCompiler.java +++ b/crawl/converting-process/src/main/java/nu/marginalia/converting/compiler/DocumentsCompiler.java @@ -1,12 +1,12 @@ -package nu.marginalia.wmsa.edge.converting.compiler; +package nu.marginalia.converting.compiler; -import nu.marginalia.wmsa.edge.converting.interpreter.Instruction; -import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DocumentKeywords; -import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadKeywords; -import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadProcessedDocument; -import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadProcessedDocumentWithError; -import nu.marginalia.wmsa.edge.converting.model.ProcessedDocument; -import nu.marginalia.wmsa.edge.converting.processor.logic.HtmlFeature; +import nu.marginalia.converting.instruction.Instruction; +import nu.marginalia.model.crawl.DocumentKeywords; +import nu.marginalia.converting.instruction.instructions.LoadKeywords; +import nu.marginalia.converting.instruction.instructions.LoadProcessedDocument; +import nu.marginalia.converting.instruction.instructions.LoadProcessedDocumentWithError; +import nu.marginalia.converting.model.ProcessedDocument; 
+import nu.marginalia.model.crawl.HtmlFeature; import java.util.List; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/compiler/FeedsCompiler.java b/crawl/converting-process/src/main/java/nu/marginalia/converting/compiler/FeedsCompiler.java similarity index 59% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/compiler/FeedsCompiler.java rename to crawl/converting-process/src/main/java/nu/marginalia/converting/compiler/FeedsCompiler.java index e3774288..64779a0f 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/compiler/FeedsCompiler.java +++ b/crawl/converting-process/src/main/java/nu/marginalia/converting/compiler/FeedsCompiler.java @@ -1,9 +1,9 @@ -package nu.marginalia.wmsa.edge.converting.compiler; +package nu.marginalia.converting.compiler; -import nu.marginalia.wmsa.edge.converting.interpreter.Instruction; -import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadRssFeed; -import nu.marginalia.wmsa.edge.converting.model.ProcessedDocument; -import nu.marginalia.wmsa.edge.model.EdgeUrl; +import nu.marginalia.converting.instruction.Instruction; +import nu.marginalia.converting.instruction.instructions.LoadRssFeed; +import nu.marginalia.converting.model.ProcessedDocument; +import nu.marginalia.model.EdgeUrl; import java.util.List; import java.util.Objects; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/compiler/InstructionsCompiler.java b/crawl/converting-process/src/main/java/nu/marginalia/converting/compiler/InstructionsCompiler.java similarity index 85% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/compiler/InstructionsCompiler.java rename to crawl/converting-process/src/main/java/nu/marginalia/converting/compiler/InstructionsCompiler.java index 1b3614a1..a2242961 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/compiler/InstructionsCompiler.java +++ 
b/crawl/converting-process/src/main/java/nu/marginalia/converting/compiler/InstructionsCompiler.java @@ -1,9 +1,9 @@ -package nu.marginalia.wmsa.edge.converting.compiler; +package nu.marginalia.converting.compiler; import com.google.inject.Inject; -import nu.marginalia.wmsa.edge.converting.interpreter.Instruction; -import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadProcessedDomain; -import nu.marginalia.wmsa.edge.converting.model.ProcessedDomain; +import nu.marginalia.converting.instruction.Instruction; +import nu.marginalia.converting.instruction.instructions.LoadProcessedDomain; +import nu.marginalia.converting.model.ProcessedDomain; import java.util.ArrayList; import java.util.List; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/compiler/LinksCompiler.java b/crawl/converting-process/src/main/java/nu/marginalia/converting/compiler/LinksCompiler.java similarity index 59% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/compiler/LinksCompiler.java rename to crawl/converting-process/src/main/java/nu/marginalia/converting/compiler/LinksCompiler.java index cb115821..a578602d 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/compiler/LinksCompiler.java +++ b/crawl/converting-process/src/main/java/nu/marginalia/converting/compiler/LinksCompiler.java @@ -1,10 +1,10 @@ -package nu.marginalia.wmsa.edge.converting.compiler; +package nu.marginalia.converting.compiler; -import nu.marginalia.wmsa.edge.converting.interpreter.Instruction; -import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DomainLink; -import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadDomainLink; -import nu.marginalia.wmsa.edge.converting.model.ProcessedDocument; -import nu.marginalia.wmsa.edge.model.EdgeDomain; +import nu.marginalia.converting.instruction.Instruction; +import nu.marginalia.converting.instruction.instructions.DomainLink; +import 
nu.marginalia.converting.instruction.instructions.LoadDomainLink; +import nu.marginalia.converting.model.ProcessedDocument; +import nu.marginalia.model.EdgeDomain; import java.util.List; import java.util.Objects; diff --git a/crawl/converting-process/src/main/java/nu/marginalia/converting/compiler/RedirectCompiler.java b/crawl/converting-process/src/main/java/nu/marginalia/converting/compiler/RedirectCompiler.java new file mode 100644 index 00000000..b14dedca --- /dev/null +++ b/crawl/converting-process/src/main/java/nu/marginalia/converting/compiler/RedirectCompiler.java @@ -0,0 +1,19 @@ +package nu.marginalia.converting.compiler; + +import nu.marginalia.converting.instruction.Instruction; +import nu.marginalia.converting.instruction.instructions.DomainLink; +import nu.marginalia.converting.instruction.instructions.LoadDomain; +import nu.marginalia.converting.instruction.instructions.LoadDomainLink; +import nu.marginalia.converting.instruction.instructions.LoadDomainRedirect; +import nu.marginalia.model.EdgeDomain; + +import java.util.List; + +public class RedirectCompiler { + + public void compile(List ret, EdgeDomain from, EdgeDomain to) { + ret.add(new LoadDomain(to)); + ret.add(new LoadDomainLink(new DomainLink(from, to))); + ret.add(new LoadDomainRedirect(new DomainLink(from, to))); + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/compiler/UrlsCompiler.java b/crawl/converting-process/src/main/java/nu/marginalia/converting/compiler/UrlsCompiler.java similarity index 75% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/compiler/UrlsCompiler.java rename to crawl/converting-process/src/main/java/nu/marginalia/converting/compiler/UrlsCompiler.java index b847aa21..4d05a35d 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/compiler/UrlsCompiler.java +++ b/crawl/converting-process/src/main/java/nu/marginalia/converting/compiler/UrlsCompiler.java @@ -1,11 +1,11 @@ -package 
nu.marginalia.wmsa.edge.converting.compiler; +package nu.marginalia.converting.compiler; -import nu.marginalia.wmsa.edge.converting.interpreter.Instruction; -import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadDomain; -import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadUrl; -import nu.marginalia.wmsa.edge.converting.model.ProcessedDocument; -import nu.marginalia.wmsa.edge.model.EdgeDomain; -import nu.marginalia.wmsa.edge.model.EdgeUrl; +import nu.marginalia.converting.instruction.Instruction; +import nu.marginalia.converting.instruction.instructions.LoadDomain; +import nu.marginalia.converting.instruction.instructions.LoadUrl; +import nu.marginalia.converting.model.ProcessedDocument; +import nu.marginalia.model.EdgeDomain; +import nu.marginalia.model.EdgeUrl; import java.util.ArrayList; import java.util.HashSet; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/model/DisqualifiedException.java b/crawl/converting-process/src/main/java/nu/marginalia/converting/model/DisqualifiedException.java similarity index 89% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/model/DisqualifiedException.java rename to crawl/converting-process/src/main/java/nu/marginalia/converting/model/DisqualifiedException.java index 3c97622a..e73a2d12 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/model/DisqualifiedException.java +++ b/crawl/converting-process/src/main/java/nu/marginalia/converting/model/DisqualifiedException.java @@ -1,6 +1,6 @@ -package nu.marginalia.wmsa.edge.converting.model; +package nu.marginalia.converting.model; -import nu.marginalia.wmsa.edge.crawling.model.CrawlerDocumentStatus; +import nu.marginalia.crawling.model.CrawlerDocumentStatus; public class DisqualifiedException extends Exception { public final DisqualificationReason reason; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/model/ProcessedDocument.java 
b/crawl/converting-process/src/main/java/nu/marginalia/converting/model/ProcessedDocument.java similarity index 52% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/model/ProcessedDocument.java rename to crawl/converting-process/src/main/java/nu/marginalia/converting/model/ProcessedDocument.java index df3367cc..da589c44 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/model/ProcessedDocument.java +++ b/crawl/converting-process/src/main/java/nu/marginalia/converting/model/ProcessedDocument.java @@ -1,9 +1,10 @@ -package nu.marginalia.wmsa.edge.converting.model; +package nu.marginalia.converting.model; import lombok.ToString; -import nu.marginalia.wmsa.edge.model.EdgeUrl; -import nu.marginalia.wmsa.edge.model.crawl.EdgePageWords; -import nu.marginalia.wmsa.edge.model.crawl.EdgeUrlState; +import nu.marginalia.model.crawl.EdgePageDocumentFlags; +import nu.marginalia.model.crawl.EdgePageWords; +import nu.marginalia.model.crawl.EdgeUrlState; +import nu.marginalia.model.EdgeUrl; import java.util.OptionalDouble; @@ -17,10 +18,22 @@ public class ProcessedDocument { public EdgeUrlState state; public String stateReason; + public long lshHash; + public boolean isOk() { return EdgeUrlState.OK == state; } + public boolean isProcessedFully() { + if (!isOk()) + return false; + + if (details == null) + return false; + + return !details.metadata.hasFlag(EdgePageDocumentFlags.Simple); + } + public OptionalDouble quality() { if (details != null) { return OptionalDouble.of(details.quality); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/model/ProcessedDocumentDetails.java b/crawl/converting-process/src/main/java/nu/marginalia/converting/model/ProcessedDocumentDetails.java similarity index 65% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/model/ProcessedDocumentDetails.java rename to 
crawl/converting-process/src/main/java/nu/marginalia/converting/model/ProcessedDocumentDetails.java index 29b2ecc3..bd120aac 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/model/ProcessedDocumentDetails.java +++ b/crawl/converting-process/src/main/java/nu/marginalia/converting/model/ProcessedDocumentDetails.java @@ -1,10 +1,10 @@ -package nu.marginalia.wmsa.edge.converting.model; +package nu.marginalia.converting.model; import lombok.ToString; -import nu.marginalia.wmsa.edge.converting.processor.logic.HtmlFeature; -import nu.marginalia.wmsa.edge.index.model.EdgePageDocumentsMetadata; -import nu.marginalia.wmsa.edge.model.EdgeUrl; -import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard; +import nu.marginalia.model.crawl.EdgeHtmlStandard; +import nu.marginalia.model.crawl.HtmlFeature; +import nu.marginalia.model.idx.EdgePageDocumentsMetadata; +import nu.marginalia.model.EdgeUrl; import javax.annotation.Nullable; import java.util.List; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/model/ProcessedDomain.java b/crawl/converting-process/src/main/java/nu/marginalia/converting/model/ProcessedDomain.java similarity index 82% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/model/ProcessedDomain.java rename to crawl/converting-process/src/main/java/nu/marginalia/converting/model/ProcessedDomain.java index 101d1fb8..f3a08b98 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/model/ProcessedDomain.java +++ b/crawl/converting-process/src/main/java/nu/marginalia/converting/model/ProcessedDomain.java @@ -1,8 +1,8 @@ -package nu.marginalia.wmsa.edge.converting.model; +package nu.marginalia.converting.model; import lombok.ToString; -import nu.marginalia.wmsa.edge.model.EdgeDomain; -import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState; +import nu.marginalia.model.EdgeDomain; +import nu.marginalia.model.crawl.EdgeDomainIndexingState; import java.util.List; 
import java.util.Optional; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/AcceptableAds.java b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/AcceptableAds.java similarity index 85% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/AcceptableAds.java rename to crawl/converting-process/src/main/java/nu/marginalia/converting/processor/AcceptableAds.java index 2814eea7..d097c60a 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/AcceptableAds.java +++ b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/AcceptableAds.java @@ -1,6 +1,6 @@ -package nu.marginalia.wmsa.edge.converting.processor; +package nu.marginalia.converting.processor; -import nu.marginalia.wmsa.edge.crawling.model.CrawledDocument; +import nu.marginalia.crawling.model.CrawledDocument; import org.jsoup.nodes.Document; diff --git a/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/DocumentProcessor.java b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/DocumentProcessor.java new file mode 100644 index 00000000..b7072236 --- /dev/null +++ b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/DocumentProcessor.java @@ -0,0 +1,136 @@ +package nu.marginalia.converting.processor; + +import com.google.inject.Inject; +import nu.marginalia.crawling.model.CrawledDocument; +import nu.marginalia.crawling.model.CrawledDomain; +import nu.marginalia.crawling.model.CrawlerDocumentStatus; +import nu.marginalia.model.crawl.EdgeUrlState; +import nu.marginalia.converting.model.DisqualifiedException; +import nu.marginalia.converting.model.ProcessedDocument; +import nu.marginalia.converting.processor.plugin.AbstractDocumentProcessorPlugin; +import nu.marginalia.converting.processor.plugin.HtmlDocumentProcessorPlugin; +import 
nu.marginalia.converting.processor.plugin.PlainTextDocumentProcessorPlugin; +import nu.marginalia.model.EdgeUrl; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.net.URISyntaxException; +import java.util.*; + +public class DocumentProcessor { + + private final Logger logger = LoggerFactory.getLogger(getClass()); + + private static final Set acceptedContentTypes = Set.of("application/xhtml+xml", + "application/xhtml", + "text/html", + "text/plain"); + + + private final List processorPlugins = new ArrayList<>(); + + @Inject + public DocumentProcessor(HtmlDocumentProcessorPlugin htmlDocumentProcessorPlugin, + PlainTextDocumentProcessorPlugin plainTextDocumentProcessorPlugin) + { + + processorPlugins.add(htmlDocumentProcessorPlugin); + processorPlugins.add(plainTextDocumentProcessorPlugin); + } + + public ProcessedDocument process(CrawledDocument crawledDocument, CrawledDomain crawledDomain) { + ProcessedDocument ret = new ProcessedDocument(); + + try { + processDocument(crawledDocument, crawledDomain, ret); + } + catch (DisqualifiedException ex) { + ret.state = EdgeUrlState.DISQUALIFIED; + ret.stateReason = ex.reason.toString(); + logger.debug("Disqualified {}: {}", ret.url, ex.reason); + } + catch (Exception ex) { + ret.state = EdgeUrlState.DISQUALIFIED; + ret.stateReason = DisqualifiedException.DisqualificationReason.PROCESSING_EXCEPTION.toString(); + logger.info("Failed to convert " + crawledDocument.url, ex); + ex.printStackTrace(); + } + + return ret; + } + + private void processDocument(CrawledDocument crawledDocument, CrawledDomain crawledDomain, ProcessedDocument ret) throws URISyntaxException, DisqualifiedException { + + var crawlerStatus = CrawlerDocumentStatus.valueOf(crawledDocument.crawlerStatus); + if (crawlerStatus != CrawlerDocumentStatus.OK) { + throw new DisqualifiedException(crawlerStatus); + } + + if (AcceptableAds.hasAcceptableAdsHeader(crawledDocument)) { + throw new 
DisqualifiedException(DisqualifiedException.DisqualificationReason.ACCEPTABLE_ADS); + } + + if (!isAcceptedContentType(crawledDocument)) { + throw new DisqualifiedException(DisqualifiedException.DisqualificationReason.CONTENT_TYPE); + } + + + ret.url = getDocumentUrl(crawledDocument); + ret.state = crawlerStatusToUrlState(crawledDocument.crawlerStatus, crawledDocument.httpStatus); + + final var plugin = findPlugin(crawledDocument); + + AbstractDocumentProcessorPlugin.DetailsWithWords detailsWithWords = plugin.createDetails(crawledDomain, crawledDocument); + + ret.details = detailsWithWords.details(); + ret.words = detailsWithWords.words(); + } + + private AbstractDocumentProcessorPlugin findPlugin(CrawledDocument crawledDocument) throws DisqualifiedException { + for (var plugin : processorPlugins) { + if (plugin.isApplicable(crawledDocument)) + return plugin; + } + + throw new DisqualifiedException(DisqualifiedException.DisqualificationReason.CONTENT_TYPE); + } + + + private EdgeUrl getDocumentUrl(CrawledDocument crawledDocument) + throws URISyntaxException + { + if (crawledDocument.canonicalUrl != null) { + try { + return new EdgeUrl(crawledDocument.canonicalUrl); + } + catch (URISyntaxException ex) { /* fallthrough */ } + } + + return new EdgeUrl(crawledDocument.url); + } + + public static boolean isAcceptedContentType(CrawledDocument crawledDocument) { + if (crawledDocument.contentType == null) { + return false; + } + + var ct = crawledDocument.contentType; + + if (acceptedContentTypes.contains(ct)) + return true; + + if (ct.contains(";")) { + return acceptedContentTypes.contains(ct.substring(0, ct.indexOf(';'))); + } + return false; + } + + private EdgeUrlState crawlerStatusToUrlState(String crawlerStatus, int httpStatus) { + return switch (CrawlerDocumentStatus.valueOf(crawlerStatus)) { + case OK -> httpStatus < 300 ? 
EdgeUrlState.OK : EdgeUrlState.DEAD; + case REDIRECT -> EdgeUrlState.REDIRECT; + default -> EdgeUrlState.DEAD; + }; + } + +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DomainProcessor.java b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java similarity index 83% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DomainProcessor.java rename to crawl/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java index e5ed00e5..005a1efc 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DomainProcessor.java +++ b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java @@ -1,29 +1,33 @@ -package nu.marginalia.wmsa.edge.converting.processor; +package nu.marginalia.converting.processor; import com.google.common.base.Strings; import com.google.inject.Inject; +import nu.marginalia.crawling.model.CrawledDocument; +import nu.marginalia.crawling.model.CrawledDomain; +import nu.marginalia.crawling.model.CrawlerDocumentStatus; +import nu.marginalia.crawling.model.CrawlerDomainStatus; +import nu.marginalia.model.crawl.EdgeDomainIndexingState; +import nu.marginalia.converting.model.ProcessedDomain; +import nu.marginalia.model.EdgeDomain; +import nu.marginalia.model.EdgeUrl; import nu.marginalia.util.StringPool; -import nu.marginalia.wmsa.edge.converting.model.ProcessedDomain; -import nu.marginalia.wmsa.edge.converting.processor.logic.InternalLinkGraph; -import nu.marginalia.wmsa.edge.crawling.model.CrawledDocument; -import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain; -import nu.marginalia.wmsa.edge.crawling.model.CrawlerDomainStatus; -import nu.marginalia.wmsa.edge.model.EdgeDomain; -import nu.marginalia.wmsa.edge.model.EdgeUrl; -import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState; +import 
nu.marginalia.converting.processor.logic.InternalLinkGraph; +import nu.marginalia.converting.processor.logic.LshDocumentDeduplicator; import java.util.*; -import static nu.marginalia.wmsa.edge.crawling.model.CrawlerDocumentStatus.BAD_CANONICAL; - public class DomainProcessor { private final DocumentProcessor documentProcessor; private final SiteWords siteWords; + private final LshDocumentDeduplicator documentDeduplicator; + @Inject public DomainProcessor(DocumentProcessor documentProcessor, - SiteWords siteWords) { + SiteWords siteWords, + LshDocumentDeduplicator documentDeduplicator) { this.documentProcessor = documentProcessor; this.siteWords = siteWords; + this.documentDeduplicator = documentDeduplicator; } public ProcessedDomain process(CrawledDomain crawledDomain) { @@ -62,6 +66,8 @@ public class DomainProcessor { stringPool.flush(); + documentDeduplicator.deduplicate(ret.documents); + InternalLinkGraph internalLinkGraph = new InternalLinkGraph(); ret.documents.forEach(internalLinkGraph::accept); @@ -84,6 +90,7 @@ public class DomainProcessor { return ret; } + private void fixBadCanonicalTags(List docs) { Map> seenCanonicals = new HashMap<>(); Set seenUrls = new HashSet<>(); @@ -108,7 +115,7 @@ public class DomainProcessor { document.canonicalUrl = document.url; } else { - document.crawlerStatus = BAD_CANONICAL.name(); + document.crawlerStatus = CrawlerDocumentStatus.BAD_CANONICAL.name(); } } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/SiteWords.java b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/SiteWords.java similarity index 85% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/SiteWords.java rename to crawl/converting-process/src/main/java/nu/marginalia/converting/processor/SiteWords.java index 87e8c931..2fd75cba 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/SiteWords.java +++ 
b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/SiteWords.java @@ -1,12 +1,12 @@ -package nu.marginalia.wmsa.edge.converting.processor; +package nu.marginalia.converting.processor; import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap; -import nu.marginalia.wmsa.edge.converting.model.ProcessedDocument; -import nu.marginalia.wmsa.edge.converting.model.ProcessedDomain; -import nu.marginalia.wmsa.edge.converting.processor.logic.CommonKeywordExtractor; -import nu.marginalia.wmsa.edge.converting.processor.logic.InternalLinkGraph; -import nu.marginalia.wmsa.edge.index.model.EdgePageWordFlags; -import nu.marginalia.wmsa.edge.model.EdgeUrl; +import nu.marginalia.model.crawl.EdgePageWordFlags; +import nu.marginalia.converting.model.ProcessedDocument; +import nu.marginalia.converting.model.ProcessedDomain; +import nu.marginalia.model.EdgeUrl; +import nu.marginalia.converting.processor.logic.CommonKeywordExtractor; +import nu.marginalia.converting.processor.logic.InternalLinkGraph; import javax.inject.Singleton; import java.util.HashMap; diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/DocumentKeywordExtractor.java b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/keywords/DocumentKeywordExtractor.java similarity index 94% rename from marginalia_nu/src/main/java/nu/marginalia/util/language/processing/DocumentKeywordExtractor.java rename to crawl/converting-process/src/main/java/nu/marginalia/converting/processor/keywords/DocumentKeywordExtractor.java index 557e8d58..a3d225d8 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/DocumentKeywordExtractor.java +++ b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/keywords/DocumentKeywordExtractor.java @@ -1,12 +1,14 @@ -package nu.marginalia.util.language.processing; +package nu.marginalia.converting.processor.keywords; -import nu.marginalia.util.language.WordPatterns; -import 
nu.marginalia.util.language.processing.model.DocumentLanguageData; -import nu.marginalia.util.language.processing.model.KeywordMetadata; -import nu.marginalia.util.language.processing.model.WordRep; -import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict; -import nu.marginalia.wmsa.edge.index.model.EdgePageWordFlags; -import nu.marginalia.wmsa.edge.model.crawl.EdgePageWords; +import nu.marginalia.language.WordPatterns; +import nu.marginalia.language.encoding.AsciiFlattener; +import nu.marginalia.language.keywords.KeywordExtractor; +import nu.marginalia.language.model.DocumentLanguageData; +import nu.marginalia.language.model.KeywordMetadata; +import nu.marginalia.language.model.WordRep; +import nu.marginalia.model.crawl.EdgePageWordFlags; +import nu.marginalia.model.crawl.EdgePageWords; +import nu.marginalia.language.statistics.TermFrequencyDict; import javax.inject.Inject; import java.util.*; diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/KeywordCounter.java b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/keywords/KeywordCounter.java similarity index 85% rename from marginalia_nu/src/main/java/nu/marginalia/util/language/processing/KeywordCounter.java rename to crawl/converting-process/src/main/java/nu/marginalia/converting/processor/keywords/KeywordCounter.java index 2ee90f6b..c153be0b 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/KeywordCounter.java +++ b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/keywords/KeywordCounter.java @@ -1,12 +1,14 @@ -package nu.marginalia.util.language.processing; +package nu.marginalia.converting.processor.keywords; -import com.github.jknack.handlebars.internal.lang3.StringUtils; import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap; -import nu.marginalia.util.language.WordPatterns; -import nu.marginalia.util.language.processing.model.DocumentLanguageData; -import 
nu.marginalia.util.language.processing.model.KeywordMetadata; -import nu.marginalia.util.language.processing.model.WordRep; -import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict; +import nu.marginalia.language.WordPatterns; +import nu.marginalia.language.model.DocumentLanguageData; +import nu.marginalia.language.model.KeywordMetadata; +import nu.marginalia.language.model.WordFrequencyData; +import nu.marginalia.language.model.WordRep; +import nu.marginalia.language.keywords.KeywordExtractor; +import nu.marginalia.language.statistics.TermFrequencyDict; +import org.apache.commons.lang3.StringUtils; import java.util.ArrayList; import java.util.HashMap; @@ -105,5 +107,4 @@ public class KeywordCounter { return (0.1 + 0.9*value/maxValue) * Math.log(freq/docCount); } - public record WordFrequencyData(int count, int tfIdfNormalized) { } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/NameCounter.java b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/keywords/NameCounter.java similarity index 83% rename from marginalia_nu/src/main/java/nu/marginalia/util/language/processing/NameCounter.java rename to crawl/converting-process/src/main/java/nu/marginalia/converting/processor/keywords/NameCounter.java index 221790d6..22ce88ed 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/NameCounter.java +++ b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/keywords/NameCounter.java @@ -1,8 +1,9 @@ -package nu.marginalia.util.language.processing; +package nu.marginalia.converting.processor.keywords; -import nu.marginalia.util.language.processing.model.DocumentLanguageData; -import nu.marginalia.util.language.processing.model.DocumentSentence; -import nu.marginalia.util.language.processing.model.WordRep; +import nu.marginalia.language.model.DocumentLanguageData; +import nu.marginalia.language.model.DocumentSentence; +import nu.marginalia.language.model.WordRep; 
+import nu.marginalia.language.keywords.KeywordExtractor; import java.util.*; import java.util.stream.Collectors; diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/SubjectCounter.java b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/keywords/SubjectCounter.java similarity index 89% rename from marginalia_nu/src/main/java/nu/marginalia/util/language/processing/SubjectCounter.java rename to crawl/converting-process/src/main/java/nu/marginalia/converting/processor/keywords/SubjectCounter.java index b0f46f30..e99cbb5c 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/SubjectCounter.java +++ b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/keywords/SubjectCounter.java @@ -1,10 +1,11 @@ -package nu.marginalia.util.language.processing; +package nu.marginalia.converting.processor.keywords; -import nu.marginalia.util.language.processing.model.DocumentLanguageData; -import nu.marginalia.util.language.processing.model.KeywordMetadata; -import nu.marginalia.util.language.processing.model.WordRep; -import nu.marginalia.util.language.processing.model.WordSpan; -import nu.marginalia.util.language.processing.model.tag.WordSeparator; +import nu.marginalia.language.model.DocumentLanguageData; +import nu.marginalia.language.model.KeywordMetadata; +import nu.marginalia.language.model.WordRep; +import nu.marginalia.language.model.WordSpan; +import nu.marginalia.language.model.WordSeparator; +import nu.marginalia.language.keywords.KeywordExtractor; import org.apache.commons.lang3.StringUtils; import java.util.*; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/CommonKeywordExtractor.java b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/CommonKeywordExtractor.java similarity index 92% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/CommonKeywordExtractor.java 
rename to crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/CommonKeywordExtractor.java index cabb6454..eb7d39e7 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/CommonKeywordExtractor.java +++ b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/CommonKeywordExtractor.java @@ -1,8 +1,8 @@ -package nu.marginalia.wmsa.edge.converting.processor.logic; +package nu.marginalia.converting.processor.logic; import ca.rmen.porterstemmer.PorterStemmer; -import nu.marginalia.wmsa.edge.converting.model.ProcessedDomain; -import nu.marginalia.wmsa.edge.index.model.EdgePageWordFlags; +import nu.marginalia.model.crawl.EdgePageWordFlags; +import nu.marginalia.converting.model.ProcessedDomain; import java.util.*; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/DocumentValuator.java b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/DocumentValuator.java similarity index 80% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/DocumentValuator.java rename to crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/DocumentValuator.java index 46fc7925..36d4f3b0 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/DocumentValuator.java +++ b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/DocumentValuator.java @@ -1,16 +1,14 @@ -package nu.marginalia.wmsa.edge.converting.processor.logic; +package nu.marginalia.converting.processor.logic; import crawlercommons.utils.Strings; -import nu.marginalia.util.language.processing.model.DocumentLanguageData; -import nu.marginalia.wmsa.edge.converting.model.DisqualifiedException; -import nu.marginalia.wmsa.edge.crawling.model.CrawledDocument; -import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard; +import 
nu.marginalia.crawling.model.CrawledDocument; +import nu.marginalia.model.crawl.EdgeHtmlStandard; +import nu.marginalia.language.model.DocumentLanguageData; +import nu.marginalia.converting.model.DisqualifiedException; import org.jsoup.nodes.Document; import java.util.Set; -import static nu.marginalia.wmsa.edge.converting.model.DisqualifiedException.DisqualificationReason.LENGTH; - public class DocumentValuator { private static final Set filthTable = Set.of( @@ -32,7 +30,7 @@ public class DocumentValuator { int rawLength = crawledDocument.documentBody.length(); if (textBodyLength == 0) { - throw new DisqualifiedException(LENGTH); + throw new DisqualifiedException(DisqualifiedException.DisqualificationReason.LENGTH); } return Math.log(textBodyLength / (double) (1+rawLength))*htmlStandard.scale diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/DomPruningFilter.java b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/DomPruningFilter.java similarity index 98% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/DomPruningFilter.java rename to crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/DomPruningFilter.java index 1e68125f..f8c0b65d 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/DomPruningFilter.java +++ b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/DomPruningFilter.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.converting.processor.logic; +package nu.marginalia.converting.processor.logic; import org.jsoup.nodes.Element; import org.jsoup.nodes.Node; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/FeatureExtractor.java b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/FeatureExtractor.java similarity index 94% rename from 
marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/FeatureExtractor.java rename to crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/FeatureExtractor.java index 2681b1c6..c0e1bc91 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/FeatureExtractor.java +++ b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/FeatureExtractor.java @@ -1,10 +1,11 @@ -package nu.marginalia.wmsa.edge.converting.processor.logic; +package nu.marginalia.converting.processor.logic; import com.google.inject.Inject; import com.google.inject.Singleton; -import nu.marginalia.util.language.processing.model.DocumentLanguageData; -import nu.marginalia.wmsa.edge.converting.processor.logic.topic.*; -import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain; +import nu.marginalia.crawling.model.CrawledDomain; +import nu.marginalia.language.model.DocumentLanguageData; +import nu.marginalia.model.crawl.HtmlFeature; +import nu.marginalia.converting.processor.logic.topic.*; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/FeedExtractor.java b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/FeedExtractor.java similarity index 90% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/FeedExtractor.java rename to crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/FeedExtractor.java index b82b77ac..c20a9878 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/FeedExtractor.java +++ b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/FeedExtractor.java @@ -1,6 +1,7 @@ -package nu.marginalia.wmsa.edge.converting.processor.logic; +package nu.marginalia.converting.processor.logic; 
-import nu.marginalia.wmsa.edge.model.EdgeUrl; +import nu.marginalia.crawling.common.link.LinkParser; +import nu.marginalia.model.EdgeUrl; import org.jsoup.nodes.Element; import org.slf4j.Logger; import org.slf4j.LoggerFactory; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/HtmlStandardExtractor.java b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/HtmlStandardExtractor.java similarity index 70% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/HtmlStandardExtractor.java rename to crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/HtmlStandardExtractor.java index 3eeb4bce..5179274c 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/HtmlStandardExtractor.java +++ b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/HtmlStandardExtractor.java @@ -1,14 +1,12 @@ -package nu.marginalia.wmsa.edge.converting.processor.logic; +package nu.marginalia.converting.processor.logic; import com.google.common.base.Strings; -import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard; +import nu.marginalia.model.crawl.EdgeHtmlStandard; import org.jsoup.nodes.Document; import org.jsoup.nodes.DocumentType; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import static nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard.*; - public class HtmlStandardExtractor { @@ -16,48 +14,48 @@ public class HtmlStandardExtractor { public static EdgeHtmlStandard parseDocType(DocumentType docType) { if (null == docType) { - return UNKNOWN; + return EdgeHtmlStandard.UNKNOWN; } String publicId = docType.publicId(); if (Strings.isNullOrEmpty(publicId)) - return HTML5; + return EdgeHtmlStandard.HTML5; publicId = publicId.toUpperCase(); if (publicId.startsWith("-//SOFTQUAD SOFTWARE//DTD") && publicId.contains("HTML 4")) { - return HTML4; + return EdgeHtmlStandard.HTML4; } if 
(publicId.startsWith("-//SOFTQUAD SOFTWARE//DTD") && publicId.contains("HTML 3")) { - return HTML123; + return EdgeHtmlStandard.HTML123; } if (publicId.startsWith("-//INTERNET/RFC XXXX//EN")) - return HTML123; + return EdgeHtmlStandard.HTML123; if (publicId.startsWith("-//NETSCAPE COMM. CORP")) - return HTML123; + return EdgeHtmlStandard.HTML123; if (publicId.startsWith("-//SQ//DTD HTML 2")) - return HTML123; + return EdgeHtmlStandard.HTML123; if (publicId.startsWith("-//SOFTQUAD//DTD HTML 2")) - return HTML123; + return EdgeHtmlStandard.HTML123; if (publicId.startsWith("-//W3O//DTD W3 HTML 2")) - return HTML123; + return EdgeHtmlStandard.HTML123; if (publicId.startsWith("-//IETF//DTD HTML 2")) - return HTML123; + return EdgeHtmlStandard.HTML123; if (publicId.startsWith("-//IETF//DTD HTML//EN")) - return HTML123; + return EdgeHtmlStandard.HTML123; if (publicId.startsWith("-/W3C//DTD HTML 3")) - return HTML123; + return EdgeHtmlStandard.HTML123; if (publicId.startsWith("-/W3C/DTD HTML 3")) - return HTML123; + return EdgeHtmlStandard.HTML123; if (publicId.startsWith("-//IETF//DTD HTML 3")) - return HTML123; + return EdgeHtmlStandard.HTML123; if (publicId.startsWith("-//W3C//DTD XHTML")) - return XHTML; + return EdgeHtmlStandard.XHTML; if (publicId.startsWith("ISO/IEC 15445:2000//DTD")) - return XHTML; + return EdgeHtmlStandard.XHTML; if (publicId.startsWith("-//W3C//DTD HTML")) - return HTML4; + return EdgeHtmlStandard.HTML4; logger.debug("Unknown publicID standard {}", publicId); - return UNKNOWN; + return EdgeHtmlStandard.UNKNOWN; } public static EdgeHtmlStandard sniffHtmlStandard(Document parsed) { @@ -74,11 +72,11 @@ public class HtmlStandardExtractor { html4Attributes++; } if (html5Attributes > 0) { - return HTML5; + return EdgeHtmlStandard.HTML5; } if (html4Attributes > 0) { - return HTML4; + return EdgeHtmlStandard.HTML4; } - return HTML123; + return EdgeHtmlStandard.HTML123; } } diff --git 
a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/InternalLinkGraph.java b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/InternalLinkGraph.java similarity index 88% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/InternalLinkGraph.java rename to crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/InternalLinkGraph.java index 6b0cd10f..1f69cf31 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/InternalLinkGraph.java +++ b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/InternalLinkGraph.java @@ -1,8 +1,8 @@ -package nu.marginalia.wmsa.edge.converting.processor.logic; +package nu.marginalia.converting.processor.logic; -import nu.marginalia.wmsa.edge.converting.model.ProcessedDocument; -import nu.marginalia.wmsa.edge.index.model.EdgePageWordFlags; -import nu.marginalia.wmsa.edge.model.EdgeUrl; +import nu.marginalia.model.crawl.EdgePageWordFlags; +import nu.marginalia.converting.model.ProcessedDocument; +import nu.marginalia.model.EdgeUrl; import java.util.*; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/LinkProcessor.java b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/LinkProcessor.java similarity index 89% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/LinkProcessor.java rename to crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/LinkProcessor.java index b94f90d8..68b212de 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/LinkProcessor.java +++ b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/LinkProcessor.java @@ -1,9 +1,9 @@ -package nu.marginalia.wmsa.edge.converting.processor.logic; +package 
nu.marginalia.converting.processor.logic; -import nu.marginalia.wmsa.edge.converting.model.ProcessedDocumentDetails; -import nu.marginalia.wmsa.edge.crawling.blocklist.UrlBlocklist; -import nu.marginalia.wmsa.edge.model.EdgeDomain; -import nu.marginalia.wmsa.edge.model.EdgeUrl; +import nu.marginalia.converting.model.ProcessedDocumentDetails; +import nu.marginalia.crawling.common.blocklist.UrlBlocklist; +import nu.marginalia.model.EdgeDomain; +import nu.marginalia.model.EdgeUrl; import java.util.ArrayList; import java.util.HashSet; diff --git a/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/LshDocumentDeduplicator.java b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/LshDocumentDeduplicator.java new file mode 100644 index 00000000..877c22d3 --- /dev/null +++ b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/LshDocumentDeduplicator.java @@ -0,0 +1,60 @@ +package nu.marginalia.converting.processor.logic; + +import com.google.inject.Singleton; +import nu.marginalia.model.crawl.EdgeUrlState; +import nu.marginalia.converting.model.ProcessedDocument; +import nu.marginalia.lsh.EasyLSH; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.List; +import java.util.Set; +import java.util.stream.Collectors; + +@Singleton +public class LshDocumentDeduplicator { + + private final int DISTANCE_THRESHOLD = 4; + private final Logger logger = LoggerFactory.getLogger(getClass()); + + public void deduplicate(List documents) { + Set goodDocuments = documents.stream() + .filter(ProcessedDocument::isProcessedFully) + .collect(Collectors.toSet()); + + for (var document : documents) { + if (!goodDocuments.contains(document)) { + continue; + } + + goodDocuments.removeIf(other -> removeIfDuplicate(document, other)); + } + } + + private boolean removeIfDuplicate(ProcessedDocument thisDoc, ProcessedDocument otherDoc) { + if (thisDoc == otherDoc) + return false; + + if 
(thisDoc.words.size() < 100 + || otherDoc.words.size() < 100) { + return false; + } + + if (EasyLSH.hammingDistance(thisDoc.lshHash, otherDoc.lshHash) < DISTANCE_THRESHOLD) + return false; + + if (thisDoc.url.path.length() + < otherDoc.url.path.length()) + { + logger.info("{} duplicates {}", otherDoc.url, thisDoc.url); + + otherDoc.state = EdgeUrlState.DISQUALIFIED; + otherDoc.stateReason = "Duplicate"; + + return true; + } + + return false; + + } +} diff --git a/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/PlainTextLogic.java b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/PlainTextLogic.java new file mode 100644 index 00000000..aefa5710 --- /dev/null +++ b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/PlainTextLogic.java @@ -0,0 +1,115 @@ +package nu.marginalia.converting.processor.logic; + +import nu.marginalia.model.EdgeUrl; +import org.apache.commons.lang3.StringUtils; + +import java.util.ArrayList; +import java.util.List; +import java.util.regex.Pattern; +import java.util.stream.Collectors; + +public class PlainTextLogic { + + public String getDescription(List firstFewLines) { + return StringUtils.truncate(firstFewLines.stream().filter(this::looksLikeText) + .collect(Collectors.joining(" ")).replaceAll("\\s+", " ") + + , 255); + } + + private boolean looksLikeText(String s) { + s = s.trim(); + if (s.length() < 16) + return false; + return 4 * s.chars().filter(Character::isAlphabetic).count() > 3L * s.length(); + + } + + public String getTitle(EdgeUrl url, List firstFewLines) { + List candidates = new ArrayList<>(firstFewLines); + + // Remove mailing list header type stuff + candidates.removeIf(line -> line.contains(":")); + + for (int line = 1; line < candidates.size(); line++) { + String maybeUnderline = candidates.get(line); + if (isUnderline(maybeUnderline)) { + String titleCandidate = candidates.get(line - 1).trim(); + if (titleCandidate.length() > 16) { 
+ return StringUtils.truncate(titleCandidate, 128); + } + } + } + + for (var line : firstFewLines) { + if (isSideline(line)) { + return line.replaceAll("[^a-zA-Z0-9]+", " ").trim(); + } + } + + return url.path.substring(url.path.lastIndexOf('/')); + } + + public boolean isSideline(String s) { + + // detector for + // ==== HEADER ==== + // -style headings + + int start, end; + for (start = 0; start < s.length(); start++) { + if (!Character.isWhitespace(s.charAt(start))) break; + } + for (end = s.length() - 1; end > start; end--) { + if (!Character.isWhitespace(s.charAt(start))) break; + } + + if (end - start < 8) + return false; + + int c = s.charAt(start); + if ("=_*".indexOf(c) < 0) { + return false; + } + if (c != s.charAt(end)) { + return false; + } + + for (; start < end && s.charAt(start) == c; start++); + + if (end - start < 4) + return false; + + for (; end > start && s.charAt(end) == c; --end); + + if (end - start < 4) + return false; + + return true; + } + public boolean isUnderline(String s) { + int start, end; + for (start = 0; start < s.length(); start++) { + if (!Character.isWhitespace(s.charAt(start))) break; + } + for (end = s.length() - 1; end > start; end--) { + if (!Character.isWhitespace(s.charAt(start))) break; + } + if (end - start < 8) + return false; + + if ("=_*".indexOf(s.charAt(start)) < 0) { + return false; + } + + int c = s.charAt(start); + + for (int i = start; i < end; i++) { + if (c != s.charAt(i)) + return false; + } + + return true; + } + +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/SalientImageDetector.java b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/SalientImageDetector.java similarity index 97% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/SalientImageDetector.java rename to crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/SalientImageDetector.java index 
271ad6f2..4a3baabd 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/SalientImageDetector.java +++ b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/SalientImageDetector.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.converting.processor.logic; +package nu.marginalia.converting.processor.logic; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/SummaryExtractionFilter.java b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/SummaryExtractionFilter.java similarity index 99% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/SummaryExtractionFilter.java rename to crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/SummaryExtractionFilter.java index adafa835..942aec70 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/SummaryExtractionFilter.java +++ b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/SummaryExtractionFilter.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.converting.processor.logic; +package nu.marginalia.converting.processor.logic; import com.google.common.base.Strings; import org.apache.commons.lang3.StringUtils; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/SummaryExtractor.java b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/SummaryExtractor.java similarity index 98% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/SummaryExtractor.java rename to crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/SummaryExtractor.java index 4984125f..ce37df0a 100644 --- 
a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/SummaryExtractor.java +++ b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/SummaryExtractor.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.converting.processor.logic; +package nu.marginalia.converting.processor.logic; import com.google.inject.Inject; import com.google.inject.name.Named; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/TitleExtractor.java b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/TitleExtractor.java similarity index 91% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/TitleExtractor.java rename to crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/TitleExtractor.java index 2ce4212e..4a3293d7 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/TitleExtractor.java +++ b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/TitleExtractor.java @@ -1,8 +1,8 @@ -package nu.marginalia.wmsa.edge.converting.processor.logic; +package nu.marginalia.converting.processor.logic; import com.google.inject.Inject; import com.google.inject.name.Named; -import nu.marginalia.util.language.processing.model.DocumentLanguageData; +import nu.marginalia.language.model.DocumentLanguageData; import org.apache.commons.lang3.StringUtils; import org.jsoup.nodes.Document; diff --git a/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/pubdate/PubDateEffortLevel.java b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/pubdate/PubDateEffortLevel.java new file mode 100644 index 00000000..a69373bb --- /dev/null +++ b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/pubdate/PubDateEffortLevel.java @@ -0,0 +1,6 @@ +package 
/**
 * Effort level handed to {@code PubDateHeuristic#apply} as its first argument.
 *
 * <p>NOTE(review): presumably lets a heuristic decide how much work to spend on
 * finding a publication date; confirm against the individual heuristics.
 */
public enum PubDateEffortLevel {
    /** The lower of the two effort levels. */
    LOW,

    /** The higher of the two effort levels. */
    HIGH;
}
b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/pubdate/PubDateParser.java @@ -1,6 +1,7 @@ -package nu.marginalia.wmsa.edge.converting.processor.logic.pubdate; +package nu.marginalia.converting.processor.logic.pubdate; -import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard; +import nu.marginalia.model.crawl.EdgeHtmlStandard; +import nu.marginalia.model.crawl.PubDate; import java.time.DateTimeException; import java.time.LocalDate; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/PubDateSniffer.java b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/pubdate/PubDateSniffer.java similarity index 86% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/PubDateSniffer.java rename to crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/pubdate/PubDateSniffer.java index 25a5ece1..7eeca0d3 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/PubDateSniffer.java +++ b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/pubdate/PubDateSniffer.java @@ -1,8 +1,9 @@ -package nu.marginalia.wmsa.edge.converting.processor.logic.pubdate; +package nu.marginalia.converting.processor.logic.pubdate; -import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.heuristic.*; -import nu.marginalia.wmsa.edge.model.EdgeUrl; -import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard; +import nu.marginalia.model.crawl.EdgeHtmlStandard; +import nu.marginalia.model.crawl.PubDate; +import nu.marginalia.converting.processor.logic.pubdate.heuristic.*; +import nu.marginalia.model.EdgeUrl; import org.jsoup.nodes.Document; import java.util.ArrayList; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicDOMParsingPass1.java 
b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/pubdate/heuristic/PubDateHeuristicDOMParsingPass1.java similarity index 89% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicDOMParsingPass1.java rename to crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/pubdate/heuristic/PubDateHeuristicDOMParsingPass1.java index cc85ab2a..8d32d965 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicDOMParsingPass1.java +++ b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/pubdate/heuristic/PubDateHeuristicDOMParsingPass1.java @@ -1,11 +1,11 @@ -package nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.heuristic; +package nu.marginalia.converting.processor.logic.pubdate.heuristic; -import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDate; -import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateEffortLevel; -import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateHeuristic; -import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateParser; -import nu.marginalia.wmsa.edge.model.EdgeUrl; -import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard; +import nu.marginalia.model.crawl.EdgeHtmlStandard; +import nu.marginalia.model.crawl.PubDate; +import nu.marginalia.converting.processor.logic.pubdate.PubDateHeuristic; +import nu.marginalia.converting.processor.logic.pubdate.PubDateParser; +import nu.marginalia.model.EdgeUrl; +import nu.marginalia.converting.processor.logic.pubdate.PubDateEffortLevel; import org.jetbrains.annotations.NotNull; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicDOMParsingPass2.java 
b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/pubdate/heuristic/PubDateHeuristicDOMParsingPass2.java similarity index 86% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicDOMParsingPass2.java rename to crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/pubdate/heuristic/PubDateHeuristicDOMParsingPass2.java index 264f9eb1..4692153d 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicDOMParsingPass2.java +++ b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/pubdate/heuristic/PubDateHeuristicDOMParsingPass2.java @@ -1,11 +1,11 @@ -package nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.heuristic; +package nu.marginalia.converting.processor.logic.pubdate.heuristic; -import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDate; -import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateEffortLevel; -import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateHeuristic; -import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateParser; -import nu.marginalia.wmsa.edge.model.EdgeUrl; -import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard; +import nu.marginalia.model.crawl.EdgeHtmlStandard; +import nu.marginalia.model.crawl.PubDate; +import nu.marginalia.converting.processor.logic.pubdate.PubDateEffortLevel; +import nu.marginalia.converting.processor.logic.pubdate.PubDateHeuristic; +import nu.marginalia.converting.processor.logic.pubdate.PubDateParser; +import nu.marginalia.model.EdgeUrl; import org.jetbrains.annotations.NotNull; import org.jsoup.nodes.Document; import org.jsoup.nodes.Node; diff --git a/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/pubdate/heuristic/PubDateHeuristicGuessFromHtmlStandard.java 
b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/pubdate/heuristic/PubDateHeuristicGuessFromHtmlStandard.java new file mode 100644 index 00000000..da44e3fa --- /dev/null +++ b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/pubdate/heuristic/PubDateHeuristicGuessFromHtmlStandard.java @@ -0,0 +1,23 @@ +package nu.marginalia.converting.processor.logic.pubdate.heuristic; + +import nu.marginalia.model.crawl.EdgeHtmlStandard; +import nu.marginalia.model.crawl.PubDate; +import nu.marginalia.converting.processor.logic.pubdate.PubDateEffortLevel; +import nu.marginalia.converting.processor.logic.pubdate.PubDateHeuristic; +import nu.marginalia.converting.processor.logic.pubdate.PubDateParser; +import nu.marginalia.model.EdgeUrl; +import org.jsoup.nodes.Document; + +import java.util.Optional; + +public class PubDateHeuristicGuessFromHtmlStandard implements PubDateHeuristic { + + @Override + public Optional apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, EdgeHtmlStandard htmlStandard) { + if (htmlStandard == EdgeHtmlStandard.UNKNOWN) + return Optional.empty(); + + return Optional.of(new PubDate(null, PubDateParser.guessYear(htmlStandard))); + } + +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicHtml5AnyTimeTag.java b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/pubdate/heuristic/PubDateHeuristicHtml5AnyTimeTag.java similarity index 60% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicHtml5AnyTimeTag.java rename to crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/pubdate/heuristic/PubDateHeuristicHtml5AnyTimeTag.java index d20ed246..ca0220ae 100644 --- 
a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicHtml5AnyTimeTag.java +++ b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/pubdate/heuristic/PubDateHeuristicHtml5AnyTimeTag.java @@ -1,11 +1,11 @@ -package nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.heuristic; +package nu.marginalia.converting.processor.logic.pubdate.heuristic; -import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDate; -import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateEffortLevel; -import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateHeuristic; -import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateParser; -import nu.marginalia.wmsa.edge.model.EdgeUrl; -import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard; +import nu.marginalia.model.crawl.EdgeHtmlStandard; +import nu.marginalia.model.crawl.PubDate; +import nu.marginalia.converting.processor.logic.pubdate.PubDateHeuristic; +import nu.marginalia.converting.processor.logic.pubdate.PubDateParser; +import nu.marginalia.model.EdgeUrl; +import nu.marginalia.converting.processor.logic.pubdate.PubDateEffortLevel; import org.jsoup.nodes.Document; import java.util.Optional; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicHtml5ArticleDateTag.java b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/pubdate/heuristic/PubDateHeuristicHtml5ArticleDateTag.java similarity index 54% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicHtml5ArticleDateTag.java rename to crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/pubdate/heuristic/PubDateHeuristicHtml5ArticleDateTag.java index 78c54b9a..c63e15b1 100644 --- 
a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicHtml5ArticleDateTag.java +++ b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/pubdate/heuristic/PubDateHeuristicHtml5ArticleDateTag.java @@ -1,11 +1,11 @@ -package nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.heuristic; +package nu.marginalia.converting.processor.logic.pubdate.heuristic; -import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDate; -import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateEffortLevel; -import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateHeuristic; -import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateParser; -import nu.marginalia.wmsa.edge.model.EdgeUrl; -import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard; +import nu.marginalia.model.crawl.EdgeHtmlStandard; +import nu.marginalia.model.crawl.PubDate; +import nu.marginalia.converting.processor.logic.pubdate.PubDateHeuristic; +import nu.marginalia.converting.processor.logic.pubdate.PubDateParser; +import nu.marginalia.model.EdgeUrl; +import nu.marginalia.converting.processor.logic.pubdate.PubDateEffortLevel; import org.jsoup.nodes.Document; import java.util.Optional; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicHtml5ItempropDateTag.java b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/pubdate/heuristic/PubDateHeuristicHtml5ItempropDateTag.java similarity index 54% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicHtml5ItempropDateTag.java rename to crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/pubdate/heuristic/PubDateHeuristicHtml5ItempropDateTag.java index 8dec0f6a..5b2b7034 100644 --- 
a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicHtml5ItempropDateTag.java +++ b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/pubdate/heuristic/PubDateHeuristicHtml5ItempropDateTag.java @@ -1,11 +1,11 @@ -package nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.heuristic; +package nu.marginalia.converting.processor.logic.pubdate.heuristic; -import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDate; -import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateEffortLevel; -import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateHeuristic; -import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateParser; -import nu.marginalia.wmsa.edge.model.EdgeUrl; -import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard; +import nu.marginalia.model.crawl.EdgeHtmlStandard; +import nu.marginalia.model.crawl.PubDate; +import nu.marginalia.converting.processor.logic.pubdate.PubDateHeuristic; +import nu.marginalia.converting.processor.logic.pubdate.PubDateParser; +import nu.marginalia.model.EdgeUrl; +import nu.marginalia.converting.processor.logic.pubdate.PubDateEffortLevel; import org.jsoup.nodes.Document; import java.util.Optional; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicJSONLD.java b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/pubdate/heuristic/PubDateHeuristicJSONLD.java similarity index 70% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicJSONLD.java rename to crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/pubdate/heuristic/PubDateHeuristicJSONLD.java index 2187a744..aedb0611 100644 --- 
a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicJSONLD.java +++ b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/pubdate/heuristic/PubDateHeuristicJSONLD.java @@ -1,14 +1,14 @@ -package nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.heuristic; +package nu.marginalia.converting.processor.logic.pubdate.heuristic; import com.google.gson.Gson; import com.google.gson.GsonBuilder; import com.google.gson.JsonSyntaxException; -import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDate; -import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateEffortLevel; -import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateHeuristic; -import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateParser; -import nu.marginalia.wmsa.edge.model.EdgeUrl; -import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard; +import nu.marginalia.model.crawl.EdgeHtmlStandard; +import nu.marginalia.model.crawl.PubDate; +import nu.marginalia.converting.processor.logic.pubdate.PubDateHeuristic; +import nu.marginalia.converting.processor.logic.pubdate.PubDateParser; +import nu.marginalia.model.EdgeUrl; +import nu.marginalia.converting.processor.logic.pubdate.PubDateEffortLevel; import org.jsoup.nodes.Document; import java.util.Optional; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicLastModified.java b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/pubdate/heuristic/PubDateHeuristicLastModified.java similarity index 57% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicLastModified.java rename to crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/pubdate/heuristic/PubDateHeuristicLastModified.java index 5a47c9df..f7ed3af9 
100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicLastModified.java +++ b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/pubdate/heuristic/PubDateHeuristicLastModified.java @@ -1,11 +1,11 @@ -package nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.heuristic; +package nu.marginalia.converting.processor.logic.pubdate.heuristic; -import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDate; -import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateEffortLevel; -import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateHeuristic; -import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateParser; -import nu.marginalia.wmsa.edge.model.EdgeUrl; -import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard; +import nu.marginalia.model.crawl.EdgeHtmlStandard; +import nu.marginalia.model.crawl.PubDate; +import nu.marginalia.converting.processor.logic.pubdate.PubDateHeuristic; +import nu.marginalia.converting.processor.logic.pubdate.PubDateParser; +import nu.marginalia.model.EdgeUrl; +import nu.marginalia.converting.processor.logic.pubdate.PubDateEffortLevel; import org.jsoup.nodes.Document; import java.util.Optional; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicMicrodata.java b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/pubdate/heuristic/PubDateHeuristicMicrodata.java similarity index 53% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicMicrodata.java rename to crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/pubdate/heuristic/PubDateHeuristicMicrodata.java index a257bba2..75de4a71 100644 --- 
a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicMicrodata.java +++ b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/pubdate/heuristic/PubDateHeuristicMicrodata.java @@ -1,11 +1,11 @@ -package nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.heuristic; +package nu.marginalia.converting.processor.logic.pubdate.heuristic; -import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDate; -import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateEffortLevel; -import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateHeuristic; -import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateParser; -import nu.marginalia.wmsa.edge.model.EdgeUrl; -import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard; +import nu.marginalia.model.crawl.EdgeHtmlStandard; +import nu.marginalia.model.crawl.PubDate; +import nu.marginalia.converting.processor.logic.pubdate.PubDateHeuristic; +import nu.marginalia.converting.processor.logic.pubdate.PubDateParser; +import nu.marginalia.model.EdgeUrl; +import nu.marginalia.converting.processor.logic.pubdate.PubDateEffortLevel; import org.jsoup.nodes.Document; import java.util.Optional; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicOpenGraph.java b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/pubdate/heuristic/PubDateHeuristicOpenGraph.java similarity index 54% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicOpenGraph.java rename to crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/pubdate/heuristic/PubDateHeuristicOpenGraph.java index bd9b66a9..6ddd78d8 100644 --- 
a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicOpenGraph.java +++ b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/pubdate/heuristic/PubDateHeuristicOpenGraph.java @@ -1,11 +1,11 @@ -package nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.heuristic; +package nu.marginalia.converting.processor.logic.pubdate.heuristic; -import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDate; -import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateEffortLevel; -import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateHeuristic; -import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateParser; -import nu.marginalia.wmsa.edge.model.EdgeUrl; -import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard; +import nu.marginalia.model.crawl.EdgeHtmlStandard; +import nu.marginalia.model.crawl.PubDate; +import nu.marginalia.converting.processor.logic.pubdate.PubDateEffortLevel; +import nu.marginalia.converting.processor.logic.pubdate.PubDateHeuristic; +import nu.marginalia.converting.processor.logic.pubdate.PubDateParser; +import nu.marginalia.model.EdgeUrl; import org.jsoup.nodes.Document; import java.util.Optional; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicRDFaTag.java b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/pubdate/heuristic/PubDateHeuristicRDFaTag.java similarity index 53% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicRDFaTag.java rename to crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/pubdate/heuristic/PubDateHeuristicRDFaTag.java index 2618cdef..59f8e08d 100644 --- 
a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicRDFaTag.java +++ b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/pubdate/heuristic/PubDateHeuristicRDFaTag.java @@ -1,11 +1,11 @@ -package nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.heuristic; +package nu.marginalia.converting.processor.logic.pubdate.heuristic; -import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDate; -import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateEffortLevel; -import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateHeuristic; -import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateParser; -import nu.marginalia.wmsa.edge.model.EdgeUrl; -import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard; +import nu.marginalia.model.crawl.EdgeHtmlStandard; +import nu.marginalia.model.crawl.PubDate; +import nu.marginalia.converting.processor.logic.pubdate.PubDateEffortLevel; +import nu.marginalia.converting.processor.logic.pubdate.PubDateHeuristic; +import nu.marginalia.converting.processor.logic.pubdate.PubDateParser; +import nu.marginalia.model.EdgeUrl; import org.jsoup.nodes.Document; import java.util.Optional; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicUrlPatternPass1.java b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/pubdate/heuristic/PubDateHeuristicUrlPatternPass1.java similarity index 70% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicUrlPatternPass1.java rename to crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/pubdate/heuristic/PubDateHeuristicUrlPatternPass1.java index 70b19ad0..2756c089 100644 --- 
a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicUrlPatternPass1.java +++ b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/pubdate/heuristic/PubDateHeuristicUrlPatternPass1.java @@ -1,11 +1,11 @@ -package nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.heuristic; +package nu.marginalia.converting.processor.logic.pubdate.heuristic; -import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDate; -import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateEffortLevel; -import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateHeuristic; -import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateParser; -import nu.marginalia.wmsa.edge.model.EdgeUrl; -import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard; +import nu.marginalia.model.crawl.EdgeHtmlStandard; +import nu.marginalia.model.crawl.PubDate; +import nu.marginalia.converting.processor.logic.pubdate.PubDateHeuristic; +import nu.marginalia.converting.processor.logic.pubdate.PubDateParser; +import nu.marginalia.model.EdgeUrl; +import nu.marginalia.converting.processor.logic.pubdate.PubDateEffortLevel; import org.jsoup.nodes.Document; import java.util.Optional; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicUrlPatternPass2.java b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/pubdate/heuristic/PubDateHeuristicUrlPatternPass2.java similarity index 67% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicUrlPatternPass2.java rename to crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/pubdate/heuristic/PubDateHeuristicUrlPatternPass2.java index 19aceecd..6432d9c3 100644 --- 
a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicUrlPatternPass2.java +++ b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/pubdate/heuristic/PubDateHeuristicUrlPatternPass2.java @@ -1,11 +1,11 @@ -package nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.heuristic; +package nu.marginalia.converting.processor.logic.pubdate.heuristic; -import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDate; -import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateEffortLevel; -import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateHeuristic; -import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateParser; -import nu.marginalia.wmsa.edge.model.EdgeUrl; -import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard; +import nu.marginalia.model.crawl.EdgeHtmlStandard; +import nu.marginalia.model.crawl.PubDate; +import nu.marginalia.converting.processor.logic.pubdate.PubDateHeuristic; +import nu.marginalia.converting.processor.logic.pubdate.PubDateParser; +import nu.marginalia.model.EdgeUrl; +import nu.marginalia.converting.processor.logic.pubdate.PubDateEffortLevel; import org.jsoup.nodes.Document; import java.util.Optional; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/topic/AdblockSimulator.java b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/topic/AdblockSimulator.java similarity index 97% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/topic/AdblockSimulator.java rename to crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/topic/AdblockSimulator.java index 199e05bc..62c4b778 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/topic/AdblockSimulator.java +++ 
b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/topic/AdblockSimulator.java @@ -1,8 +1,8 @@ -package nu.marginalia.wmsa.edge.converting.processor.logic.topic; +package nu.marginalia.converting.processor.logic.topic; import com.google.inject.Inject; import com.google.inject.Singleton; -import nu.marginalia.wmsa.configuration.WmsaHome; +import nu.marginalia.WmsaHome; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.nodes.Node; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/topic/GoogleAnwersSpamDetector.java b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/topic/GoogleAnwersSpamDetector.java similarity index 92% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/topic/GoogleAnwersSpamDetector.java rename to crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/topic/GoogleAnwersSpamDetector.java index 75e7fea3..dc0c4eed 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/topic/GoogleAnwersSpamDetector.java +++ b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/topic/GoogleAnwersSpamDetector.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.converting.processor.logic.topic; +package nu.marginalia.converting.processor.logic.topic; import org.jsoup.nodes.Document; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/topic/RecipeDetector.java b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/topic/RecipeDetector.java similarity index 98% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/topic/RecipeDetector.java rename to crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/topic/RecipeDetector.java index 74122799..29dea927 100644 --- 
a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/topic/RecipeDetector.java +++ b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/topic/RecipeDetector.java @@ -1,8 +1,8 @@ -package nu.marginalia.wmsa.edge.converting.processor.logic.topic; +package nu.marginalia.converting.processor.logic.topic; import ca.rmen.porterstemmer.PorterStemmer; import com.google.inject.Inject; -import nu.marginalia.util.language.processing.model.DocumentLanguageData; +import nu.marginalia.language.model.DocumentLanguageData; import java.util.HashMap; import java.util.Map; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/topic/TextileCraftDetector.java b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/topic/TextileCraftDetector.java similarity index 97% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/topic/TextileCraftDetector.java rename to crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/topic/TextileCraftDetector.java index 1df3b8ee..771d1491 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/topic/TextileCraftDetector.java +++ b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/topic/TextileCraftDetector.java @@ -1,8 +1,8 @@ -package nu.marginalia.wmsa.edge.converting.processor.logic.topic; +package nu.marginalia.converting.processor.logic.topic; import ca.rmen.porterstemmer.PorterStemmer; import com.google.inject.Inject; -import nu.marginalia.util.language.processing.model.DocumentLanguageData; +import nu.marginalia.language.model.DocumentLanguageData; import java.util.HashMap; import java.util.Map; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/topic/WoodworkingDetector.java 
b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/topic/WoodworkingDetector.java similarity index 97% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/topic/WoodworkingDetector.java rename to crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/topic/WoodworkingDetector.java index e58320f6..fd9be203 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/topic/WoodworkingDetector.java +++ b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/topic/WoodworkingDetector.java @@ -1,8 +1,8 @@ -package nu.marginalia.wmsa.edge.converting.processor.logic.topic; +package nu.marginalia.converting.processor.logic.topic; import ca.rmen.porterstemmer.PorterStemmer; import com.google.inject.Inject; -import nu.marginalia.util.language.processing.model.DocumentLanguageData; +import nu.marginalia.language.model.DocumentLanguageData; import java.util.HashMap; import java.util.Map; diff --git a/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/AbstractDocumentProcessorPlugin.java b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/AbstractDocumentProcessorPlugin.java new file mode 100644 index 00000000..89c8afb6 --- /dev/null +++ b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/AbstractDocumentProcessorPlugin.java @@ -0,0 +1,88 @@ +package nu.marginalia.converting.processor.plugin; + +import nu.marginalia.crawling.model.CrawledDocument; +import nu.marginalia.crawling.model.CrawledDomain; +import nu.marginalia.language.LanguageFilter; +import nu.marginalia.language.model.DocumentLanguageData; +import nu.marginalia.model.crawl.EdgeHtmlStandard; +import nu.marginalia.model.crawl.EdgePageWords; +import nu.marginalia.model.crawl.PubDate; +import nu.marginalia.converting.model.DisqualifiedException; +import 
nu.marginalia.converting.model.ProcessedDocumentDetails; +import nu.marginalia.model.crawl.HtmlFeature; +import nu.marginalia.model.EdgeUrl; + +import java.net.URISyntaxException; +import java.util.*; + +public abstract class AbstractDocumentProcessorPlugin { + protected LanguageFilter languageFilter = new LanguageFilter(); + + public abstract DetailsWithWords createDetails(CrawledDomain crawledDomain, CrawledDocument crawledDocument) throws DisqualifiedException, URISyntaxException; + public abstract boolean isApplicable(CrawledDocument doc); + + protected void checkDocumentLanguage(DocumentLanguageData dld) throws DisqualifiedException { + double languageAgreement = languageFilter.dictionaryAgreement(dld); + if (languageAgreement < 0.1) { + throw new DisqualifiedException(DisqualifiedException.DisqualificationReason.LANGUAGE); + } + } + + protected static class MetaTagsBuilder { + private final Set tagWords = new HashSet<>(); + + public void build(EdgePageWords dest) { + dest.addAllSyntheticTerms(tagWords); + } + + public MetaTagsBuilder addDomainCrawlData(CrawledDomain domain) { + if (domain.ip != null) { + tagWords.add("ip:" + domain.ip.toLowerCase()); // lower case because IPv6 is hexadecimal + } + return this; + } + + public MetaTagsBuilder addUrl(EdgeUrl url) { + tagWords.add("proto:"+url.proto.toLowerCase()); + + var edgeDomain = url.domain; + + tagWords.add("site:" + edgeDomain.toString().toLowerCase()); + if (!Objects.equals(edgeDomain.toString(), edgeDomain.domain)) { + tagWords.add("site:" + edgeDomain.domain.toLowerCase()); + } + + tagWords.add("tld:" + edgeDomain.getTld()); + return this; + } + + public MetaTagsBuilder addFormat(EdgeHtmlStandard standard) { + tagWords.add("format:"+standard.toString().toLowerCase()); + return this; + } + + public MetaTagsBuilder addFeatures(Set features) { + features.stream().map(HtmlFeature::getKeyword).forEach(tagWords::add); + + tagWords.add("js:" + 
Boolean.toString(features.contains(HtmlFeature.JS)).toLowerCase()); + + return this; + } + public MetaTagsBuilder addPubDate(PubDate pubDate) { + + if (pubDate.year() > 1900) { + tagWords.add("year:" + pubDate.year()); + } + if (pubDate.dateIso8601() != null) { + tagWords.add("pub:" + pubDate.dateIso8601()); + } + + return this; + } + + } + + + public record DetailsWithWords(ProcessedDocumentDetails details, + EdgePageWords words) {} +} diff --git a/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java new file mode 100644 index 00000000..5ea3c932 --- /dev/null +++ b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java @@ -0,0 +1,270 @@ +package nu.marginalia.converting.processor.plugin; + +import com.google.inject.Inject; +import com.google.inject.name.Named; +import nu.marginalia.crawling.common.link.LinkParser; +import nu.marginalia.crawling.model.CrawledDocument; +import nu.marginalia.crawling.model.CrawledDomain; +import nu.marginalia.language.model.KeywordMetadata; +import nu.marginalia.converting.processor.keywords.DocumentKeywordExtractor; +import nu.marginalia.language.sentence.SentenceExtractor; +import nu.marginalia.model.crawl.EdgeHtmlStandard; +import nu.marginalia.model.crawl.EdgePageDocumentFlags; +import nu.marginalia.model.crawl.EdgePageWords; +import nu.marginalia.model.idx.EdgePageDocumentsMetadata; +import nu.marginalia.language.model.DocumentLanguageData; +import nu.marginalia.converting.processor.logic.*; +import nu.marginalia.model.crawl.PubDate; +import nu.marginalia.converting.processor.logic.pubdate.PubDateSniffer; +import nu.marginalia.gregex.GuardedRegex; +import nu.marginalia.gregex.GuardedRegexFactory; +import nu.marginalia.converting.model.DisqualifiedException; +import 
nu.marginalia.converting.model.ProcessedDocumentDetails; +import nu.marginalia.model.EdgeDomain; +import nu.marginalia.model.EdgeUrl; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; + +import java.net.URISyntaxException; +import java.nio.file.Path; +import java.util.EnumSet; +import java.util.HashSet; +import java.util.Set; + + +public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin { + + private final int minDocumentLength; + private final double minDocumentQuality; + + private final SentenceExtractor sentenceExtractor; + private final FeatureExtractor featureExtractor; + private final TitleExtractor titleExtractor; + private final DocumentKeywordExtractor keywordExtractor; + private final SummaryExtractor summaryExtractor; + private final PubDateSniffer pubDateSniffer; + + private static final DocumentValuator documentValuator = new DocumentValuator(); + + private static final LinkParser linkParser = new LinkParser(); + private static final FeedExtractor feedExtractor = new FeedExtractor(linkParser); + + @Inject + public HtmlDocumentProcessorPlugin(@Named("min-document-length") Integer minDocumentLength, + @Named("min-document-quality") Double minDocumentQuality, + SentenceExtractor sentenceExtractor, + FeatureExtractor featureExtractor, + TitleExtractor titleExtractor, + DocumentKeywordExtractor keywordExtractor, + SummaryExtractor summaryExtractor, + PubDateSniffer pubDateSniffer) { + this.minDocumentLength = minDocumentLength; + this.minDocumentQuality = minDocumentQuality; + this.sentenceExtractor = sentenceExtractor; + this.featureExtractor = featureExtractor; + + this.titleExtractor = titleExtractor; + this.keywordExtractor = keywordExtractor; + this.summaryExtractor = summaryExtractor; + this.pubDateSniffer = pubDateSniffer; + } + + @Override + public boolean isApplicable(CrawledDocument doc) { + return doc.contentType.toLowerCase().contains("html"); + } + + @Override + public DetailsWithWords 
createDetails(CrawledDomain crawledDomain, CrawledDocument crawledDocument) + throws DisqualifiedException, URISyntaxException { + + String documentBody = crawledDocument.documentBody.decode(); + + if (languageFilter.isBlockedUnicodeRange(documentBody)) { + throw new DisqualifiedException(DisqualifiedException.DisqualificationReason.LANGUAGE); + } + + Document doc = Jsoup.parse(documentBody); + + if (doc.select("meta[name=robots]").attr("content").contains("noindex")) { + throw new DisqualifiedException(DisqualifiedException.DisqualificationReason.FORBIDDEN); + } + + final EdgeUrl url = new EdgeUrl(crawledDocument.url); + + Document prunedDoc = doc.clone(); + + prunedDoc.getElementsByTag("svg").remove(); + prunedDoc.body().filter(new DomPruningFilter(0.5)); + + var dld = sentenceExtractor.extractSentences(prunedDoc); + + checkDocumentLanguage(dld); + + var ret = new ProcessedDocumentDetails(); + + ret.length = getLength(doc); + ret.standard = getHtmlStandard(doc); + ret.title = titleExtractor.getTitleAbbreviated(doc, dld, crawledDocument.url); + ret.quality = documentValuator.getQuality(crawledDocument, ret.standard, doc, dld); + + // don't move this up! 
it uses title and quality + // and is run before the heavy computations below + if (isDisqualified(url, dld, ret)) { + throw new DisqualifiedException(DisqualifiedException.DisqualificationReason.QUALITY); + } + + KeywordMetadata keywordMetadata = new KeywordMetadata(); + + ret.features = featureExtractor.getFeatures(crawledDomain, doc, dld); + ret.description = getDescription(doc); + ret.hashCode = dld.localitySensitiveHashCode(); + + PubDate pubDate = pubDateSniffer.getPubDate(crawledDocument.headers, url, doc, ret.standard, true); + ret.metadata = new EdgePageDocumentsMetadata(url.depth(), pubDate.yearByte(), 0, (int) -ret.quality, EnumSet.noneOf(EdgePageDocumentFlags.class)); + + EdgePageWords words = keywordExtractor.extractKeywords(dld, keywordMetadata); + + new MetaTagsBuilder() + .addDomainCrawlData(crawledDomain) + .addPubDate(pubDate) + .addUrl(url) + .addFeatures(ret.features) + .addFormat(ret.standard) + .build(words); + + getLinks(url, ret, doc, words); + + if (pubDate.hasYear()) { + ret.pubYear = pubDate.year(); + } + + return new DetailsWithWords(ret, words); + } + + + private static final GuardedRegex mastodonFeedRegex = GuardedRegexFactory.startsWith("/@", "^/@[^/]+/?$"); + + private boolean isDisqualified(EdgeUrl url, DocumentLanguageData dld, ProcessedDocumentDetails ret) { + if (ret.quality < minDocumentQuality) { + return true; + } + if (dld.totalNumWords() < minDocumentLength) { + return true; + } + // These pages shouldn't be publicly accessible + if ("phpinfo()".equals(ret.title)) { + return true; + } + + // Urls that look like /@foo are typically Mastodon or other twitter-like feeds, + // we don't want to index them because they change so rapidly; subdirectories are + // fine though + // + if (mastodonFeedRegex.test(url.path)) { + return true; + } + + // Annoying wordpress crap + if (url.path.startsWith("/tag/") && url.path.endsWith("/")) { + return true; + } + return false; + } + + + private void getLinks(EdgeUrl baseUrl, 
ProcessedDocumentDetails ret, Document doc, EdgePageWords words) { + + final LinkProcessor lp = new LinkProcessor(ret, baseUrl); + + baseUrl = linkParser.getBaseLink(doc, baseUrl); + + EdgeDomain domain = baseUrl.domain; + + for (var atag : doc.getElementsByTag("a")) { + var linkOpt = linkParser.parseLinkPermissive(baseUrl, atag); + if (linkParser.shouldIndexLink(atag)) { + linkOpt.ifPresent(lp::accept); + } + else { + linkOpt + .filter(url -> linkParser.hasBinarySuffix(url.path.toLowerCase())) + .ifPresent(lp::acceptNonIndexable); + } + } + for (var frame : doc.getElementsByTag("frame")) { + linkParser.parseFrame(baseUrl, frame).ifPresent(lp::accept); + } + for (var frame : doc.getElementsByTag("iframe")) { + linkParser.parseFrame(baseUrl, frame).ifPresent(lp::accept); + } + for (var link : doc.select("link[rel=alternate]")) { + feedExtractor + .getFeedFromAlternateTag(baseUrl, link) + .ifPresent(lp::acceptFeed); + } + + createLinkKeywords(words, lp); + createFileLinkKeywords(words, lp, domain); + } + + private void createFileLinkKeywords(EdgePageWords words, LinkProcessor lp, EdgeDomain domain) { + Set fileKeywords = new HashSet<>(100); + for (var link : lp.getNonIndexableUrls()) { + + if (!domain.hasSameTopDomain(link.domain)) { + continue; + } + + synthesizeFilenameKeyword(fileKeywords, link); + + } + + words.addAllSyntheticTerms(fileKeywords); + } + + private void synthesizeFilenameKeyword(Set fileKeywords, EdgeUrl link) { + + Path pFilename = Path.of(link.path.toLowerCase()).getFileName(); + + if (pFilename == null) return; + + String filename = pFilename.toString(); + if (filename.length() > 32 + || filename.endsWith(".xml") + || filename.endsWith(".jpg") + || filename.endsWith(".png") + || filename.endsWith(".pdf") + || filename.endsWith(".gif")) + return; + + fileKeywords.add(filename.replace(' ', '_')); + } + + private void createLinkKeywords(EdgePageWords words, LinkProcessor lp) { + final Set linkTerms = new HashSet<>(); + + for (var fd : 
lp.getForeignDomains()) { + linkTerms.add("links:"+fd.toString().toLowerCase()); + linkTerms.add("links:"+fd.getDomain().toLowerCase()); + } + words.addAllSyntheticTerms(linkTerms); + } + + private EdgeHtmlStandard getHtmlStandard(Document doc) { + EdgeHtmlStandard htmlStandard = HtmlStandardExtractor.parseDocType(doc.documentType()); + + if (EdgeHtmlStandard.UNKNOWN.equals(htmlStandard)) { + return HtmlStandardExtractor.sniffHtmlStandard(doc); + } + return htmlStandard; + } + + private String getDescription(Document doc) { + return summaryExtractor.extractSummary(doc); + } + + private int getLength(Document doc) { + return doc.text().length(); + } +} diff --git a/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/PlainTextDocumentProcessorPlugin.java b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/PlainTextDocumentProcessorPlugin.java new file mode 100644 index 00000000..b85092d6 --- /dev/null +++ b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/PlainTextDocumentProcessorPlugin.java @@ -0,0 +1,118 @@ +package nu.marginalia.converting.processor.plugin; + +import com.google.inject.Inject; +import com.google.inject.name.Named; +import nu.marginalia.crawling.model.CrawledDocument; +import nu.marginalia.crawling.model.CrawledDomain; +import nu.marginalia.language.model.KeywordMetadata; +import nu.marginalia.converting.processor.keywords.DocumentKeywordExtractor; +import nu.marginalia.language.sentence.SentenceExtractor; +import nu.marginalia.model.crawl.EdgeHtmlStandard; +import nu.marginalia.model.crawl.EdgePageDocumentFlags; +import nu.marginalia.model.crawl.EdgePageWords; +import nu.marginalia.model.idx.EdgePageDocumentsMetadata; +import nu.marginalia.model.crawl.PubDate; +import nu.marginalia.converting.model.DisqualifiedException; +import nu.marginalia.converting.model.ProcessedDocumentDetails; +import nu.marginalia.converting.processor.logic.PlainTextLogic; +import 
nu.marginalia.model.EdgeUrl; +import nu.marginalia.util.LineUtils; +import org.apache.commons.lang3.StringUtils; + +import java.net.URISyntaxException; +import java.time.LocalDate; +import java.util.ArrayList; +import java.util.EnumSet; +import java.util.HashSet; +import java.util.List; + + +public class PlainTextDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin { + + private final int minDocumentLength; + private final int maxTitleLength; + private final SentenceExtractor sentenceExtractor; + private final DocumentKeywordExtractor keywordExtractor; + private final PlainTextLogic plainTextLogic = new PlainTextLogic(); + + + @Inject + public PlainTextDocumentProcessorPlugin(@Named("min-document-length") Integer minDocumentLength, + @Named("max-title-length") Integer maxTitleLength, + SentenceExtractor sentenceExtractor, + DocumentKeywordExtractor keywordExtractor) + { + this.minDocumentLength = minDocumentLength; + this.maxTitleLength = maxTitleLength; + this.sentenceExtractor = sentenceExtractor; + this.keywordExtractor = keywordExtractor; + } + + @Override + public boolean isApplicable(CrawledDocument doc) { + return doc.contentType.equalsIgnoreCase("text/plain"); + } + + @Override + public DetailsWithWords createDetails(CrawledDomain crawledDomain, CrawledDocument crawledDocument) + throws DisqualifiedException, URISyntaxException { + + String documentBody = crawledDocument.documentBody.decode(); + + if (languageFilter.isBlockedUnicodeRange(documentBody)) { + throw new DisqualifiedException(DisqualifiedException.DisqualificationReason.LANGUAGE); + } + + final EdgeUrl url = new EdgeUrl(crawledDocument.url); + + var dld = sentenceExtractor.extractSentences(documentBody, ""); + + checkDocumentLanguage(dld); + + if (dld.totalNumWords() < minDocumentLength) { + throw new DisqualifiedException(DisqualifiedException.DisqualificationReason.LENGTH); + } + + var ret = new ProcessedDocumentDetails(); + + List firstFewLines = LineUtils.firstNLines(documentBody, 
40); + + ret.length = documentBody.length(); + ret.standard = EdgeHtmlStandard.PLAIN; + ret.title = StringUtils.truncate(plainTextLogic.getTitle(url, firstFewLines), maxTitleLength); + + ret.quality = -1; + + ret.features = new HashSet<>(); + ret.description = StringUtils.truncate(plainTextLogic.getDescription(firstFewLines), 255); + ret.hashCode = dld.localitySensitiveHashCode(); + + final PubDate pubDate = new PubDate(LocalDate.ofYearDay(1993, 1)); + + ret.metadata = new EdgePageDocumentsMetadata(url.depth(), pubDate.yearByte(), 0, (int) -ret.quality, EnumSet.of(EdgePageDocumentFlags.PlainText)); + + KeywordMetadata keywordMetadata = new KeywordMetadata(); + EdgePageWords words = keywordExtractor.extractKeywords(dld, keywordMetadata); + + new MetaTagsBuilder() + .addDomainCrawlData(crawledDomain) + .addPubDate(pubDate) + .addUrl(url) + .addFeatures(ret.features) + .addFormat(ret.standard) + .build(words); + + if (pubDate.hasYear()) { + ret.pubYear = pubDate.year(); + } + + /* These are assumed to be populated */ + ret.linksInternal = new ArrayList<>(); + ret.linksExternal = new ArrayList<>(); + ret.feedLinks = new ArrayList<>(); + + return new DetailsWithWords(ret, words); + } + + +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/language/DocumentDebugger.java b/crawl/converting-process/src/main/java/nu/marginalia/converting/tool/DocumentDebugger.java similarity index 86% rename from marginalia_nu/src/main/java/nu/marginalia/util/language/DocumentDebugger.java rename to crawl/converting-process/src/main/java/nu/marginalia/converting/tool/DocumentDebugger.java index a693dcdc..0d2fb906 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/language/DocumentDebugger.java +++ b/crawl/converting-process/src/main/java/nu/marginalia/converting/tool/DocumentDebugger.java @@ -1,14 +1,14 @@ -package nu.marginalia.util.language; +package nu.marginalia.converting.tool; -import nu.marginalia.util.language.conf.LanguageModels; -import 
nu.marginalia.util.language.processing.KeywordCounter; -import nu.marginalia.util.language.processing.KeywordExtractor; -import nu.marginalia.util.language.processing.NameCounter; -import nu.marginalia.util.language.processing.sentence.SentenceExtractor; -import nu.marginalia.util.language.processing.model.DocumentSentence; -import nu.marginalia.util.language.processing.model.WordRep; -import nu.marginalia.util.language.processing.model.tag.WordSeparator; -import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict; +import nu.marginalia.LanguageModels; +import nu.marginalia.converting.processor.keywords.KeywordCounter; +import nu.marginalia.converting.processor.keywords.NameCounter; +import nu.marginalia.language.keywords.KeywordExtractor; +import nu.marginalia.language.sentence.SentenceExtractor; +import nu.marginalia.language.model.DocumentSentence; +import nu.marginalia.language.model.WordRep; +import nu.marginalia.language.model.WordSeparator; +import nu.marginalia.language.statistics.TermFrequencyDict; import org.jsoup.nodes.Document; import java.io.FileNotFoundException; diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/processor/logic/DomPruningFilterTest.java b/crawl/converting-process/src/test/java/nu/marginalia/converting/logic/DomPruningFilterTest.java similarity index 72% rename from marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/processor/logic/DomPruningFilterTest.java rename to crawl/converting-process/src/test/java/nu/marginalia/converting/logic/DomPruningFilterTest.java index 957f6306..9e3b0684 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/processor/logic/DomPruningFilterTest.java +++ b/crawl/converting-process/src/test/java/nu/marginalia/converting/logic/DomPruningFilterTest.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.converting.processor.logic; +package nu.marginalia.converting.logic; import org.junit.jupiter.api.Test; diff --git 
a/crawl/converting-process/src/test/java/nu/marginalia/converting/logic/PlainTextLogicTest.java b/crawl/converting-process/src/test/java/nu/marginalia/converting/logic/PlainTextLogicTest.java new file mode 100644 index 00000000..3e071291 --- /dev/null +++ b/crawl/converting-process/src/test/java/nu/marginalia/converting/logic/PlainTextLogicTest.java @@ -0,0 +1,267 @@ +package nu.marginalia.converting.logic; + +import nu.marginalia.converting.processor.logic.PlainTextLogic; +import nu.marginalia.util.LineUtils; +import nu.marginalia.model.EdgeUrl; +import org.junit.jupiter.api.Test; + +import java.net.URISyntaxException; + +class PlainTextLogicTest { + + PlainTextLogic ptl = new PlainTextLogic(); + + String uml = """ + User Mode Linux HOWTO + User Mode Linux Core Team + Fri Mar 7 11:53:53 EST 2008 + + This document describes the use and abuse of Jeff Dike's User Mode + Linux: a port of the Linux kernel as a normal Intel Linux process. + ______________________________________________________________________ + + Table of Contents + + + + 1. Introduction + 1.1 What is User Mode Linux? + 1.2 How is User Mode Linux Different? + 1.3 How does UML Work? + 1.4 Why Would I Want UML? + + 2. Compiling the kernel and modules + 2.1 Compiling the kernel + 2.2 Compiling and installing kernel modules + 2.3 Compiling and installing uml_utilities + + 3. Running UML and logging in + 3.1 Running UML + 3.2 Logging in + 3.3 Examples + + 4. UML on 2G/2G hosts + 4.1 Introduction + 4.2 The problem + 4.3 The solution + + 5. Setting up serial lines and consoles + 5.1 Specifying the device + 5.2 Specifying the channel + 5.3 Examples + + 6. Setting up the network + 6.1 General setup + 6.2 Userspace daemons + 6.3 Specifying ethernet addresses + 6.4 UML interface setup + 6.5 Multicast + 6.6 TUN/TAP with the uml_net helper + 6.7 TUN/TAP with a preconfigured tap device + 6.8 Ethertap + 6.9 The switch daemon + 6.10 Slip + 6.11 Slirp + 6.12 pcap + 6.13 Setting up the host yourself + + 7. 
Sharing Filesystems between Virtual Machines + 7.1 A warning + 7.2 Using layered block devices + 7.3 Note! + 7.4 Another warning + 7.5 Moving a backing file + 7.6 uml_moo : Merging a COW file with its backing file + 7.7 uml_mkcow : Create a new COW file + + 8. Creating filesystems + 8.1 Create the filesystem file + 8.2 Assign the file to a UML device + 8.3 Creating and mounting the filesystem + + 9. Host file access + 9.1 Using hostfs + 9.2 hostfs command line options + 9.3 hostfs as the root filesystem + 9.4 Building hostfs + + 10. The Management Console + 10.1 version + 10.2 halt and reboot + 10.3 config + 10.4 remove + 10.5 sysrq + 10.6 help + 10.7 cad + 10.8 stop + 10.9 go + 10.10 log + 10.11 proc + 10.12 Making online backups + 10.13 Event notification + + 11. Kernel debugging + 11.1 Starting the kernel under gdb + 11.2 Examining sleeping processes + 11.3 Running ddd on UML + 11.4 Debugging modules + 11.5 Attaching gdb to the kernel + 11.6 Using alternate debuggers + + 12. Kernel debugging examples + 12.1 The case of the hung fsck + 12.2 Episode 2: The case of the hung fsck + + 13. 
What to do when UML doesn't work + 13.1 Strange compilation errors when you build from source + 13.2 UML hangs on boot after mounting devfs + 13.3 A variety of panics and hangs with /tmp on a reiserfs filesystem + 13.4 The compile fails with errors about conflicting types for 'open', 'dup', and 'waitpid' + 13.5 UML doesn't work when /tmp is an NFS filesystem + 13.6 UML hangs on boot when compiled with gprof support + 13.7 syslogd dies with a SIGTERM on startup + 13.8 TUN/TAP networking doesn't work on a 2.4 host + 13.9 You can network to the host but not to other machines on the net + 13.10 I have no root and I want to scream + 13.11 UML build conflict between ptrace.h and ucontext.h + 13.12 The UML BogoMips is exactly half the host's BogoMips + 13.13 When you run UML, it immediately segfaults + 13.14 xterms appear, then immediately disappear + 13.15 cannot set up thread-local storage + 13.16 Process segfaults with a modern (NPTL-using) filesystem + 13.17 Any other panic, hang, or strange behavior + + 14. Diagnosing Problems + 14.1 Case 1 : Normal kernel panics + 14.2 Case 2 : Tracing thread panics + 14.3 Case 3 : Tracing thread panics caused by other threads + 14.4 Case 4 : Hangs + + 15. Thanks + 15.1 Code and Documentation + 15.2 Flushing out bugs + 15.3 Buglets and clean-ups + 15.4 Case Studies + 15.5 Other contributions + + + ______________________________________________________________________ + + 1. Introduction + + Welcome to User Mode Linux. It's going to be fun. + + + 1.1. What is User Mode Linux? + + User Mode Linux lets you run Linux inside itself! With that comes the + power to do all sorts of new things. It virtualises (or simulates, as + """; + + String cmucl = """ + ========================== C M U C L 20 a ============================= + + The CMUCL project is pleased to announce the release of CMUCL 20a. + This is a major release which contains numerous enhancements and + bug fixes from the 19f release. 
+ + CMUCL is a free, high performance implementation of the Common Lisp + programming language which runs on most major Unix platforms. It + mainly conforms to the ANSI Common Lisp standard. CMUCL provides a + sophisticated native code compiler; a powerful foreign function + interface; an implementation of CLOS, the Common Lisp Object System, + which includes multi-methods and a meta-object protocol; a source-level + debugger and code profiler; and an Emacs-like editor implemented in + Common Lisp. CMUCL is maintained by a team of volunteers collaborating + over the Internet, and is mostly in the public domain. + + New in this release: + + * Known issues: + - On Linux and FreeBSD, it may not be possible call SAVE-LISP and + create executables. This seems to be broken on FreeBSD. On + Linux, it seems to depend on what version of Linux is used to + create the executable. Redhat Enterprise Linux appears to be + ok, but Open SuSE 10.x is not. + """; + + String xprint = """ + Archive-name: Xprint/FAQ_OLD + Version: 0.8 + Last-Modified: 2003/08/04 15:20:19 + Maintained-by: Roland Mainz + + NOTE: This version of the FAQ has been discontinued and was replaced by the + DocBook-based version available under xc/doc/hardcopy/XPRINT/Xprint_FAQ.xml + (available through http from + ) + + The following is a list of questions that are frequently asked about + Xprint. + + You can help make it an even better-quality FAQ by writing a short + contribution or update and sending it BY EMAIL ONLY to me. + A contribution should consist of a question and an answer, and increasing + number of people sends me contributions of the form "I don't know the + answer to this, but it must be a FAQ, please answer it for me". Please + read the FAQ first and then feel free to ask me if it is not in the FAQ. + + Thanks! + """; + + String vm = """ + + .. 
_vm: + + ============================================================= + Clawpack Virtual Machine\s + ============================================================= + + Using Clawpack requires a variety of other software packages, as summarized in + :ref:`installing`. An alternative to installing the prerequisites is to use the + virtual machine described in this section. + + Another alternative is to run Clawpack on the Cloud, see :ref:`aws`. + + To do so, you need only download and + """; + + String garfinkel = """ + The Net Effect: The DVD Rebellion\s + By Simson Garfinkel\s + MIT Technology Review + July/August 2001 + + Buy a copy of The Matrix on DVD and take it home. Play it on a Mac or + on a Windows PC and you're in for a pretty good time. But play it on + a PC running the Linux operating system, and the movie industry says + that you're breaking the law. + + Your transgression is that of "circumvention," a criminal act created + by the 1998 Digital Millennium Copyright Act. You see, the video on + DVDs is scrambled. Windows and Macintosh DVD players licensed by the + DVD Copy Control Association contain the algorithms to unscramble the + signal. The Linux DVD player contains these secrets as well. But + since the Linux-based program isn't licensed, using the software + constitutes an illegal circumvention of copyright management. 
+ + """; + @Test + void getDescription() { + System.out.println(ptl.getDescription(LineUtils.firstNLines(uml, 25))); + System.out.println(ptl.getDescription(LineUtils.firstNLines(cmucl, 25))); + System.out.println(ptl.getDescription(LineUtils.firstNLines(xprint, 25))); + System.out.println(ptl.getDescription(LineUtils.firstNLines(vm, 25))); + System.out.println(ptl.getDescription(LineUtils.firstNLines(garfinkel, 25))); + } + + @Test + void getTitle() throws URISyntaxException { + System.out.println(ptl.getTitle(new EdgeUrl("http://user-mode-linux.sourceforge.net/old/UserModeLinux-HOWTO.txt"), LineUtils.firstNLines(uml, 25))); + System.out.println(ptl.getTitle(new EdgeUrl("https://www.cons.org/cmucl/news/release-20a.txt"), LineUtils.firstNLines(cmucl, 25))); + System.out.println(ptl.getTitle(new EdgeUrl("https://www.x.org/docs/XPRINT/Xprint_old_FAQ.txt"), LineUtils.firstNLines(xprint, 25))); + System.out.println(ptl.getTitle(new EdgeUrl("http://depts.washington.edu/clawpack/users-4.6/_sources/vm.txt"), LineUtils.firstNLines(vm, 25))); + System.out.println(ptl.getTitle(new EdgeUrl("http://www.cs.cmu.edu/afs/cs.cmu.edu/user/dst/www/DeCSS/Gallery/archive/garfinkel.txt"), LineUtils.firstNLines(garfinkel, 25))); + + } +} \ No newline at end of file diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/processor/logic/PubDateSnifferTest.java b/crawl/converting-process/src/test/java/nu/marginalia/converting/logic/PubDateSnifferTest.java similarity index 96% rename from marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/processor/logic/PubDateSnifferTest.java rename to crawl/converting-process/src/test/java/nu/marginalia/converting/logic/PubDateSnifferTest.java index dd99d27e..53095944 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/processor/logic/PubDateSnifferTest.java +++ b/crawl/converting-process/src/test/java/nu/marginalia/converting/logic/PubDateSnifferTest.java @@ -1,11 +1,11 @@ -package 
nu.marginalia.wmsa.edge.converting.processor.logic; +package nu.marginalia.converting.logic; -import nu.marginalia.wmsa.configuration.WmsaHome; -import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateParser; -import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateSniffer; -import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.heuristic.PubDateHeuristicDOMParsingPass2; -import nu.marginalia.wmsa.edge.model.EdgeUrl; -import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard; +import nu.marginalia.WmsaHome; +import nu.marginalia.converting.processor.logic.pubdate.PubDateParser; +import nu.marginalia.converting.processor.logic.pubdate.PubDateSniffer; +import nu.marginalia.converting.processor.logic.pubdate.heuristic.PubDateHeuristicDOMParsingPass2; +import nu.marginalia.model.EdgeUrl; +import nu.marginalia.model.crawl.EdgeHtmlStandard; import org.jsoup.Jsoup; import org.junit.jupiter.api.Test; diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/processor/logic/SummaryExtractorTest.java b/crawl/converting-process/src/test/java/nu/marginalia/converting/logic/SummaryExtractorTest.java similarity index 96% rename from marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/processor/logic/SummaryExtractorTest.java rename to crawl/converting-process/src/test/java/nu/marginalia/converting/logic/SummaryExtractorTest.java index 64942b5f..15a2d377 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/processor/logic/SummaryExtractorTest.java +++ b/crawl/converting-process/src/test/java/nu/marginalia/converting/logic/SummaryExtractorTest.java @@ -1,6 +1,8 @@ -package nu.marginalia.wmsa.edge.converting.processor.logic; +package nu.marginalia.converting.logic; -import nu.marginalia.wmsa.configuration.WmsaHome; +import nu.marginalia.WmsaHome; +import nu.marginalia.converting.processor.logic.SummaryExtractionFilter; +import nu.marginalia.converting.processor.logic.SummaryExtractor; 
import org.jsoup.Jsoup; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.BeforeEach; diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/PubDateTest.java b/crawl/converting-process/src/test/java/nu/marginalia/converting/logic/pubdate/PubDateTest.java similarity index 80% rename from marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/PubDateTest.java rename to crawl/converting-process/src/test/java/nu/marginalia/converting/logic/pubdate/PubDateTest.java index d55c3bfc..1ac342cf 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/PubDateTest.java +++ b/crawl/converting-process/src/test/java/nu/marginalia/converting/logic/pubdate/PubDateTest.java @@ -1,5 +1,6 @@ -package nu.marginalia.wmsa.edge.converting.processor.logic.pubdate; +package nu.marginalia.converting.logic.pubdate; +import nu.marginalia.model.crawl.PubDate; import org.junit.jupiter.api.Test; import static org.junit.jupiter.api.Assertions.assertEquals; diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/SentenceExtractorTest.java b/crawl/converting-process/src/test/java/nu/marginalia/converting/processor/keywords/SentenceExtractorTest.java similarity index 80% rename from marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/SentenceExtractorTest.java rename to crawl/converting-process/src/test/java/nu/marginalia/converting/processor/keywords/SentenceExtractorTest.java index 344973e4..3d942c19 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/SentenceExtractorTest.java +++ b/crawl/converting-process/src/test/java/nu/marginalia/converting/processor/keywords/SentenceExtractorTest.java @@ -1,22 +1,19 @@ -package nu.marginalia.wmsa.edge.crawling; +package nu.marginalia.converting.processor.keywords; import lombok.SneakyThrows; -import nu.marginalia.util.TestLanguageModels; -import 
nu.marginalia.util.language.WordPatterns; -import nu.marginalia.util.language.conf.LanguageModels; -import nu.marginalia.util.language.processing.DocumentKeywordExtractor; -import nu.marginalia.util.language.processing.KeywordExtractor; -import nu.marginalia.util.language.processing.sentence.SentenceExtractor; -import nu.marginalia.util.language.processing.model.KeywordMetadata; -import nu.marginalia.util.language.processing.model.WordRep; -import nu.marginalia.util.language.processing.model.WordSpan; -import nu.marginalia.util.language.processing.model.tag.WordSeparator; -import nu.marginalia.wmsa.configuration.WmsaHome; -import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict; -import nu.marginalia.wmsa.edge.index.model.EdgePageWordFlags; -import nu.marginalia.wmsa.edge.index.model.EdgePageWordMetadata; -import nu.marginalia.wmsa.edge.integration.wikipedia.WikipediaReader; -import nu.marginalia.wmsa.edge.model.EdgeDomain; +import nu.marginalia.LanguageModels; +import nu.marginalia.language.WordPatterns; +import nu.marginalia.language.model.KeywordMetadata; +import nu.marginalia.language.model.WordRep; +import nu.marginalia.language.model.WordSpan; +import nu.marginalia.language.sentence.SentenceExtractor; +import nu.marginalia.language.statistics.TermFrequencyDict; +import nu.marginalia.language.keywords.KeywordExtractor; +import nu.marginalia.language.model.WordSeparator; +import nu.marginalia.WmsaHome; +import nu.marginalia.model.crawl.EdgePageWordFlags; +import nu.marginalia.model.idx.EdgePageWordMetadata; +import nu.marginalia.test.util.TestLanguageModels; import org.apache.commons.lang3.tuple.Pair; import org.jsoup.Jsoup; import org.junit.jupiter.api.BeforeEach; @@ -56,6 +53,7 @@ class SentenceExtractorTest { var dict = new TermFrequencyDict(lm); DocumentKeywordExtractor documentKeywordExtractor = new DocumentKeywordExtractor(dict); + for (;;) { long total = 0; for (var file : Objects.requireNonNull(data.toFile().listFiles())) { @@ -111,30 +109,11 @@ 
class SentenceExtractorTest { } - @Test - public void testWikipedia() throws InterruptedException { - - System.out.println("Running"); - - var dict = new TermFrequencyDict(lm); - - DocumentKeywordExtractor documentKeywordExtractor = new DocumentKeywordExtractor(dict); - - var reader = new WikipediaReader("/home/vlofgren/Work/wikipedia_en_100_nopic_2021-06.zim", new EdgeDomain("encyclopedia.marginalia.nu"), - post -> { - - var newResult = newSe.extractSentences(Jsoup.parse(post.body)); - - var newRes = documentKeywordExtractor.extractKeywords(newResult, new KeywordMetadata()); - System.out.println(newRes); - }); - reader.join(); - } - @Test public void testPattern() { System.out.println(WordPatterns.singleWordAdditionalPattern.matcher("2.6.18164.el5pae").matches()); } + @Test void extractSentences() throws IOException { var data = WmsaHome.getHomePath().resolve("test-data/"); diff --git a/crawl/converting-process/src/test/java/nu/marginalia/test/util/TestLanguageModels.java b/crawl/converting-process/src/test/java/nu/marginalia/test/util/TestLanguageModels.java new file mode 100644 index 00000000..958604ca --- /dev/null +++ b/crawl/converting-process/src/test/java/nu/marginalia/test/util/TestLanguageModels.java @@ -0,0 +1,37 @@ +package nu.marginalia.test.util; + +import nu.marginalia.LanguageModels; +import nu.marginalia.WmsaHome; + +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.Optional; + +public class TestLanguageModels { + private static final Path LANGUAGE_MODELS_DEFAULT = WmsaHome.getHomePath().resolve("model"); + + public static Path getLanguageModelsPath() { + final Path languageModelsHome = Optional.ofNullable(System.getenv("LANGUAGE_MODELS_HOME")) + .map(Path::of) + .orElse(LANGUAGE_MODELS_DEFAULT); + + if (!Files.isDirectory(languageModelsHome)) { + throw new IllegalStateException("Could not find $LANGUAGE_MODELS_HOME, see doc/language-models.md"); + } + return languageModelsHome; + } + + public static LanguageModels 
getLanguageModels() { + + var languageModelsHome = getLanguageModelsPath(); + + return new LanguageModels( + languageModelsHome.resolve("ngrams.bin"), + languageModelsHome.resolve("tfreq-new-algo3.bin"), + languageModelsHome.resolve("opennlp-sentence.bin"), + languageModelsHome.resolve("English.RDR"), + languageModelsHome.resolve("English.DICT"), + languageModelsHome.resolve("opennlp-tokens.bin") + ); + } +} diff --git a/marginalia_nu/src/test/resources/html/monadnock.html b/crawl/converting-process/src/test/resources/html/monadnock.html similarity index 100% rename from marginalia_nu/src/test/resources/html/monadnock.html rename to crawl/converting-process/src/test/resources/html/monadnock.html diff --git a/marginalia_nu/src/test/resources/html/readme.md b/crawl/converting-process/src/test/resources/html/readme.md similarity index 100% rename from marginalia_nu/src/test/resources/html/readme.md rename to crawl/converting-process/src/test/resources/html/readme.md diff --git a/marginalia_nu/src/test/resources/html/summarization/187.shtml b/crawl/converting-process/src/test/resources/html/summarization/187.shtml similarity index 100% rename from marginalia_nu/src/test/resources/html/summarization/187.shtml rename to crawl/converting-process/src/test/resources/html/summarization/187.shtml diff --git a/marginalia_nu/src/test/resources/html/summarization/surrey.html b/crawl/converting-process/src/test/resources/html/summarization/surrey.html similarity index 100% rename from marginalia_nu/src/test/resources/html/summarization/surrey.html rename to crawl/converting-process/src/test/resources/html/summarization/surrey.html diff --git a/marginalia_nu/src/test/resources/html/summarization/surrey.html.1 b/crawl/converting-process/src/test/resources/html/summarization/surrey.html.1 similarity index 100% rename from marginalia_nu/src/test/resources/html/summarization/surrey.html.1 rename to crawl/converting-process/src/test/resources/html/summarization/surrey.html.1 diff --git 
a/marginalia_nu/src/test/resources/html/work-set/index b/crawl/converting-process/src/test/resources/html/work-set/index similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/index rename to crawl/converting-process/src/test/resources/html/work-set/index diff --git a/marginalia_nu/src/test/resources/html/work-set/url--1021546012 b/crawl/converting-process/src/test/resources/html/work-set/url--1021546012 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url--1021546012 rename to crawl/converting-process/src/test/resources/html/work-set/url--1021546012 diff --git a/marginalia_nu/src/test/resources/html/work-set/url--1028592943 b/crawl/converting-process/src/test/resources/html/work-set/url--1028592943 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url--1028592943 rename to crawl/converting-process/src/test/resources/html/work-set/url--1028592943 diff --git a/marginalia_nu/src/test/resources/html/work-set/url--1081293162 b/crawl/converting-process/src/test/resources/html/work-set/url--1081293162 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url--1081293162 rename to crawl/converting-process/src/test/resources/html/work-set/url--1081293162 diff --git a/marginalia_nu/src/test/resources/html/work-set/url--1105046394 b/crawl/converting-process/src/test/resources/html/work-set/url--1105046394 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url--1105046394 rename to crawl/converting-process/src/test/resources/html/work-set/url--1105046394 diff --git a/marginalia_nu/src/test/resources/html/work-set/url--1146923296 b/crawl/converting-process/src/test/resources/html/work-set/url--1146923296 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url--1146923296 rename to crawl/converting-process/src/test/resources/html/work-set/url--1146923296 diff --git 
a/marginalia_nu/src/test/resources/html/work-set/url--1194694074 b/crawl/converting-process/src/test/resources/html/work-set/url--1194694074 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url--1194694074 rename to crawl/converting-process/src/test/resources/html/work-set/url--1194694074 diff --git a/marginalia_nu/src/test/resources/html/work-set/url--1207898281 b/crawl/converting-process/src/test/resources/html/work-set/url--1207898281 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url--1207898281 rename to crawl/converting-process/src/test/resources/html/work-set/url--1207898281 diff --git a/marginalia_nu/src/test/resources/html/work-set/url--1268145073 b/crawl/converting-process/src/test/resources/html/work-set/url--1268145073 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url--1268145073 rename to crawl/converting-process/src/test/resources/html/work-set/url--1268145073 diff --git a/marginalia_nu/src/test/resources/html/work-set/url--1294876331 b/crawl/converting-process/src/test/resources/html/work-set/url--1294876331 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url--1294876331 rename to crawl/converting-process/src/test/resources/html/work-set/url--1294876331 diff --git a/marginalia_nu/src/test/resources/html/work-set/url--1314767420 b/crawl/converting-process/src/test/resources/html/work-set/url--1314767420 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url--1314767420 rename to crawl/converting-process/src/test/resources/html/work-set/url--1314767420 diff --git a/marginalia_nu/src/test/resources/html/work-set/url--1316269786 b/crawl/converting-process/src/test/resources/html/work-set/url--1316269786 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url--1316269786 rename to crawl/converting-process/src/test/resources/html/work-set/url--1316269786 diff --git 
a/marginalia_nu/src/test/resources/html/work-set/url--1316766580 b/crawl/converting-process/src/test/resources/html/work-set/url--1316766580 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url--1316766580 rename to crawl/converting-process/src/test/resources/html/work-set/url--1316766580 diff --git a/marginalia_nu/src/test/resources/html/work-set/url--1319968043 b/crawl/converting-process/src/test/resources/html/work-set/url--1319968043 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url--1319968043 rename to crawl/converting-process/src/test/resources/html/work-set/url--1319968043 diff --git a/marginalia_nu/src/test/resources/html/work-set/url--1338576987 b/crawl/converting-process/src/test/resources/html/work-set/url--1338576987 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url--1338576987 rename to crawl/converting-process/src/test/resources/html/work-set/url--1338576987 diff --git a/marginalia_nu/src/test/resources/html/work-set/url--1341909571 b/crawl/converting-process/src/test/resources/html/work-set/url--1341909571 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url--1341909571 rename to crawl/converting-process/src/test/resources/html/work-set/url--1341909571 diff --git a/marginalia_nu/src/test/resources/html/work-set/url--1369578579 b/crawl/converting-process/src/test/resources/html/work-set/url--1369578579 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url--1369578579 rename to crawl/converting-process/src/test/resources/html/work-set/url--1369578579 diff --git a/marginalia_nu/src/test/resources/html/work-set/url--1437315645 b/crawl/converting-process/src/test/resources/html/work-set/url--1437315645 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url--1437315645 rename to crawl/converting-process/src/test/resources/html/work-set/url--1437315645 diff --git 
a/marginalia_nu/src/test/resources/html/work-set/url--1458954960 b/crawl/converting-process/src/test/resources/html/work-set/url--1458954960 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url--1458954960 rename to crawl/converting-process/src/test/resources/html/work-set/url--1458954960 diff --git a/marginalia_nu/src/test/resources/html/work-set/url--1475681345 b/crawl/converting-process/src/test/resources/html/work-set/url--1475681345 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url--1475681345 rename to crawl/converting-process/src/test/resources/html/work-set/url--1475681345 diff --git a/marginalia_nu/src/test/resources/html/work-set/url--1498328446 b/crawl/converting-process/src/test/resources/html/work-set/url--1498328446 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url--1498328446 rename to crawl/converting-process/src/test/resources/html/work-set/url--1498328446 diff --git a/marginalia_nu/src/test/resources/html/work-set/url--1507779664 b/crawl/converting-process/src/test/resources/html/work-set/url--1507779664 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url--1507779664 rename to crawl/converting-process/src/test/resources/html/work-set/url--1507779664 diff --git a/marginalia_nu/src/test/resources/html/work-set/url--1540303379 b/crawl/converting-process/src/test/resources/html/work-set/url--1540303379 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url--1540303379 rename to crawl/converting-process/src/test/resources/html/work-set/url--1540303379 diff --git a/marginalia_nu/src/test/resources/html/work-set/url--154898476 b/crawl/converting-process/src/test/resources/html/work-set/url--154898476 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url--154898476 rename to crawl/converting-process/src/test/resources/html/work-set/url--154898476 diff --git 
a/marginalia_nu/src/test/resources/html/work-set/url--1552059399 b/crawl/converting-process/src/test/resources/html/work-set/url--1552059399 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url--1552059399 rename to crawl/converting-process/src/test/resources/html/work-set/url--1552059399 diff --git a/marginalia_nu/src/test/resources/html/work-set/url--1557688340 b/crawl/converting-process/src/test/resources/html/work-set/url--1557688340 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url--1557688340 rename to crawl/converting-process/src/test/resources/html/work-set/url--1557688340 diff --git a/marginalia_nu/src/test/resources/html/work-set/url--1584145751 b/crawl/converting-process/src/test/resources/html/work-set/url--1584145751 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url--1584145751 rename to crawl/converting-process/src/test/resources/html/work-set/url--1584145751 diff --git a/marginalia_nu/src/test/resources/html/work-set/url--1605151204 b/crawl/converting-process/src/test/resources/html/work-set/url--1605151204 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url--1605151204 rename to crawl/converting-process/src/test/resources/html/work-set/url--1605151204 diff --git a/marginalia_nu/src/test/resources/html/work-set/url--162269247 b/crawl/converting-process/src/test/resources/html/work-set/url--162269247 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url--162269247 rename to crawl/converting-process/src/test/resources/html/work-set/url--162269247 diff --git a/marginalia_nu/src/test/resources/html/work-set/url--1624294488 b/crawl/converting-process/src/test/resources/html/work-set/url--1624294488 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url--1624294488 rename to crawl/converting-process/src/test/resources/html/work-set/url--1624294488 diff --git 
a/marginalia_nu/src/test/resources/html/work-set/url--164108285 b/crawl/converting-process/src/test/resources/html/work-set/url--164108285 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url--164108285 rename to crawl/converting-process/src/test/resources/html/work-set/url--164108285 diff --git a/marginalia_nu/src/test/resources/html/work-set/url--1645688243 b/crawl/converting-process/src/test/resources/html/work-set/url--1645688243 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url--1645688243 rename to crawl/converting-process/src/test/resources/html/work-set/url--1645688243 diff --git a/marginalia_nu/src/test/resources/html/work-set/url--1658004609 b/crawl/converting-process/src/test/resources/html/work-set/url--1658004609 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url--1658004609 rename to crawl/converting-process/src/test/resources/html/work-set/url--1658004609 diff --git a/marginalia_nu/src/test/resources/html/work-set/url--1658558834 b/crawl/converting-process/src/test/resources/html/work-set/url--1658558834 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url--1658558834 rename to crawl/converting-process/src/test/resources/html/work-set/url--1658558834 diff --git a/marginalia_nu/src/test/resources/html/work-set/url--1698664879 b/crawl/converting-process/src/test/resources/html/work-set/url--1698664879 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url--1698664879 rename to crawl/converting-process/src/test/resources/html/work-set/url--1698664879 diff --git a/marginalia_nu/src/test/resources/html/work-set/url--169975195 b/crawl/converting-process/src/test/resources/html/work-set/url--169975195 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url--169975195 rename to crawl/converting-process/src/test/resources/html/work-set/url--169975195 diff --git 
a/marginalia_nu/src/test/resources/html/work-set/url--1701203332 b/crawl/converting-process/src/test/resources/html/work-set/url--1701203332 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url--1701203332 rename to crawl/converting-process/src/test/resources/html/work-set/url--1701203332 diff --git a/marginalia_nu/src/test/resources/html/work-set/url--17281998 b/crawl/converting-process/src/test/resources/html/work-set/url--17281998 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url--17281998 rename to crawl/converting-process/src/test/resources/html/work-set/url--17281998 diff --git a/marginalia_nu/src/test/resources/html/work-set/url--1742070028 b/crawl/converting-process/src/test/resources/html/work-set/url--1742070028 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url--1742070028 rename to crawl/converting-process/src/test/resources/html/work-set/url--1742070028 diff --git a/marginalia_nu/src/test/resources/html/work-set/url--1745376814 b/crawl/converting-process/src/test/resources/html/work-set/url--1745376814 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url--1745376814 rename to crawl/converting-process/src/test/resources/html/work-set/url--1745376814 diff --git a/marginalia_nu/src/test/resources/html/work-set/url--1749889035 b/crawl/converting-process/src/test/resources/html/work-set/url--1749889035 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url--1749889035 rename to crawl/converting-process/src/test/resources/html/work-set/url--1749889035 diff --git a/marginalia_nu/src/test/resources/html/work-set/url--176177364 b/crawl/converting-process/src/test/resources/html/work-set/url--176177364 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url--176177364 rename to crawl/converting-process/src/test/resources/html/work-set/url--176177364 diff --git 
a/marginalia_nu/src/test/resources/html/work-set/url--177014197 b/crawl/converting-process/src/test/resources/html/work-set/url--177014197 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url--177014197 rename to crawl/converting-process/src/test/resources/html/work-set/url--177014197 diff --git a/marginalia_nu/src/test/resources/html/work-set/url--1794527707 b/crawl/converting-process/src/test/resources/html/work-set/url--1794527707 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url--1794527707 rename to crawl/converting-process/src/test/resources/html/work-set/url--1794527707 diff --git a/marginalia_nu/src/test/resources/html/work-set/url--1797740201 b/crawl/converting-process/src/test/resources/html/work-set/url--1797740201 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url--1797740201 rename to crawl/converting-process/src/test/resources/html/work-set/url--1797740201 diff --git a/marginalia_nu/src/test/resources/html/work-set/url--1799098579 b/crawl/converting-process/src/test/resources/html/work-set/url--1799098579 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url--1799098579 rename to crawl/converting-process/src/test/resources/html/work-set/url--1799098579 diff --git a/marginalia_nu/src/test/resources/html/work-set/url--1959637826 b/crawl/converting-process/src/test/resources/html/work-set/url--1959637826 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url--1959637826 rename to crawl/converting-process/src/test/resources/html/work-set/url--1959637826 diff --git a/marginalia_nu/src/test/resources/html/work-set/url--1971916964 b/crawl/converting-process/src/test/resources/html/work-set/url--1971916964 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url--1971916964 rename to crawl/converting-process/src/test/resources/html/work-set/url--1971916964 diff --git 
a/marginalia_nu/src/test/resources/html/work-set/url--1985840368 b/crawl/converting-process/src/test/resources/html/work-set/url--1985840368 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url--1985840368 rename to crawl/converting-process/src/test/resources/html/work-set/url--1985840368 diff --git a/marginalia_nu/src/test/resources/html/work-set/url--2012610859 b/crawl/converting-process/src/test/resources/html/work-set/url--2012610859 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url--2012610859 rename to crawl/converting-process/src/test/resources/html/work-set/url--2012610859 diff --git a/marginalia_nu/src/test/resources/html/work-set/url--202178680 b/crawl/converting-process/src/test/resources/html/work-set/url--202178680 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url--202178680 rename to crawl/converting-process/src/test/resources/html/work-set/url--202178680 diff --git a/marginalia_nu/src/test/resources/html/work-set/url--2043528727 b/crawl/converting-process/src/test/resources/html/work-set/url--2043528727 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url--2043528727 rename to crawl/converting-process/src/test/resources/html/work-set/url--2043528727 diff --git a/marginalia_nu/src/test/resources/html/work-set/url--2081757477 b/crawl/converting-process/src/test/resources/html/work-set/url--2081757477 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url--2081757477 rename to crawl/converting-process/src/test/resources/html/work-set/url--2081757477 diff --git a/marginalia_nu/src/test/resources/html/work-set/url--2103982576 b/crawl/converting-process/src/test/resources/html/work-set/url--2103982576 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url--2103982576 rename to crawl/converting-process/src/test/resources/html/work-set/url--2103982576 diff --git 
a/marginalia_nu/src/test/resources/html/work-set/url--2111558769 b/crawl/converting-process/src/test/resources/html/work-set/url--2111558769 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url--2111558769 rename to crawl/converting-process/src/test/resources/html/work-set/url--2111558769 diff --git a/marginalia_nu/src/test/resources/html/work-set/url--213168798 b/crawl/converting-process/src/test/resources/html/work-set/url--213168798 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url--213168798 rename to crawl/converting-process/src/test/resources/html/work-set/url--213168798 diff --git a/marginalia_nu/src/test/resources/html/work-set/url--232544032 b/crawl/converting-process/src/test/resources/html/work-set/url--232544032 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url--232544032 rename to crawl/converting-process/src/test/resources/html/work-set/url--232544032 diff --git a/marginalia_nu/src/test/resources/html/work-set/url--253010011 b/crawl/converting-process/src/test/resources/html/work-set/url--253010011 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url--253010011 rename to crawl/converting-process/src/test/resources/html/work-set/url--253010011 diff --git a/marginalia_nu/src/test/resources/html/work-set/url--274250994 b/crawl/converting-process/src/test/resources/html/work-set/url--274250994 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url--274250994 rename to crawl/converting-process/src/test/resources/html/work-set/url--274250994 diff --git a/marginalia_nu/src/test/resources/html/work-set/url--332442790 b/crawl/converting-process/src/test/resources/html/work-set/url--332442790 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url--332442790 rename to crawl/converting-process/src/test/resources/html/work-set/url--332442790 diff --git 
a/marginalia_nu/src/test/resources/html/work-set/url--353437903 b/crawl/converting-process/src/test/resources/html/work-set/url--353437903 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url--353437903 rename to crawl/converting-process/src/test/resources/html/work-set/url--353437903 diff --git a/marginalia_nu/src/test/resources/html/work-set/url--364546777 b/crawl/converting-process/src/test/resources/html/work-set/url--364546777 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url--364546777 rename to crawl/converting-process/src/test/resources/html/work-set/url--364546777 diff --git a/marginalia_nu/src/test/resources/html/work-set/url--379129416 b/crawl/converting-process/src/test/resources/html/work-set/url--379129416 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url--379129416 rename to crawl/converting-process/src/test/resources/html/work-set/url--379129416 diff --git a/marginalia_nu/src/test/resources/html/work-set/url--399428149 b/crawl/converting-process/src/test/resources/html/work-set/url--399428149 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url--399428149 rename to crawl/converting-process/src/test/resources/html/work-set/url--399428149 diff --git a/marginalia_nu/src/test/resources/html/work-set/url--425233170 b/crawl/converting-process/src/test/resources/html/work-set/url--425233170 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url--425233170 rename to crawl/converting-process/src/test/resources/html/work-set/url--425233170 diff --git a/marginalia_nu/src/test/resources/html/work-set/url--434612307 b/crawl/converting-process/src/test/resources/html/work-set/url--434612307 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url--434612307 rename to crawl/converting-process/src/test/resources/html/work-set/url--434612307 diff --git 
a/marginalia_nu/src/test/resources/html/work-set/url--439772328 b/crawl/converting-process/src/test/resources/html/work-set/url--439772328 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url--439772328 rename to crawl/converting-process/src/test/resources/html/work-set/url--439772328 diff --git a/marginalia_nu/src/test/resources/html/work-set/url--458002611 b/crawl/converting-process/src/test/resources/html/work-set/url--458002611 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url--458002611 rename to crawl/converting-process/src/test/resources/html/work-set/url--458002611 diff --git a/marginalia_nu/src/test/resources/html/work-set/url--506010305 b/crawl/converting-process/src/test/resources/html/work-set/url--506010305 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url--506010305 rename to crawl/converting-process/src/test/resources/html/work-set/url--506010305 diff --git a/marginalia_nu/src/test/resources/html/work-set/url--546773534 b/crawl/converting-process/src/test/resources/html/work-set/url--546773534 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url--546773534 rename to crawl/converting-process/src/test/resources/html/work-set/url--546773534 diff --git a/marginalia_nu/src/test/resources/html/work-set/url--551288516 b/crawl/converting-process/src/test/resources/html/work-set/url--551288516 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url--551288516 rename to crawl/converting-process/src/test/resources/html/work-set/url--551288516 diff --git a/marginalia_nu/src/test/resources/html/work-set/url--602577763 b/crawl/converting-process/src/test/resources/html/work-set/url--602577763 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url--602577763 rename to crawl/converting-process/src/test/resources/html/work-set/url--602577763 diff --git 
a/marginalia_nu/src/test/resources/html/work-set/url--611668054 b/crawl/converting-process/src/test/resources/html/work-set/url--611668054 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url--611668054 rename to crawl/converting-process/src/test/resources/html/work-set/url--611668054 diff --git a/marginalia_nu/src/test/resources/html/work-set/url--634771245 b/crawl/converting-process/src/test/resources/html/work-set/url--634771245 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url--634771245 rename to crawl/converting-process/src/test/resources/html/work-set/url--634771245 diff --git a/marginalia_nu/src/test/resources/html/work-set/url--639320493 b/crawl/converting-process/src/test/resources/html/work-set/url--639320493 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url--639320493 rename to crawl/converting-process/src/test/resources/html/work-set/url--639320493 diff --git a/marginalia_nu/src/test/resources/html/work-set/url--643179018 b/crawl/converting-process/src/test/resources/html/work-set/url--643179018 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url--643179018 rename to crawl/converting-process/src/test/resources/html/work-set/url--643179018 diff --git a/marginalia_nu/src/test/resources/html/work-set/url--663772351 b/crawl/converting-process/src/test/resources/html/work-set/url--663772351 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url--663772351 rename to crawl/converting-process/src/test/resources/html/work-set/url--663772351 diff --git a/marginalia_nu/src/test/resources/html/work-set/url--670789152 b/crawl/converting-process/src/test/resources/html/work-set/url--670789152 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url--670789152 rename to crawl/converting-process/src/test/resources/html/work-set/url--670789152 diff --git 
a/marginalia_nu/src/test/resources/html/work-set/url--6797317 b/crawl/converting-process/src/test/resources/html/work-set/url--6797317 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url--6797317 rename to crawl/converting-process/src/test/resources/html/work-set/url--6797317 diff --git a/marginalia_nu/src/test/resources/html/work-set/url--700978490 b/crawl/converting-process/src/test/resources/html/work-set/url--700978490 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url--700978490 rename to crawl/converting-process/src/test/resources/html/work-set/url--700978490 diff --git a/marginalia_nu/src/test/resources/html/work-set/url--708035332 b/crawl/converting-process/src/test/resources/html/work-set/url--708035332 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url--708035332 rename to crawl/converting-process/src/test/resources/html/work-set/url--708035332 diff --git a/marginalia_nu/src/test/resources/html/work-set/url--804917062 b/crawl/converting-process/src/test/resources/html/work-set/url--804917062 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url--804917062 rename to crawl/converting-process/src/test/resources/html/work-set/url--804917062 diff --git a/marginalia_nu/src/test/resources/html/work-set/url--819771302 b/crawl/converting-process/src/test/resources/html/work-set/url--819771302 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url--819771302 rename to crawl/converting-process/src/test/resources/html/work-set/url--819771302 diff --git a/marginalia_nu/src/test/resources/html/work-set/url--840796372 b/crawl/converting-process/src/test/resources/html/work-set/url--840796372 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url--840796372 rename to crawl/converting-process/src/test/resources/html/work-set/url--840796372 diff --git 
a/marginalia_nu/src/test/resources/html/work-set/url--841445362 b/crawl/converting-process/src/test/resources/html/work-set/url--841445362 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url--841445362 rename to crawl/converting-process/src/test/resources/html/work-set/url--841445362 diff --git a/marginalia_nu/src/test/resources/html/work-set/url--862385354 b/crawl/converting-process/src/test/resources/html/work-set/url--862385354 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url--862385354 rename to crawl/converting-process/src/test/resources/html/work-set/url--862385354 diff --git a/marginalia_nu/src/test/resources/html/work-set/url--879796466 b/crawl/converting-process/src/test/resources/html/work-set/url--879796466 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url--879796466 rename to crawl/converting-process/src/test/resources/html/work-set/url--879796466 diff --git a/marginalia_nu/src/test/resources/html/work-set/url--89134993 b/crawl/converting-process/src/test/resources/html/work-set/url--89134993 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url--89134993 rename to crawl/converting-process/src/test/resources/html/work-set/url--89134993 diff --git a/marginalia_nu/src/test/resources/html/work-set/url--905197876 b/crawl/converting-process/src/test/resources/html/work-set/url--905197876 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url--905197876 rename to crawl/converting-process/src/test/resources/html/work-set/url--905197876 diff --git a/marginalia_nu/src/test/resources/html/work-set/url--920328354 b/crawl/converting-process/src/test/resources/html/work-set/url--920328354 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url--920328354 rename to crawl/converting-process/src/test/resources/html/work-set/url--920328354 diff --git 
a/marginalia_nu/src/test/resources/html/work-set/url--952827759 b/crawl/converting-process/src/test/resources/html/work-set/url--952827759 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url--952827759 rename to crawl/converting-process/src/test/resources/html/work-set/url--952827759 diff --git a/marginalia_nu/src/test/resources/html/work-set/url--964018507 b/crawl/converting-process/src/test/resources/html/work-set/url--964018507 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url--964018507 rename to crawl/converting-process/src/test/resources/html/work-set/url--964018507 diff --git a/marginalia_nu/src/test/resources/html/work-set/url--972614909 b/crawl/converting-process/src/test/resources/html/work-set/url--972614909 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url--972614909 rename to crawl/converting-process/src/test/resources/html/work-set/url--972614909 diff --git a/marginalia_nu/src/test/resources/html/work-set/url-10088520 b/crawl/converting-process/src/test/resources/html/work-set/url-10088520 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url-10088520 rename to crawl/converting-process/src/test/resources/html/work-set/url-10088520 diff --git a/marginalia_nu/src/test/resources/html/work-set/url-1013281103 b/crawl/converting-process/src/test/resources/html/work-set/url-1013281103 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url-1013281103 rename to crawl/converting-process/src/test/resources/html/work-set/url-1013281103 diff --git a/marginalia_nu/src/test/resources/html/work-set/url-1019241851 b/crawl/converting-process/src/test/resources/html/work-set/url-1019241851 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url-1019241851 rename to crawl/converting-process/src/test/resources/html/work-set/url-1019241851 diff --git 
a/marginalia_nu/src/test/resources/html/work-set/url-1059944953 b/crawl/converting-process/src/test/resources/html/work-set/url-1059944953 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url-1059944953 rename to crawl/converting-process/src/test/resources/html/work-set/url-1059944953 diff --git a/marginalia_nu/src/test/resources/html/work-set/url-1118681302 b/crawl/converting-process/src/test/resources/html/work-set/url-1118681302 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url-1118681302 rename to crawl/converting-process/src/test/resources/html/work-set/url-1118681302 diff --git a/marginalia_nu/src/test/resources/html/work-set/url-1179298706 b/crawl/converting-process/src/test/resources/html/work-set/url-1179298706 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url-1179298706 rename to crawl/converting-process/src/test/resources/html/work-set/url-1179298706 diff --git a/marginalia_nu/src/test/resources/html/work-set/url-1191749784 b/crawl/converting-process/src/test/resources/html/work-set/url-1191749784 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url-1191749784 rename to crawl/converting-process/src/test/resources/html/work-set/url-1191749784 diff --git a/marginalia_nu/src/test/resources/html/work-set/url-1207094790 b/crawl/converting-process/src/test/resources/html/work-set/url-1207094790 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url-1207094790 rename to crawl/converting-process/src/test/resources/html/work-set/url-1207094790 diff --git a/marginalia_nu/src/test/resources/html/work-set/url-1213989666 b/crawl/converting-process/src/test/resources/html/work-set/url-1213989666 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url-1213989666 rename to crawl/converting-process/src/test/resources/html/work-set/url-1213989666 diff --git 
a/marginalia_nu/src/test/resources/html/work-set/url-1222442301 b/crawl/converting-process/src/test/resources/html/work-set/url-1222442301 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url-1222442301 rename to crawl/converting-process/src/test/resources/html/work-set/url-1222442301 diff --git a/marginalia_nu/src/test/resources/html/work-set/url-130332455 b/crawl/converting-process/src/test/resources/html/work-set/url-130332455 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url-130332455 rename to crawl/converting-process/src/test/resources/html/work-set/url-130332455 diff --git a/marginalia_nu/src/test/resources/html/work-set/url-1311055461 b/crawl/converting-process/src/test/resources/html/work-set/url-1311055461 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url-1311055461 rename to crawl/converting-process/src/test/resources/html/work-set/url-1311055461 diff --git a/marginalia_nu/src/test/resources/html/work-set/url-1391842722 b/crawl/converting-process/src/test/resources/html/work-set/url-1391842722 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url-1391842722 rename to crawl/converting-process/src/test/resources/html/work-set/url-1391842722 diff --git a/marginalia_nu/src/test/resources/html/work-set/url-1457388763 b/crawl/converting-process/src/test/resources/html/work-set/url-1457388763 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url-1457388763 rename to crawl/converting-process/src/test/resources/html/work-set/url-1457388763 diff --git a/marginalia_nu/src/test/resources/html/work-set/url-1506356272 b/crawl/converting-process/src/test/resources/html/work-set/url-1506356272 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url-1506356272 rename to crawl/converting-process/src/test/resources/html/work-set/url-1506356272 diff --git 
a/marginalia_nu/src/test/resources/html/work-set/url-1511762169 b/crawl/converting-process/src/test/resources/html/work-set/url-1511762169 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url-1511762169 rename to crawl/converting-process/src/test/resources/html/work-set/url-1511762169 diff --git a/marginalia_nu/src/test/resources/html/work-set/url-1534640058 b/crawl/converting-process/src/test/resources/html/work-set/url-1534640058 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url-1534640058 rename to crawl/converting-process/src/test/resources/html/work-set/url-1534640058 diff --git a/marginalia_nu/src/test/resources/html/work-set/url-1551513871 b/crawl/converting-process/src/test/resources/html/work-set/url-1551513871 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url-1551513871 rename to crawl/converting-process/src/test/resources/html/work-set/url-1551513871 diff --git a/marginalia_nu/src/test/resources/html/work-set/url-1567632447 b/crawl/converting-process/src/test/resources/html/work-set/url-1567632447 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url-1567632447 rename to crawl/converting-process/src/test/resources/html/work-set/url-1567632447 diff --git a/marginalia_nu/src/test/resources/html/work-set/url-1623049502 b/crawl/converting-process/src/test/resources/html/work-set/url-1623049502 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url-1623049502 rename to crawl/converting-process/src/test/resources/html/work-set/url-1623049502 diff --git a/marginalia_nu/src/test/resources/html/work-set/url-163919330 b/crawl/converting-process/src/test/resources/html/work-set/url-163919330 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url-163919330 rename to crawl/converting-process/src/test/resources/html/work-set/url-163919330 diff --git 
a/marginalia_nu/src/test/resources/html/work-set/url-1661398327 b/crawl/converting-process/src/test/resources/html/work-set/url-1661398327 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url-1661398327 rename to crawl/converting-process/src/test/resources/html/work-set/url-1661398327 diff --git a/marginalia_nu/src/test/resources/html/work-set/url-1724309925 b/crawl/converting-process/src/test/resources/html/work-set/url-1724309925 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url-1724309925 rename to crawl/converting-process/src/test/resources/html/work-set/url-1724309925 diff --git a/marginalia_nu/src/test/resources/html/work-set/url-1736807128 b/crawl/converting-process/src/test/resources/html/work-set/url-1736807128 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url-1736807128 rename to crawl/converting-process/src/test/resources/html/work-set/url-1736807128 diff --git a/marginalia_nu/src/test/resources/html/work-set/url-1739031345 b/crawl/converting-process/src/test/resources/html/work-set/url-1739031345 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url-1739031345 rename to crawl/converting-process/src/test/resources/html/work-set/url-1739031345 diff --git a/marginalia_nu/src/test/resources/html/work-set/url-1755745765 b/crawl/converting-process/src/test/resources/html/work-set/url-1755745765 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url-1755745765 rename to crawl/converting-process/src/test/resources/html/work-set/url-1755745765 diff --git a/marginalia_nu/src/test/resources/html/work-set/url-1802811100 b/crawl/converting-process/src/test/resources/html/work-set/url-1802811100 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url-1802811100 rename to crawl/converting-process/src/test/resources/html/work-set/url-1802811100 diff --git 
a/marginalia_nu/src/test/resources/html/work-set/url-1805364707 b/crawl/converting-process/src/test/resources/html/work-set/url-1805364707 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url-1805364707 rename to crawl/converting-process/src/test/resources/html/work-set/url-1805364707 diff --git a/marginalia_nu/src/test/resources/html/work-set/url-1832702370 b/crawl/converting-process/src/test/resources/html/work-set/url-1832702370 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url-1832702370 rename to crawl/converting-process/src/test/resources/html/work-set/url-1832702370 diff --git a/marginalia_nu/src/test/resources/html/work-set/url-1853114311 b/crawl/converting-process/src/test/resources/html/work-set/url-1853114311 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url-1853114311 rename to crawl/converting-process/src/test/resources/html/work-set/url-1853114311 diff --git a/marginalia_nu/src/test/resources/html/work-set/url-1924872844 b/crawl/converting-process/src/test/resources/html/work-set/url-1924872844 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url-1924872844 rename to crawl/converting-process/src/test/resources/html/work-set/url-1924872844 diff --git a/marginalia_nu/src/test/resources/html/work-set/url-197772804 b/crawl/converting-process/src/test/resources/html/work-set/url-197772804 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url-197772804 rename to crawl/converting-process/src/test/resources/html/work-set/url-197772804 diff --git a/marginalia_nu/src/test/resources/html/work-set/url-1984259912 b/crawl/converting-process/src/test/resources/html/work-set/url-1984259912 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url-1984259912 rename to crawl/converting-process/src/test/resources/html/work-set/url-1984259912 diff --git 
a/marginalia_nu/src/test/resources/html/work-set/url-1990903988 b/crawl/converting-process/src/test/resources/html/work-set/url-1990903988 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url-1990903988 rename to crawl/converting-process/src/test/resources/html/work-set/url-1990903988 diff --git a/marginalia_nu/src/test/resources/html/work-set/url-2039310951 b/crawl/converting-process/src/test/resources/html/work-set/url-2039310951 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url-2039310951 rename to crawl/converting-process/src/test/resources/html/work-set/url-2039310951 diff --git a/marginalia_nu/src/test/resources/html/work-set/url-2040857056 b/crawl/converting-process/src/test/resources/html/work-set/url-2040857056 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url-2040857056 rename to crawl/converting-process/src/test/resources/html/work-set/url-2040857056 diff --git a/marginalia_nu/src/test/resources/html/work-set/url-2052613093 b/crawl/converting-process/src/test/resources/html/work-set/url-2052613093 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url-2052613093 rename to crawl/converting-process/src/test/resources/html/work-set/url-2052613093 diff --git a/marginalia_nu/src/test/resources/html/work-set/url-2063899866 b/crawl/converting-process/src/test/resources/html/work-set/url-2063899866 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url-2063899866 rename to crawl/converting-process/src/test/resources/html/work-set/url-2063899866 diff --git a/marginalia_nu/src/test/resources/html/work-set/url-2115548255 b/crawl/converting-process/src/test/resources/html/work-set/url-2115548255 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url-2115548255 rename to crawl/converting-process/src/test/resources/html/work-set/url-2115548255 diff --git 
a/marginalia_nu/src/test/resources/html/work-set/url-2127148436 b/crawl/converting-process/src/test/resources/html/work-set/url-2127148436 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url-2127148436 rename to crawl/converting-process/src/test/resources/html/work-set/url-2127148436 diff --git a/marginalia_nu/src/test/resources/html/work-set/url-2133781904 b/crawl/converting-process/src/test/resources/html/work-set/url-2133781904 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url-2133781904 rename to crawl/converting-process/src/test/resources/html/work-set/url-2133781904 diff --git a/marginalia_nu/src/test/resources/html/work-set/url-225690385 b/crawl/converting-process/src/test/resources/html/work-set/url-225690385 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url-225690385 rename to crawl/converting-process/src/test/resources/html/work-set/url-225690385 diff --git a/marginalia_nu/src/test/resources/html/work-set/url-226401955 b/crawl/converting-process/src/test/resources/html/work-set/url-226401955 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url-226401955 rename to crawl/converting-process/src/test/resources/html/work-set/url-226401955 diff --git a/marginalia_nu/src/test/resources/html/work-set/url-262970770 b/crawl/converting-process/src/test/resources/html/work-set/url-262970770 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url-262970770 rename to crawl/converting-process/src/test/resources/html/work-set/url-262970770 diff --git a/marginalia_nu/src/test/resources/html/work-set/url-30106798 b/crawl/converting-process/src/test/resources/html/work-set/url-30106798 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url-30106798 rename to crawl/converting-process/src/test/resources/html/work-set/url-30106798 diff --git 
a/marginalia_nu/src/test/resources/html/work-set/url-302167335 b/crawl/converting-process/src/test/resources/html/work-set/url-302167335 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url-302167335 rename to crawl/converting-process/src/test/resources/html/work-set/url-302167335 diff --git a/marginalia_nu/src/test/resources/html/work-set/url-327999153 b/crawl/converting-process/src/test/resources/html/work-set/url-327999153 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url-327999153 rename to crawl/converting-process/src/test/resources/html/work-set/url-327999153 diff --git a/marginalia_nu/src/test/resources/html/work-set/url-332568225 b/crawl/converting-process/src/test/resources/html/work-set/url-332568225 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url-332568225 rename to crawl/converting-process/src/test/resources/html/work-set/url-332568225 diff --git a/marginalia_nu/src/test/resources/html/work-set/url-343223418 b/crawl/converting-process/src/test/resources/html/work-set/url-343223418 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url-343223418 rename to crawl/converting-process/src/test/resources/html/work-set/url-343223418 diff --git a/marginalia_nu/src/test/resources/html/work-set/url-383103932 b/crawl/converting-process/src/test/resources/html/work-set/url-383103932 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url-383103932 rename to crawl/converting-process/src/test/resources/html/work-set/url-383103932 diff --git a/marginalia_nu/src/test/resources/html/work-set/url-412929678 b/crawl/converting-process/src/test/resources/html/work-set/url-412929678 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url-412929678 rename to crawl/converting-process/src/test/resources/html/work-set/url-412929678 diff --git 
a/marginalia_nu/src/test/resources/html/work-set/url-475213997 b/crawl/converting-process/src/test/resources/html/work-set/url-475213997 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url-475213997 rename to crawl/converting-process/src/test/resources/html/work-set/url-475213997 diff --git a/marginalia_nu/src/test/resources/html/work-set/url-483403121 b/crawl/converting-process/src/test/resources/html/work-set/url-483403121 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url-483403121 rename to crawl/converting-process/src/test/resources/html/work-set/url-483403121 diff --git a/marginalia_nu/src/test/resources/html/work-set/url-488667993 b/crawl/converting-process/src/test/resources/html/work-set/url-488667993 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url-488667993 rename to crawl/converting-process/src/test/resources/html/work-set/url-488667993 diff --git a/marginalia_nu/src/test/resources/html/work-set/url-50815201 b/crawl/converting-process/src/test/resources/html/work-set/url-50815201 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url-50815201 rename to crawl/converting-process/src/test/resources/html/work-set/url-50815201 diff --git a/marginalia_nu/src/test/resources/html/work-set/url-522685905 b/crawl/converting-process/src/test/resources/html/work-set/url-522685905 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url-522685905 rename to crawl/converting-process/src/test/resources/html/work-set/url-522685905 diff --git a/marginalia_nu/src/test/resources/html/work-set/url-570714305 b/crawl/converting-process/src/test/resources/html/work-set/url-570714305 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url-570714305 rename to crawl/converting-process/src/test/resources/html/work-set/url-570714305 diff --git 
a/marginalia_nu/src/test/resources/html/work-set/url-58733529 b/crawl/converting-process/src/test/resources/html/work-set/url-58733529 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url-58733529 rename to crawl/converting-process/src/test/resources/html/work-set/url-58733529 diff --git a/marginalia_nu/src/test/resources/html/work-set/url-616518304 b/crawl/converting-process/src/test/resources/html/work-set/url-616518304 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url-616518304 rename to crawl/converting-process/src/test/resources/html/work-set/url-616518304 diff --git a/marginalia_nu/src/test/resources/html/work-set/url-662169426 b/crawl/converting-process/src/test/resources/html/work-set/url-662169426 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url-662169426 rename to crawl/converting-process/src/test/resources/html/work-set/url-662169426 diff --git a/marginalia_nu/src/test/resources/html/work-set/url-677278788 b/crawl/converting-process/src/test/resources/html/work-set/url-677278788 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url-677278788 rename to crawl/converting-process/src/test/resources/html/work-set/url-677278788 diff --git a/marginalia_nu/src/test/resources/html/work-set/url-690486170 b/crawl/converting-process/src/test/resources/html/work-set/url-690486170 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url-690486170 rename to crawl/converting-process/src/test/resources/html/work-set/url-690486170 diff --git a/marginalia_nu/src/test/resources/html/work-set/url-709693331 b/crawl/converting-process/src/test/resources/html/work-set/url-709693331 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url-709693331 rename to crawl/converting-process/src/test/resources/html/work-set/url-709693331 diff --git 
a/marginalia_nu/src/test/resources/html/work-set/url-734531556 b/crawl/converting-process/src/test/resources/html/work-set/url-734531556 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url-734531556 rename to crawl/converting-process/src/test/resources/html/work-set/url-734531556 diff --git a/marginalia_nu/src/test/resources/html/work-set/url-767530276 b/crawl/converting-process/src/test/resources/html/work-set/url-767530276 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url-767530276 rename to crawl/converting-process/src/test/resources/html/work-set/url-767530276 diff --git a/marginalia_nu/src/test/resources/html/work-set/url-783154014 b/crawl/converting-process/src/test/resources/html/work-set/url-783154014 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url-783154014 rename to crawl/converting-process/src/test/resources/html/work-set/url-783154014 diff --git a/marginalia_nu/src/test/resources/html/work-set/url-796905237 b/crawl/converting-process/src/test/resources/html/work-set/url-796905237 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url-796905237 rename to crawl/converting-process/src/test/resources/html/work-set/url-796905237 diff --git a/marginalia_nu/src/test/resources/html/work-set/url-800099955 b/crawl/converting-process/src/test/resources/html/work-set/url-800099955 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url-800099955 rename to crawl/converting-process/src/test/resources/html/work-set/url-800099955 diff --git a/marginalia_nu/src/test/resources/html/work-set/url-804101946 b/crawl/converting-process/src/test/resources/html/work-set/url-804101946 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url-804101946 rename to crawl/converting-process/src/test/resources/html/work-set/url-804101946 diff --git 
a/marginalia_nu/src/test/resources/html/work-set/url-830664902 b/crawl/converting-process/src/test/resources/html/work-set/url-830664902 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url-830664902 rename to crawl/converting-process/src/test/resources/html/work-set/url-830664902 diff --git a/marginalia_nu/src/test/resources/html/work-set/url-876060686 b/crawl/converting-process/src/test/resources/html/work-set/url-876060686 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url-876060686 rename to crawl/converting-process/src/test/resources/html/work-set/url-876060686 diff --git a/marginalia_nu/src/test/resources/html/work-set/url-892584998 b/crawl/converting-process/src/test/resources/html/work-set/url-892584998 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url-892584998 rename to crawl/converting-process/src/test/resources/html/work-set/url-892584998 diff --git a/marginalia_nu/src/test/resources/html/work-set/url-942458463 b/crawl/converting-process/src/test/resources/html/work-set/url-942458463 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url-942458463 rename to crawl/converting-process/src/test/resources/html/work-set/url-942458463 diff --git a/marginalia_nu/src/test/resources/html/work-set/url-952036171 b/crawl/converting-process/src/test/resources/html/work-set/url-952036171 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url-952036171 rename to crawl/converting-process/src/test/resources/html/work-set/url-952036171 diff --git a/marginalia_nu/src/test/resources/html/work-set/url-968207276 b/crawl/converting-process/src/test/resources/html/work-set/url-968207276 similarity index 100% rename from marginalia_nu/src/test/resources/html/work-set/url-968207276 rename to crawl/converting-process/src/test/resources/html/work-set/url-968207276 diff --git a/crawl/crawl-job-extractor-process/build.gradle 
b/crawl/crawl-job-extractor-process/build.gradle new file mode 100644 index 00000000..0b5d6057 --- /dev/null +++ b/crawl/crawl-job-extractor-process/build.gradle @@ -0,0 +1,47 @@ +plugins { + id 'java' + id "io.freefair.lombok" version "5.3.3.3" + id 'application' + id 'jvm-test-suite' +} + +java { + toolchain { + languageVersion.set(JavaLanguageVersion.of(17)) + } +} + +application { + mainClass = 'nu.marginalia.crawl.CrawlJobExtractorMain' + applicationName = 'crawl-job-extractor-process' +} + +dependencies { + implementation project(':common:model') + implementation project(':common:service') + implementation project(':crawl:crawling-model') + implementation project(':crawl:common') + + implementation libs.lombok + annotationProcessor libs.lombok + implementation libs.bundles.slf4j + + implementation libs.bundles.mariadb + implementation libs.guice + implementation libs.gson + implementation libs.zstd + + testImplementation libs.bundles.slf4j.test + testImplementation libs.bundles.junit + testImplementation libs.mockito +} + +test { + useJUnitPlatform() +} + +task fastTests(type: Test) { + useJUnitPlatform { + excludeTags "slow" + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlJobExtractorMain.java b/crawl/crawl-job-extractor-process/src/main/java/nu/marginalia/crawl/CrawlJobDomainExtractor.java similarity index 65% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlJobExtractorMain.java rename to crawl/crawl-job-extractor-process/src/main/java/nu/marginalia/crawl/CrawlJobDomainExtractor.java index 2c3a373b..f0a2fda2 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlJobExtractorMain.java +++ b/crawl/crawl-job-extractor-process/src/main/java/nu/marginalia/crawl/CrawlJobDomainExtractor.java @@ -1,29 +1,24 @@ -package nu.marginalia.wmsa.edge.crawling; +package nu.marginalia.crawl; -import com.github.luben.zstd.ZstdOutputStream; import com.google.common.hash.HashFunction; 
import com.google.common.hash.Hashing; -import com.google.gson.Gson; import com.zaxxer.hikari.HikariDataSource; -import nu.marginalia.wmsa.client.GsonFactory; -import nu.marginalia.wmsa.configuration.module.DatabaseModule; -import nu.marginalia.wmsa.edge.crawling.model.CrawlingSpecification; -import nu.marginalia.wmsa.edge.dbcommon.EdgeDomainBlacklistImpl; -import nu.marginalia.wmsa.edge.model.EdgeDomain; -import org.mariadb.jdbc.Driver; +import nu.marginalia.crawling.model.CrawlingSpecification; +import nu.marginalia.model.EdgeDomain; +import nu.marginalia.model.dbcommon.EdgeDomainBlacklistImpl; -import java.io.BufferedOutputStream; -import java.io.FileOutputStream; -import java.io.IOException; -import java.io.PrintWriter; -import java.nio.file.Path; import java.sql.Connection; import java.sql.ResultSet; import java.sql.SQLException; -import java.util.*; +import java.util.ArrayList; +import java.util.Comparator; +import java.util.HashSet; +import java.util.Set; import java.util.stream.Stream; -public class CrawlJobExtractorMain { +public class CrawlJobDomainExtractor { + private static final int MIN_VISIT_COUNT = 1000; + private static final int MAX_VISIT_COUNT = 100000; private static final String specificDomainSql = """ @@ -72,66 +67,22 @@ public class CrawlJobExtractorMain { AND VISITED ; """; - private static final int MIN_VISIT_COUNT = 1000; - private static final int MAX_VISIT_COUNT = 100000; + private final EdgeDomainBlacklistImpl blacklist; - - private final Connection conn; + private final HikariDataSource dataSource; private static final HashFunction hasher = Hashing.murmur3_128(0); - public static void main(String... 
args) throws SQLException, IOException { - Driver driver = new Driver(); - var outFile = Path.of(args[0]); - - Gson gson = GsonFactory.get(); - String[] targetDomains = Arrays.stream(args).skip(1).toArray(String[]::new); - - - try (var out = new PrintWriter(new ZstdOutputStream(new BufferedOutputStream(new FileOutputStream(outFile.toFile()))))) { - final var extractor = new CrawlJobExtractorMain(new DatabaseModule().provideConnection()); - final Stream jobs; - - if (targetDomains.length > 0) { - jobs = Arrays.stream(targetDomains).map(EdgeDomain::new).map(extractor::extractDomain); - } else { - jobs = extractor.extractDomains(); - } - - jobs.map(gson::toJson).forEach(out::println); - } + public CrawlJobDomainExtractor(EdgeDomainBlacklistImpl blacklist, HikariDataSource dataSource) { + this.blacklist = blacklist; + this.dataSource = dataSource; } - public static void writeSpec(Path outFile, String domain, List urls) throws IOException { - Gson gson = GsonFactory.get(); - - try (var out = new PrintWriter(new ZstdOutputStream(new BufferedOutputStream(new FileOutputStream(outFile.toFile()))))) { - var job = new CrawlingSpecification(); - job.crawlDepth = urls.size(); - job.domain = domain; - job.id = createId(new EdgeDomain(domain)); - job.urls = urls; - out.println(gson.toJson(job)); - } - } - - public static void writeSpec(Path outFile, CrawlingSpecification... 
specs) throws IOException { - Gson gson = GsonFactory.get(); - - try (var out = new PrintWriter(new ZstdOutputStream(new BufferedOutputStream(new FileOutputStream(outFile.toFile()))))) { - for (var spec : specs) { - out.println(gson.toJson(spec)); - } - } - } - - private record DomainWithId(String domainName, int id) { - } - - private Stream extractDomains() { + public Stream extractDomainsFromQueue() { Set ids = new HashSet<>(1_000_000); - try (var stmtDomains = conn.prepareStatement(domainsSql); + try (var conn = dataSource.getConnection(); + var stmtDomains = conn.prepareStatement(domainsSql); var stmtQueue = conn.prepareStatement(queuedDomainsSql); ) { ResultSet rsp; @@ -157,44 +108,15 @@ public class CrawlJobExtractorMain { .map(this::createCrawlJobForDomain); } - private CrawlingSpecification createCrawlJobForDomain(DomainWithId domainWithId) { - var spec = new CrawlingSpecification(); - spec.id = createId(domainWithId); - spec.domain = domainWithId.domainName; - spec.urls = new ArrayList<>(); - spec.crawlDepth = getCrawlDepth(domainWithId); - - try (var stmt = conn.prepareStatement(urlsSql)) { - stmt.setFetchSize(1000); - stmt.setInt(1, domainWithId.id); - var rsp = stmt.executeQuery(); - - while (rsp.next()) { - spec.urls.add(rsp.getString(1)); - } - } - catch (SQLException ex) { - ex.printStackTrace(); - } - - spec.urls.sort(Comparator.naturalOrder()); - - return spec; - } - - public CrawlJobExtractorMain(HikariDataSource ds) throws SQLException { - blacklist = new EdgeDomainBlacklistImpl(ds); - conn = ds.getConnection(); - } - public CrawlingSpecification extractDomain(EdgeDomain domain) { CrawlingSpecification spec = new CrawlingSpecification(); + spec.domain = domain.toString(); spec.id = createId(domain); spec.urls = new ArrayList<>(1000); - - try (var domainQuery = conn.prepareStatement(specificDomainSql); + try (var conn = dataSource.getConnection(); + var domainQuery = conn.prepareStatement(specificDomainSql); var urlQuery = 
conn.prepareStatement(urlsSql)) { domainQuery.setString(1, domain.toString()); @@ -222,6 +144,36 @@ public class CrawlJobExtractorMain { return spec; } + private record DomainWithId(String domainName, int id) { + + + } + + private CrawlingSpecification createCrawlJobForDomain(DomainWithId domainWithId) { + var spec = new CrawlingSpecification(); + spec.id = createId(domainWithId); + spec.domain = domainWithId.domainName; + spec.urls = new ArrayList<>(); + spec.crawlDepth = getCrawlDepth(domainWithId); + + try (var conn = dataSource.getConnection(); + var stmt = conn.prepareStatement(urlsSql)) { + stmt.setFetchSize(1000); + stmt.setInt(1, domainWithId.id); + var rsp = stmt.executeQuery(); + + while (rsp.next()) { + spec.urls.add(rsp.getString(1)); + } + } + catch (SQLException ex) { + ex.printStackTrace(); + } + + spec.urls.sort(Comparator.naturalOrder()); + + return spec; + } private static String createId(DomainWithId domainWithId) { return hasher.hashUnencodedChars(domainWithId.domainName).toString(); @@ -232,7 +184,8 @@ public class CrawlJobExtractorMain { } private int getCrawlDepth(DomainWithId domainWithId) { - try (var domainQuery = conn.prepareStatement(visitedUrlsSql)) { + try (var conn = dataSource.getConnection(); + var domainQuery = conn.prepareStatement(visitedUrlsSql)) { domainQuery.setInt(1, domainWithId.id); var rsp = domainQuery.executeQuery(); if (rsp.next()) { @@ -258,4 +211,5 @@ public class CrawlJobExtractorMain { return count; } + } diff --git a/crawl/crawl-job-extractor-process/src/main/java/nu/marginalia/crawl/CrawlJobExtractorMain.java b/crawl/crawl-job-extractor-process/src/main/java/nu/marginalia/crawl/CrawlJobExtractorMain.java new file mode 100644 index 00000000..0f53b2dc --- /dev/null +++ b/crawl/crawl-job-extractor-process/src/main/java/nu/marginalia/crawl/CrawlJobExtractorMain.java @@ -0,0 +1,49 @@ +package nu.marginalia.crawl; + +import nu.marginalia.crawling.model.CrawlingSpecification; +import nu.marginalia.model.EdgeDomain; 
+import nu.marginalia.model.dbcommon.EdgeDomainBlacklistImpl; +import nu.marginalia.service.module.DatabaseModule; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.*; +import java.util.stream.Stream; + +public class CrawlJobExtractorMain { + + public static void main(String... args) throws IOException { + if (args.length == 0) { + System.out.println("Parameters: outputfile.spec [domain1, domain2, ...]"); + System.out.println(); + System.out.println("If no domains are provided, a full crawl spec is created from the database"); + return; + } + + Path outFile = Path.of(args[0]); + if (Files.exists(outFile)) { + System.err.println("Out file " + outFile + " already exists, remove it first!"); + return; + } + + String[] targetDomains = Arrays.copyOfRange(args, 1, args.length); + + try (CrawlJobSpecWriter out = new CrawlJobSpecWriter(outFile)) + { + streamSpecs(targetDomains).forEach(out::accept); + } + } + + private static Stream streamSpecs(String[] targetDomains) { + var ds = new DatabaseModule().provideConnection(); + var domainExtractor = new CrawlJobDomainExtractor(new EdgeDomainBlacklistImpl(ds), ds); + + if (targetDomains.length > 0) { + return Arrays.stream(targetDomains).map(EdgeDomain::new).map(domainExtractor::extractDomain); + } else { + return domainExtractor.extractDomainsFromQueue(); + } + } + +} diff --git a/crawl/crawl-job-extractor-process/src/main/java/nu/marginalia/crawl/CrawlJobSpecWriter.java b/crawl/crawl-job-extractor-process/src/main/java/nu/marginalia/crawl/CrawlJobSpecWriter.java new file mode 100644 index 00000000..f853173f --- /dev/null +++ b/crawl/crawl-job-extractor-process/src/main/java/nu/marginalia/crawl/CrawlJobSpecWriter.java @@ -0,0 +1,27 @@ +package nu.marginalia.crawl; + +import com.github.luben.zstd.ZstdOutputStream; +import com.google.gson.Gson; +import nu.marginalia.crawling.model.CrawlingSpecification; +import nu.marginalia.model.gson.GsonFactory; + +import java.io.*; 
+import java.nio.file.Path; + +public class CrawlJobSpecWriter implements AutoCloseable { + + private final PrintWriter writer; + private final Gson gson = GsonFactory.get(); + + public CrawlJobSpecWriter(Path fileName) throws IOException { + writer = new PrintWriter(new ZstdOutputStream(new BufferedOutputStream(new FileOutputStream(fileName.toFile())))); + } + + public void accept(CrawlingSpecification crawlingSpecification) { + gson.toJson(crawlingSpecification, writer); + } + + public void close() { + writer.close(); + } +} diff --git a/crawl/crawl-job-extractor-process/src/test/java/nu/marginalia/crawl/CrawlJobSpecWriterTest.java b/crawl/crawl-job-extractor-process/src/test/java/nu/marginalia/crawl/CrawlJobSpecWriterTest.java new file mode 100644 index 00000000..dc082d0a --- /dev/null +++ b/crawl/crawl-job-extractor-process/src/test/java/nu/marginalia/crawl/CrawlJobSpecWriterTest.java @@ -0,0 +1,44 @@ +package nu.marginalia.crawl; + +import nu.marginalia.crawling.common.plan.CrawlerSpecificationLoader; +import nu.marginalia.crawling.model.CrawlingSpecification; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.List; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +public class CrawlJobSpecWriterTest { + + Path tempFile; + + @BeforeEach + public void setUp() throws IOException { + tempFile = Files.createTempFile(getClass().getSimpleName(), "tmp"); + } + + @AfterEach + public void tearDown() throws IOException { + Files.delete(tempFile); + } + + @Test + public void testReadWrite() throws IOException { + try (CrawlJobSpecWriter writer = new CrawlJobSpecWriter(tempFile)) { + writer.accept(new CrawlingSpecification("first",1, "test1", List.of("a", "b", "c"))); + writer.accept(new CrawlingSpecification("second",1, "test2", List.of("a", "b", "c", 
"d"))); + writer.accept(new CrawlingSpecification("third",1, "test3", List.of("a", "b"))); + } + + List outputs = new ArrayList<>(); + CrawlerSpecificationLoader.readInputSpec(tempFile, outputs::add); + + assertEquals(outputs.size(), 3); + } +} diff --git a/crawl/crawling-model/build.gradle b/crawl/crawling-model/build.gradle new file mode 100644 index 00000000..eb72bbda --- /dev/null +++ b/crawl/crawling-model/build.gradle @@ -0,0 +1,47 @@ +plugins { + id 'java' + id "io.freefair.lombok" version "5.3.3.3" + + id 'jvm-test-suite' +} + + +java { + toolchain { + languageVersion.set(JavaLanguageVersion.of(17)) + } +} + +dependencies { + implementation project(':third-party') + implementation project(':protocol') + implementation project(':common:model') + implementation project(':libraries:misc') + implementation project(':api:index-api') + implementation project(':common:service-discovery') + implementation project(':common:service-client') + implementation project(':libraries:language-processing') + + implementation libs.lombok + annotationProcessor libs.lombok + implementation libs.bundles.slf4j + + implementation libs.notnull + + implementation libs.gson + implementation libs.zstd + + testImplementation libs.bundles.slf4j.test + testImplementation libs.bundles.junit + testImplementation libs.mockito +} + +test { + useJUnitPlatform() +} + +task fastTests(type: Test) { + useJUnitPlatform { + excludeTags "slow" + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawledDomainReader.java b/crawl/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainReader.java similarity index 87% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawledDomainReader.java rename to crawl/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainReader.java index cb7aa8f9..49dee5b3 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawledDomainReader.java +++ 
b/crawl/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainReader.java @@ -1,11 +1,13 @@ -package nu.marginalia.wmsa.edge.crawling; +package nu.marginalia.crawling.io; import com.github.luben.zstd.ZstdInputStream; import com.google.gson.Gson; import jdkoverride.LargeLineBufferedReader; -import nu.marginalia.wmsa.client.GsonFactory; -import nu.marginalia.wmsa.edge.crawling.model.CrawledDocument; -import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain; +import nu.marginalia.crawling.model.CrawledDocument; +import nu.marginalia.crawling.model.CrawledDomain; +import nu.marginalia.model.gson.GsonFactory; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import java.io.FileInputStream; import java.io.IOException; @@ -18,7 +20,7 @@ import java.util.concurrent.TimeUnit; public class CrawledDomainReader { private final Gson gson = GsonFactory.get(); - + private final Logger logger = LoggerFactory.getLogger(getClass()); private final ForkJoinPool pool = new ForkJoinPool(6); public CrawledDomainReader() { @@ -61,6 +63,8 @@ public class CrawledDomainReader { return read(path); } catch (Exception ex) { + logger.warn("Failed to read domain", ex); + throw new RuntimeException(ex); } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawledDomainWriter.java b/crawl/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainWriter.java similarity index 93% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawledDomainWriter.java rename to crawl/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainWriter.java index 40dc1f8c..51ffab18 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawledDomainWriter.java +++ b/crawl/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainWriter.java @@ -1,10 +1,10 @@ -package nu.marginalia.wmsa.edge.crawling; +package nu.marginalia.crawling.io; import com.github.luben.zstd.ZstdOutputStream; import 
com.google.gson.Gson; import lombok.SneakyThrows; -import nu.marginalia.wmsa.client.GsonFactory; -import nu.marginalia.wmsa.edge.crawling.model.SerializableCrawlData; +import nu.marginalia.crawling.model.SerializableCrawlData; +import nu.marginalia.model.gson.GsonFactory; import org.slf4j.Logger; import org.slf4j.LoggerFactory; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/model/CrawlLogEntry.java b/crawl/crawling-model/src/main/java/nu/marginalia/crawling/model/CrawlLogEntry.java similarity index 61% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/model/CrawlLogEntry.java rename to crawl/crawling-model/src/main/java/nu/marginalia/crawling/model/CrawlLogEntry.java index cdad9a70..213bef82 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/model/CrawlLogEntry.java +++ b/crawl/crawling-model/src/main/java/nu/marginalia/crawling/model/CrawlLogEntry.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.crawling.model; +package nu.marginalia.crawling.model; public record CrawlLogEntry(String id, String ts, String path, int cnt) { } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/model/CrawledDocument.java b/crawl/crawling-model/src/main/java/nu/marginalia/crawling/model/CrawledDocument.java similarity index 81% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/model/CrawledDocument.java rename to crawl/crawling-model/src/main/java/nu/marginalia/crawling/model/CrawledDocument.java index 3250630f..a76dcb7f 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/model/CrawledDocument.java +++ b/crawl/crawling-model/src/main/java/nu/marginalia/crawling/model/CrawledDocument.java @@ -1,9 +1,8 @@ -package nu.marginalia.wmsa.edge.crawling.model; +package nu.marginalia.crawling.model; import lombok.Builder; import lombok.ToString; -import nu.marginalia.util.bigstring.BigString; -import nu.marginalia.util.bigstring.CompressedBigString; 
+import nu.marginalia.bigstring.BigString; @Builder @ToString diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/model/CrawledDomain.java b/crawl/crawling-model/src/main/java/nu/marginalia/crawling/model/CrawledDomain.java similarity index 93% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/model/CrawledDomain.java rename to crawl/crawling-model/src/main/java/nu/marginalia/crawling/model/CrawledDomain.java index a4c365d7..cfa39479 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/model/CrawledDomain.java +++ b/crawl/crawling-model/src/main/java/nu/marginalia/crawling/model/CrawledDomain.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.crawling.model; +package nu.marginalia.crawling.model; import lombok.AllArgsConstructor; import lombok.Builder; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/model/CrawlerDocumentStatus.java b/crawl/crawling-model/src/main/java/nu/marginalia/crawling/model/CrawlerDocumentStatus.java similarity index 76% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/model/CrawlerDocumentStatus.java rename to crawl/crawling-model/src/main/java/nu/marginalia/crawling/model/CrawlerDocumentStatus.java index 0a9b0e0a..2369bcc6 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/model/CrawlerDocumentStatus.java +++ b/crawl/crawling-model/src/main/java/nu/marginalia/crawling/model/CrawlerDocumentStatus.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.crawling.model; +package nu.marginalia.crawling.model; public enum CrawlerDocumentStatus { OK, diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/model/CrawlerDomainStatus.java b/crawl/crawling-model/src/main/java/nu/marginalia/crawling/model/CrawlerDomainStatus.java similarity index 59% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/model/CrawlerDomainStatus.java rename to 
crawl/crawling-model/src/main/java/nu/marginalia/crawling/model/CrawlerDomainStatus.java index 1c22067c..12a31c52 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/model/CrawlerDomainStatus.java +++ b/crawl/crawling-model/src/main/java/nu/marginalia/crawling/model/CrawlerDomainStatus.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.crawling.model; +package nu.marginalia.crawling.model; public enum CrawlerDomainStatus { OK, ERROR, BLOCKED, REDIRECT diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/model/CrawlingSpecification.java b/crawl/crawling-model/src/main/java/nu/marginalia/crawling/model/CrawlingSpecification.java similarity index 62% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/model/CrawlingSpecification.java rename to crawl/crawling-model/src/main/java/nu/marginalia/crawling/model/CrawlingSpecification.java index 57298c84..696c5e43 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/model/CrawlingSpecification.java +++ b/crawl/crawling-model/src/main/java/nu/marginalia/crawling/model/CrawlingSpecification.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.crawling.model; +package nu.marginalia.crawling.model; import lombok.AllArgsConstructor; import lombok.NoArgsConstructor; @@ -14,4 +14,9 @@ public class CrawlingSpecification { // Don't make this EdgeUrl, EdgeDomain etc. -- we want this plastic to change! 
public String domain; public List urls; + + @Override + public String toString() { + return getClass().getSimpleName() + "[" + id + "/" + domain + ": " + crawlDepth + "[ " + urls.size() + "]"; + } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/model/SerializableCrawlData.java b/crawl/crawling-model/src/main/java/nu/marginalia/crawling/model/SerializableCrawlData.java similarity index 61% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/model/SerializableCrawlData.java rename to crawl/crawling-model/src/main/java/nu/marginalia/crawling/model/SerializableCrawlData.java index 015ea743..c9804d54 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/model/SerializableCrawlData.java +++ b/crawl/crawling-model/src/main/java/nu/marginalia/crawling/model/SerializableCrawlData.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.crawling.model; +package nu.marginalia.crawling.model; public interface SerializableCrawlData { String getSerialIdentifier(); diff --git a/crawl/crawling-process/build.gradle b/crawl/crawling-process/build.gradle new file mode 100644 index 00000000..98f70752 --- /dev/null +++ b/crawl/crawling-process/build.gradle @@ -0,0 +1,62 @@ +plugins { + id 'java' + id "io.freefair.lombok" version "5.3.3.3" + id 'application' + id 'jvm-test-suite' +} + +java { + toolchain { + languageVersion.set(JavaLanguageVersion.of(17)) + } +} + +application { + mainClass = 'nu.marginalia.crawl.CrawlerMain' + applicationName = 'crawler-process' +} + +dependencies { + implementation project(':third-party') + implementation project(':protocol') + implementation project(':common:model') + implementation project(':common:config') + implementation project(':common:service') + implementation project(':libraries:misc') + implementation project(':api:index-api') + implementation project(':common:service-discovery') + implementation project(':common:service-client') + implementation 
project(':libraries:language-processing') + implementation project(':crawl:common') + implementation project(':crawl:crawling-model') + implementation project(':crawl:converting-model') + + implementation libs.lombok + annotationProcessor libs.lombok + implementation libs.bundles.slf4j + + implementation libs.notnull + implementation libs.guice + implementation libs.gson + implementation libs.zstd + implementation libs.crawlercommons + implementation libs.okhttp3 + implementation libs.jsoup + implementation libs.opencsv + implementation libs.rxjava + implementation libs.bundles.mariadb + + testImplementation libs.bundles.slf4j.test + testImplementation libs.bundles.junit + testImplementation libs.mockito +} + +test { + useJUnitPlatform() +} + +task fastTests(type: Test) { + useJUnitPlatform { + excludeTags "slow" + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlerMain.java b/crawl/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java similarity index 74% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlerMain.java rename to crawl/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java index 54283e98..c99f62c6 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlerMain.java +++ b/crawl/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java @@ -1,13 +1,17 @@ -package nu.marginalia.wmsa.edge.crawling; +package nu.marginalia.crawl; import com.google.gson.Gson; import com.google.gson.GsonBuilder; -import nu.marginalia.wmsa.configuration.UserAgent; -import nu.marginalia.wmsa.configuration.WmsaHome; -import nu.marginalia.wmsa.edge.crawling.model.CrawlingSpecification; -import nu.marginalia.wmsa.edge.crawling.retreival.CrawlerRetreiver; -import nu.marginalia.wmsa.edge.crawling.retreival.HttpFetcher; -import nu.marginalia.wmsa.edge.model.EdgeCrawlPlan; +import nu.marginalia.UserAgent; +import nu.marginalia.WmsaHome; +import 
nu.marginalia.crawling.common.AbortMonitor; +import nu.marginalia.crawling.common.WorkLog; +import nu.marginalia.crawling.common.plan.CrawlPlanLoader; +import nu.marginalia.crawling.common.plan.EdgeCrawlPlan; +import nu.marginalia.crawling.io.CrawledDomainWriter; +import nu.marginalia.crawling.model.CrawlingSpecification; +import nu.marginalia.crawl.retreival.CrawlerRetreiver; +import nu.marginalia.crawl.retreival.HttpFetcher; import okhttp3.ConnectionPool; import okhttp3.Dispatcher; import okhttp3.internal.Util; @@ -18,7 +22,6 @@ import java.nio.file.Path; import java.util.concurrent.*; public class CrawlerMain implements AutoCloseable { - public static Gson gson = new GsonBuilder().create(); private final Logger logger = LoggerFactory.getLogger(getClass()); private final EdgeCrawlPlan plan; @@ -36,6 +39,9 @@ public class CrawlerMain implements AutoCloseable { final int poolSize = Integer.getInteger("crawler.pool-size", 512); final int poolQueueSize = 32; + AbortMonitor abortMonitor = AbortMonitor.getInstance(); + Semaphore taskSem = new Semaphore(poolSize); + public CrawlerMain(EdgeCrawlPlan plan) throws Exception { this.plan = plan; this.userAgent = WmsaHome.getUserAgent(); @@ -66,6 +72,35 @@ public class CrawlerMain implements AutoCloseable { System.exit(0); } + public void run() throws InterruptedException { + // First a validation run to ensure the file is all good to parse + logger.info("Validating JSON"); + plan.forEachCrawlingSpecification(unused -> {}); + + logger.info("Let's go"); + + plan.forEachCrawlingSpecification(this::startCrawlTask); + } + + private void startCrawlTask(CrawlingSpecification crawlingSpecification) { + if (abortMonitor.isAlive()) { + try { + taskSem.acquire(); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); throw new RuntimeException(e); + } + + pool.execute(() -> { + try { + fetchDomain(crawlingSpecification); + } + finally { + taskSem.release(); + } + }); + } + } + private void fetchDomain(CrawlingSpecification specification) { if 
(workLog.isJobFinished(specification.id)) return; @@ -85,40 +120,6 @@ public class CrawlerMain implements AutoCloseable { } } - public void run() throws InterruptedException { - // First a validation run to ensure the file is all good to parse - - logger.info("Validating JSON"); - plan.forEachCrawlingSpecification(unused -> {}); - - logger.info("Let's go"); - - AbortMonitor abortMonitor = AbortMonitor.getInstance(); - - Semaphore taskSem = new Semaphore(poolSize); - - plan.forEachCrawlingSpecification(spec -> { - if (abortMonitor.isAlive()) { - try { - taskSem.acquire(); - } catch (InterruptedException e) { - throw new RuntimeException(e); - } - - pool.execute(() -> { - try { - fetchDomain(spec); - } - finally { - taskSem.release(); - } - }); - } - }); - - - } - public void close() throws Exception { logger.info("Awaiting termination"); pool.shutdown(); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/Cookies.java b/crawl/crawling-process/src/main/java/nu/marginalia/crawl/retreival/Cookies.java similarity index 95% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/Cookies.java rename to crawl/crawling-process/src/main/java/nu/marginalia/crawl/retreival/Cookies.java index b19478ea..7b43321e 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/Cookies.java +++ b/crawl/crawling-process/src/main/java/nu/marginalia/crawl/retreival/Cookies.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.crawling.retreival; +package nu.marginalia.crawl.retreival; import okhttp3.Cookie; import okhttp3.CookieJar; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/CrawlerRetreiver.java b/crawl/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java similarity index 95% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/CrawlerRetreiver.java rename to 
crawl/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java index 9e4242c5..b990ddc9 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/CrawlerRetreiver.java +++ b/crawl/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java @@ -1,16 +1,15 @@ -package nu.marginalia.wmsa.edge.crawling.retreival; +package nu.marginalia.crawl.retreival; import com.google.common.hash.HashFunction; import com.google.common.hash.Hashing; import lombok.SneakyThrows; -import nu.marginalia.wmsa.edge.converting.processor.logic.LinkParser; -import nu.marginalia.wmsa.edge.crawling.CrawledDomainWriter; -import nu.marginalia.wmsa.edge.crawling.blocklist.GeoIpBlocklist; -import nu.marginalia.wmsa.edge.crawling.blocklist.IpBlockList; -import nu.marginalia.wmsa.edge.crawling.blocklist.UrlBlocklist; -import nu.marginalia.wmsa.edge.crawling.model.*; -import nu.marginalia.wmsa.edge.model.EdgeDomain; -import nu.marginalia.wmsa.edge.model.EdgeUrl; +import nu.marginalia.crawling.common.blocklist.GeoIpBlocklist; +import nu.marginalia.crawling.common.blocklist.IpBlockList; +import nu.marginalia.crawling.common.blocklist.UrlBlocklist; +import nu.marginalia.crawling.common.link.LinkParser; +import nu.marginalia.crawling.model.*; +import nu.marginalia.model.EdgeDomain; +import nu.marginalia.model.EdgeUrl; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.slf4j.Logger; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/FastTerminatingSocketFactory.java b/crawl/crawling-process/src/main/java/nu/marginalia/crawl/retreival/FastTerminatingSocketFactory.java similarity index 96% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/FastTerminatingSocketFactory.java rename to crawl/crawling-process/src/main/java/nu/marginalia/crawl/retreival/FastTerminatingSocketFactory.java index f9a889b1..8679f09d 100644 --- 
a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/FastTerminatingSocketFactory.java +++ b/crawl/crawling-process/src/main/java/nu/marginalia/crawl/retreival/FastTerminatingSocketFactory.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.crawling.retreival; +package nu.marginalia.crawl.retreival; import javax.net.SocketFactory; import java.io.IOException; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/HttpFetcher.java b/crawl/crawling-process/src/main/java/nu/marginalia/crawl/retreival/HttpFetcher.java similarity index 95% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/HttpFetcher.java rename to crawl/crawling-process/src/main/java/nu/marginalia/crawl/retreival/HttpFetcher.java index b7074825..141d7970 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/HttpFetcher.java +++ b/crawl/crawling-process/src/main/java/nu/marginalia/crawl/retreival/HttpFetcher.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.crawling.retreival; +package nu.marginalia.crawl.retreival; import com.google.inject.Inject; import com.google.inject.name.Named; @@ -7,14 +7,14 @@ import crawlercommons.robots.SimpleRobotRulesParser; import lombok.AllArgsConstructor; import lombok.SneakyThrows; import lombok.ToString; -import nu.marginalia.util.bigstring.BigString; -import nu.marginalia.wmsa.edge.crawling.model.CrawledDocument; -import nu.marginalia.wmsa.edge.crawling.model.CrawlerDocumentStatus; -import nu.marginalia.wmsa.edge.crawling.retreival.logic.ContentTypeLogic; -import nu.marginalia.wmsa.edge.crawling.retreival.logic.ContentTypeParser; -import nu.marginalia.wmsa.edge.model.EdgeDomain; -import nu.marginalia.wmsa.edge.model.EdgeUrl; -import nu.marginalia.wmsa.edge.model.crawl.EdgeContentType; +import nu.marginalia.crawling.model.CrawledDocument; +import nu.marginalia.crawling.model.CrawlerDocumentStatus; +import nu.marginalia.model.crawl.EdgeContentType; 
+import nu.marginalia.model.EdgeDomain; +import nu.marginalia.model.EdgeUrl; +import nu.marginalia.bigstring.BigString; +import nu.marginalia.crawl.retreival.logic.ContentTypeLogic; +import nu.marginalia.crawl.retreival.logic.ContentTypeParser; import okhttp3.*; import org.apache.commons.io.input.BOMInputStream; import org.slf4j.Logger; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/HttpRedirectResolver.java b/crawl/crawling-process/src/main/java/nu/marginalia/crawl/retreival/HttpRedirectResolver.java similarity index 93% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/HttpRedirectResolver.java rename to crawl/crawling-process/src/main/java/nu/marginalia/crawl/retreival/HttpRedirectResolver.java index 5857c19b..48a0de91 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/HttpRedirectResolver.java +++ b/crawl/crawling-process/src/main/java/nu/marginalia/crawl/retreival/HttpRedirectResolver.java @@ -1,13 +1,13 @@ -package nu.marginalia.wmsa.edge.crawling.retreival; +package nu.marginalia.crawl.retreival; import com.google.inject.Inject; import com.google.inject.Singleton; import com.google.inject.name.Named; import io.reactivex.rxjava3.core.Observable; import lombok.SneakyThrows; -import nu.marginalia.wmsa.client.exception.NetworkException; -import nu.marginalia.wmsa.edge.converting.processor.logic.LinkParser; -import nu.marginalia.wmsa.edge.model.EdgeUrl; +import nu.marginalia.crawling.common.link.LinkParser; +import nu.marginalia.model.EdgeUrl; +import nu.marginalia.client.exception.NetworkException; import okhttp3.OkHttpClient; import okhttp3.Request; import okhttp3.Response; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/NoSecuritySSL.java b/crawl/crawling-process/src/main/java/nu/marginalia/crawl/retreival/NoSecuritySSL.java similarity index 93% rename from 
marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/NoSecuritySSL.java rename to crawl/crawling-process/src/main/java/nu/marginalia/crawl/retreival/NoSecuritySSL.java index b276bd9b..225bea97 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/NoSecuritySSL.java +++ b/crawl/crawling-process/src/main/java/nu/marginalia/crawl/retreival/NoSecuritySSL.java @@ -1,9 +1,8 @@ -package nu.marginalia.wmsa.edge.crawling.retreival; +package nu.marginalia.crawl.retreival; import lombok.SneakyThrows; import javax.net.ssl.*; -import java.security.cert.CertificateException; import java.security.cert.X509Certificate; public class NoSecuritySSL { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/RateLimitException.java b/crawl/crawling-process/src/main/java/nu/marginalia/crawl/retreival/RateLimitException.java similarity index 90% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/RateLimitException.java rename to crawl/crawling-process/src/main/java/nu/marginalia/crawl/retreival/RateLimitException.java index ac28dca9..5e0c57dd 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/RateLimitException.java +++ b/crawl/crawling-process/src/main/java/nu/marginalia/crawl/retreival/RateLimitException.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.crawling.retreival; +package nu.marginalia.crawl.retreival; public class RateLimitException extends Exception { private final String retryAfter; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/logic/ContentTypeLogic.java b/crawl/crawling-process/src/main/java/nu/marginalia/crawl/retreival/logic/ContentTypeLogic.java similarity index 94% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/logic/ContentTypeLogic.java rename to crawl/crawling-process/src/main/java/nu/marginalia/crawl/retreival/logic/ContentTypeLogic.java index 
9d05026c..c5860913 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/logic/ContentTypeLogic.java +++ b/crawl/crawling-process/src/main/java/nu/marginalia/crawl/retreival/logic/ContentTypeLogic.java @@ -1,6 +1,6 @@ -package nu.marginalia.wmsa.edge.crawling.retreival.logic; +package nu.marginalia.crawl.retreival.logic; -import nu.marginalia.wmsa.edge.model.EdgeUrl; +import nu.marginalia.model.EdgeUrl; import java.util.List; import java.util.Set; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/logic/ContentTypeParser.java b/crawl/crawling-process/src/main/java/nu/marginalia/crawl/retreival/logic/ContentTypeParser.java similarity index 96% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/logic/ContentTypeParser.java rename to crawl/crawling-process/src/main/java/nu/marginalia/crawl/retreival/logic/ContentTypeParser.java index 2f3359f3..62d21ba9 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/logic/ContentTypeParser.java +++ b/crawl/crawling-process/src/main/java/nu/marginalia/crawl/retreival/logic/ContentTypeParser.java @@ -1,7 +1,7 @@ -package nu.marginalia.wmsa.edge.crawling.retreival.logic; +package nu.marginalia.crawl.retreival.logic; import crawlercommons.mimetypes.MimeTypeDetector; -import nu.marginalia.wmsa.edge.model.crawl.EdgeContentType; +import nu.marginalia.model.crawl.EdgeContentType; import org.jsoup.Jsoup; import java.util.Arrays; diff --git a/marginalia_nu/src/main/resources/ip-banned-cidr.txt b/crawl/crawling-process/src/main/resources/ip-banned-cidr.txt similarity index 100% rename from marginalia_nu/src/main/resources/ip-banned-cidr.txt rename to crawl/crawling-process/src/main/resources/ip-banned-cidr.txt diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/CrawlPlanLoaderTest.java b/crawl/crawling-process/src/test/java/nu/marginalia/crawling/CrawlPlanLoaderTest.java similarity index 89% 
rename from marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/CrawlPlanLoaderTest.java rename to crawl/crawling-process/src/test/java/nu/marginalia/crawling/CrawlPlanLoaderTest.java index 40b77484..000fd5ee 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/CrawlPlanLoaderTest.java +++ b/crawl/crawling-process/src/test/java/nu/marginalia/crawling/CrawlPlanLoaderTest.java @@ -1,5 +1,6 @@ -package nu.marginalia.wmsa.edge.crawling; +package nu.marginalia.crawling; +import nu.marginalia.crawling.common.plan.CrawlPlanLoader; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; @@ -8,7 +9,7 @@ import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; -import static org.junit.jupiter.api.Assertions.*; +import static org.junit.jupiter.api.Assertions.assertEquals; class CrawlPlanLoaderTest { diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/DomainCrawlerRobotsTxtTest.java b/crawl/crawling-process/src/test/java/nu/marginalia/crawling/DomainCrawlerRobotsTxtTest.java similarity index 96% rename from marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/DomainCrawlerRobotsTxtTest.java rename to crawl/crawling-process/src/test/java/nu/marginalia/crawling/DomainCrawlerRobotsTxtTest.java index ef77e373..0194fb01 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/DomainCrawlerRobotsTxtTest.java +++ b/crawl/crawling-process/src/test/java/nu/marginalia/crawling/DomainCrawlerRobotsTxtTest.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.crawling; +package nu.marginalia.crawling; import crawlercommons.robots.SimpleRobotRules; import crawlercommons.robots.SimpleRobotRulesParser; diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/HtmlTagCleanerTest.java b/crawl/crawling-process/src/test/java/nu/marginalia/crawling/HtmlTagCleanerTest.java similarity index 88% rename from 
marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/HtmlTagCleanerTest.java rename to crawl/crawling-process/src/test/java/nu/marginalia/crawling/HtmlTagCleanerTest.java index 1b80c801..07f72179 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/HtmlTagCleanerTest.java +++ b/crawl/crawling-process/src/test/java/nu/marginalia/crawling/HtmlTagCleanerTest.java @@ -1,6 +1,6 @@ -package nu.marginalia.wmsa.edge.crawling; +package nu.marginalia.crawling; -import nu.marginalia.util.language.processing.HtmlTagCleaner; +import nu.marginalia.language.encoding.HtmlTagCleaner; import org.jsoup.Jsoup; import org.junit.jupiter.api.Test; diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/HttpFetcherTest.java b/crawl/crawling-process/src/test/java/nu/marginalia/crawling/HttpFetcherTest.java similarity index 88% rename from marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/HttpFetcherTest.java rename to crawl/crawling-process/src/test/java/nu/marginalia/crawling/HttpFetcherTest.java index 653294f8..c9cc8b0b 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/HttpFetcherTest.java +++ b/crawl/crawling-process/src/test/java/nu/marginalia/crawling/HttpFetcherTest.java @@ -1,11 +1,11 @@ -package nu.marginalia.wmsa.edge.crawling; +package nu.marginalia.crawling; import lombok.SneakyThrows; -import nu.marginalia.wmsa.edge.crawling.retreival.HttpFetcher; -import nu.marginalia.wmsa.edge.crawling.retreival.HttpRedirectResolver; -import nu.marginalia.wmsa.edge.crawling.retreival.RateLimitException; -import nu.marginalia.wmsa.edge.crawling.retreival.logic.ContentTypeLogic; -import nu.marginalia.wmsa.edge.model.EdgeUrl; +import nu.marginalia.crawl.retreival.HttpFetcher; +import nu.marginalia.crawl.retreival.HttpRedirectResolver; +import nu.marginalia.crawl.retreival.RateLimitException; +import nu.marginalia.crawl.retreival.logic.ContentTypeLogic; +import nu.marginalia.model.EdgeUrl; import 
org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Test; diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/LanguageFilterTest.java b/crawl/crawling-process/src/test/java/nu/marginalia/crawling/LanguageFilterTest.java similarity index 92% rename from marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/LanguageFilterTest.java rename to crawl/crawling-process/src/test/java/nu/marginalia/crawling/LanguageFilterTest.java index 50730390..694810ba 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/LanguageFilterTest.java +++ b/crawl/crawling-process/src/test/java/nu/marginalia/crawling/LanguageFilterTest.java @@ -1,6 +1,6 @@ -package nu.marginalia.wmsa.edge.crawling; +package nu.marginalia.crawling; -import nu.marginalia.util.language.LanguageFilter; +import nu.marginalia.language.LanguageFilter; import org.jsoup.Jsoup; import org.junit.jupiter.api.Test; diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/LinkParserTest.java b/crawl/crawling-process/src/test/java/nu/marginalia/crawling/LinkParserTest.java similarity index 94% rename from marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/LinkParserTest.java rename to crawl/crawling-process/src/test/java/nu/marginalia/crawling/LinkParserTest.java index 065310f7..da3059f2 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/LinkParserTest.java +++ b/crawl/crawling-process/src/test/java/nu/marginalia/crawling/LinkParserTest.java @@ -1,7 +1,7 @@ -package nu.marginalia.wmsa.edge.crawling; +package nu.marginalia.crawling; -import nu.marginalia.wmsa.edge.converting.processor.logic.LinkParser; -import nu.marginalia.wmsa.edge.model.EdgeUrl; +import nu.marginalia.crawling.common.link.LinkParser; +import nu.marginalia.model.EdgeUrl; import org.jsoup.Jsoup; import org.junit.jupiter.api.Test; diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/RssCrawlerTest.java 
b/crawl/crawling-process/src/test/java/nu/marginalia/crawling/RssCrawlerTest.java similarity index 92% rename from marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/RssCrawlerTest.java rename to crawl/crawling-process/src/test/java/nu/marginalia/crawling/RssCrawlerTest.java index cf64ef6d..25a40ece 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/RssCrawlerTest.java +++ b/crawl/crawling-process/src/test/java/nu/marginalia/crawling/RssCrawlerTest.java @@ -1,7 +1,7 @@ -package nu.marginalia.wmsa.edge.crawling; +package nu.marginalia.crawling; -import nu.marginalia.wmsa.edge.converting.processor.logic.LinkParser; -import nu.marginalia.wmsa.edge.model.EdgeUrl; +import nu.marginalia.crawling.common.link.LinkParser; +import nu.marginalia.model.EdgeUrl; import org.jsoup.Jsoup; import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/UrlBlocklistTest.java b/crawl/crawling-process/src/test/java/nu/marginalia/crawling/UrlBlocklistTest.java similarity index 90% rename from marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/UrlBlocklistTest.java rename to crawl/crawling-process/src/test/java/nu/marginalia/crawling/UrlBlocklistTest.java index 2460987a..cd799540 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/UrlBlocklistTest.java +++ b/crawl/crawling-process/src/test/java/nu/marginalia/crawling/UrlBlocklistTest.java @@ -1,7 +1,7 @@ -package nu.marginalia.wmsa.edge.crawling; +package nu.marginalia.crawling; -import nu.marginalia.wmsa.edge.crawling.blocklist.UrlBlocklist; -import nu.marginalia.wmsa.edge.model.EdgeUrl; +import nu.marginalia.crawling.common.blocklist.UrlBlocklist; +import nu.marginalia.model.EdgeUrl; import org.junit.jupiter.api.Test; import java.net.URISyntaxException; diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/WorkLogTest.java 
b/crawl/crawling-process/src/test/java/nu/marginalia/crawling/WorkLogTest.java similarity index 88% rename from marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/WorkLogTest.java rename to crawl/crawling-process/src/test/java/nu/marginalia/crawling/WorkLogTest.java index bf5a6bdb..62d86e87 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/WorkLogTest.java +++ b/crawl/crawling-process/src/test/java/nu/marginalia/crawling/WorkLogTest.java @@ -1,5 +1,6 @@ -package nu.marginalia.wmsa.edge.crawling; +package nu.marginalia.crawling; +import nu.marginalia.crawling.common.WorkLog; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; @@ -8,7 +9,8 @@ import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; -import static org.junit.jupiter.api.Assertions.*; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; class WorkLogTest { Path outFile; diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/retreival/CrawlerRetreiverTest.java b/crawl/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java similarity index 75% rename from marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/retreival/CrawlerRetreiverTest.java rename to crawl/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java index c3f558e8..73dd832b 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/retreival/CrawlerRetreiverTest.java +++ b/crawl/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java @@ -1,8 +1,10 @@ -package nu.marginalia.wmsa.edge.crawling.retreival; +package nu.marginalia.crawling.retreival; -import nu.marginalia.wmsa.edge.crawling.model.CrawledDocument; -import nu.marginalia.wmsa.edge.crawling.model.CrawlingSpecification; -import 
nu.marginalia.wmsa.edge.crawling.model.SerializableCrawlData; +import nu.marginalia.crawl.retreival.CrawlerRetreiver; +import nu.marginalia.crawl.retreival.HttpFetcher; +import nu.marginalia.crawling.model.CrawledDocument; +import nu.marginalia.crawling.model.CrawlingSpecification; +import nu.marginalia.crawling.model.SerializableCrawlData; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Tag; import org.junit.jupiter.api.Test; diff --git a/crawl/experimental/build.gradle b/crawl/experimental/build.gradle new file mode 100644 index 00000000..68e334ab --- /dev/null +++ b/crawl/experimental/build.gradle @@ -0,0 +1,50 @@ +plugins { + id 'java' + id "io.freefair.lombok" version "5.3.3.3" + + id 'jvm-test-suite' +} + +java { + toolchain { + languageVersion.set(JavaLanguageVersion.of(17)) + } +} + +dependencies { + implementation project(':third-party') + implementation project(':protocol') + implementation project(':common:model') + implementation project(':common:config') + implementation project(':common:service') + implementation project(':libraries:misc') + implementation project(':api:index-api') + implementation project(':common:service-discovery') + implementation project(':common:service-client') + implementation project(':libraries:language-processing') + implementation project(':crawl:common') + implementation project(':crawl:crawling-model') + implementation project(':crawl:converting-process') + + implementation libs.lombok + annotationProcessor libs.lombok + implementation libs.bundles.slf4j + + implementation libs.guice + implementation libs.jsoup + implementation libs.bundles.mariadb + + testImplementation libs.bundles.slf4j.test + testImplementation libs.bundles.junit + testImplementation libs.mockito +} + +test { + useJUnitPlatform() +} + +task fastTests(type: Test) { + useJUnitPlatform { + excludeTags "slow" + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/AdblockTesterTool.java 
b/crawl/experimental/src/main/java/nu/marginalia/experimental/AdblockTesterTool.java similarity index 67% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/AdblockTesterTool.java rename to crawl/experimental/src/main/java/nu/marginalia/experimental/AdblockTesterTool.java index b97fc27b..61f91e52 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/AdblockTesterTool.java +++ b/crawl/experimental/src/main/java/nu/marginalia/experimental/AdblockTesterTool.java @@ -1,17 +1,17 @@ -package nu.marginalia.wmsa.edge.tools; +package nu.marginalia.experimental; -import nu.marginalia.wmsa.edge.converting.processor.logic.topic.AdblockSimulator; -import nu.marginalia.wmsa.edge.crawling.CrawlPlanLoader; -import nu.marginalia.wmsa.edge.crawling.model.CrawledDocument; -import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain; -import nu.marginalia.wmsa.edge.model.EdgeCrawlPlan; +import nu.marginalia.converting.processor.DocumentProcessor; +import nu.marginalia.converting.processor.logic.topic.AdblockSimulator; +import nu.marginalia.crawling.common.plan.CrawlPlanLoader; +import nu.marginalia.crawling.common.plan.EdgeCrawlPlan; +import nu.marginalia.crawling.model.CrawledDocument; +import nu.marginalia.crawling.model.CrawledDomain; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import java.io.IOException; import java.nio.file.Path; -import static nu.marginalia.wmsa.edge.converting.processor.DocumentProcessor.isAcceptedContentType; public class AdblockTesterTool { @@ -40,7 +40,7 @@ public class AdblockTesterTool { private static void processDomain(CrawledDomain domain) { if (domain.doc == null) return; for (var doc : domain.doc) { - if (isAcceptedContentType(doc) && "OK".equals(doc.crawlerStatus)) { + if (DocumentProcessor.isAcceptedContentType(doc) && "OK".equals(doc.crawlerStatus)) { processDocument(doc); } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/ConverterLogicTestTool.java 
b/crawl/experimental/src/main/java/nu/marginalia/experimental/ConverterLogicTestTool.java similarity index 77% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/ConverterLogicTestTool.java rename to crawl/experimental/src/main/java/nu/marginalia/experimental/ConverterLogicTestTool.java index 78d90ccb..f9d15b81 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/ConverterLogicTestTool.java +++ b/crawl/experimental/src/main/java/nu/marginalia/experimental/ConverterLogicTestTool.java @@ -1,19 +1,19 @@ -package nu.marginalia.wmsa.edge.tools; +package nu.marginalia.experimental; import com.google.inject.Guice; import com.google.inject.Inject; import com.google.inject.Injector; -import nu.marginalia.util.language.processing.sentence.SentenceExtractor; -import nu.marginalia.wmsa.configuration.WmsaHome; -import nu.marginalia.wmsa.edge.converting.ConverterModule; -import nu.marginalia.wmsa.edge.converting.processor.DomainProcessor; -import nu.marginalia.wmsa.edge.converting.processor.logic.DomPruningFilter; -import nu.marginalia.wmsa.edge.converting.processor.logic.topic.GoogleAnwersSpamDetector; -import nu.marginalia.wmsa.edge.converting.processor.logic.topic.RecipeDetector; -import nu.marginalia.wmsa.edge.converting.processor.logic.topic.TextileCraftDetector; -import nu.marginalia.wmsa.edge.converting.processor.logic.topic.WoodworkingDetector; -import nu.marginalia.wmsa.edge.crawling.CrawlPlanLoader; -import nu.marginalia.wmsa.edge.model.EdgeCrawlPlan; +import nu.marginalia.converting.ConverterModule; +import nu.marginalia.crawling.common.plan.CrawlPlanLoader; +import nu.marginalia.crawling.common.plan.EdgeCrawlPlan; +import nu.marginalia.converting.processor.logic.DomPruningFilter; +import nu.marginalia.language.sentence.SentenceExtractor; +import nu.marginalia.WmsaHome; +import nu.marginalia.converting.processor.DomainProcessor; +import nu.marginalia.converting.processor.logic.topic.GoogleAnwersSpamDetector; +import 
nu.marginalia.converting.processor.logic.topic.RecipeDetector; +import nu.marginalia.converting.processor.logic.topic.TextileCraftDetector; +import nu.marginalia.converting.processor.logic.topic.WoodworkingDetector; import org.jsoup.Jsoup; import org.slf4j.Logger; import org.slf4j.LoggerFactory; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/CrawlDataExtractorTool.java b/crawl/experimental/src/main/java/nu/marginalia/experimental/CrawlDataExtractorTool.java similarity index 80% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/CrawlDataExtractorTool.java rename to crawl/experimental/src/main/java/nu/marginalia/experimental/CrawlDataExtractorTool.java index cbe59e60..a4177562 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/CrawlDataExtractorTool.java +++ b/crawl/experimental/src/main/java/nu/marginalia/experimental/CrawlDataExtractorTool.java @@ -1,12 +1,13 @@ -package nu.marginalia.wmsa.edge.tools; +package nu.marginalia.experimental; import lombok.SneakyThrows; -import nu.marginalia.wmsa.configuration.module.DatabaseModule; -import nu.marginalia.wmsa.edge.converting.processor.logic.topic.AdblockSimulator; -import nu.marginalia.wmsa.edge.crawling.CrawlPlanLoader; -import nu.marginalia.wmsa.edge.crawling.model.CrawledDocument; -import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain; -import nu.marginalia.wmsa.edge.model.EdgeCrawlPlan; +import nu.marginalia.converting.processor.DocumentProcessor; +import nu.marginalia.converting.processor.logic.topic.AdblockSimulator; +import nu.marginalia.crawling.common.plan.CrawlPlanLoader; +import nu.marginalia.crawling.common.plan.EdgeCrawlPlan; +import nu.marginalia.crawling.model.CrawledDocument; +import nu.marginalia.crawling.model.CrawledDomain; +import nu.marginalia.service.module.DatabaseModule; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; @@ -17,7 +18,6 @@ import java.util.HashSet; import java.util.Set; import java.util.concurrent.*; 
-import static nu.marginalia.wmsa.edge.converting.processor.DocumentProcessor.isAcceptedContentType; public class CrawlDataExtractorTool { private static final AdblockSimulator abs; @@ -76,7 +76,7 @@ public class CrawlDataExtractorTool { if (!urls.contains(doc.url)) continue; - if (isAcceptedContentType(doc) && "OK".equals(doc.crawlerStatus)) { + if (DocumentProcessor.isAcceptedContentType(doc) && "OK".equals(doc.crawlerStatus)) { processDocument(doc); } } diff --git a/crawl/loading-process/build.gradle b/crawl/loading-process/build.gradle new file mode 100644 index 00000000..c57ec73d --- /dev/null +++ b/crawl/loading-process/build.gradle @@ -0,0 +1,68 @@ +plugins { + id 'java' + id "io.freefair.lombok" version "5.3.3.3" + id 'application' + id 'jvm-test-suite' +} +java { + toolchain { + languageVersion.set(JavaLanguageVersion.of(17)) + } +} + +application { + mainClass = 'nu.marginalia.loading.LoaderMain' + applicationName = 'loader-process' +} + +dependencies { + implementation project(':third-party') + implementation project(':protocol') + implementation project(':api:index-api') + implementation project(':common:model') + implementation project(':common:config') + implementation project(':common:service') + implementation project(':common:service-discovery') + implementation project(':common:service-client') + implementation project(':index:lexicon') + implementation project(':index:index-journal') + implementation project(':libraries:language-processing') + implementation project(':libraries:misc') + + testImplementation project(':services-core:search-service') + + implementation project(':crawl:common') + implementation project(':crawl:crawling-model') + implementation project(':crawl:converting-model') + + + implementation libs.lombok + annotationProcessor libs.lombok + implementation libs.bundles.slf4j + + implementation libs.guice + implementation libs.gson + implementation libs.commons.lang3 + implementation libs.zstd + implementation libs.trove + 
implementation libs.bundles.mariadb + + testImplementation libs.bundles.slf4j.test + testImplementation libs.bundles.junit + testImplementation libs.mockito + + testImplementation libs.bundles.selenium + testImplementation platform('org.testcontainers:testcontainers-bom:1.17.4') + testImplementation 'org.testcontainers:mariadb:1.17.4' + testImplementation 'org.testcontainers:junit-jupiter:1.17.4' +} + +test { + useJUnitPlatform() +} + +task fastTests(type: Test) { + useJUnitPlatform { + excludeTags "slow" + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ConvertedDomainReader.java b/crawl/loading-process/src/main/java/nu/marginalia/loading/ConvertedDomainReader.java similarity index 86% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ConvertedDomainReader.java rename to crawl/loading-process/src/main/java/nu/marginalia/loading/ConvertedDomainReader.java index 4efe95e3..6b9dfbbd 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ConvertedDomainReader.java +++ b/crawl/loading-process/src/main/java/nu/marginalia/loading/ConvertedDomainReader.java @@ -1,11 +1,10 @@ -package nu.marginalia.wmsa.edge.converting; +package nu.marginalia.loading; import com.github.luben.zstd.ZstdInputStream; import com.google.gson.Gson; import com.google.gson.JsonParseException; -import crawlercommons.utils.Strings; -import nu.marginalia.wmsa.edge.converting.interpreter.Instruction; -import nu.marginalia.wmsa.edge.converting.interpreter.InstructionTag; +import nu.marginalia.converting.instruction.Instruction; +import nu.marginalia.converting.instruction.InstructionTag; import org.apache.commons.lang3.StringUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -36,7 +35,7 @@ public class ConvertedDomainReader { if (line == null) { break; } - if (Strings.isBlank(line)) { + if (line.isBlank()) { continue; } var parts= line.split(" ", 2); diff --git 
a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LoaderMain.java b/crawl/loading-process/src/main/java/nu/marginalia/loading/LoaderMain.java similarity index 68% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LoaderMain.java rename to crawl/loading-process/src/main/java/nu/marginalia/loading/LoaderMain.java index 97621467..7e702776 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LoaderMain.java +++ b/crawl/loading-process/src/main/java/nu/marginalia/loading/LoaderMain.java @@ -1,37 +1,38 @@ -package nu.marginalia.wmsa.edge.converting; +package nu.marginalia.loading; import com.google.inject.Guice; import com.google.inject.Inject; import com.google.inject.Injector; +import com.zaxxer.hikari.HikariDataSource; import lombok.SneakyThrows; -import nu.marginalia.wmsa.configuration.module.DatabaseModule; -import nu.marginalia.wmsa.edge.converting.interpreter.Instruction; -import nu.marginalia.wmsa.edge.converting.loader.Loader; -import nu.marginalia.wmsa.edge.converting.loader.LoaderFactory; -import nu.marginalia.wmsa.edge.crawling.CrawlPlanLoader; -import nu.marginalia.wmsa.edge.crawling.WorkLog; -import nu.marginalia.wmsa.edge.index.client.EdgeIndexClient; -import nu.marginalia.wmsa.edge.model.EdgeCrawlPlan; +import nu.marginalia.crawling.common.TaskStats; +import nu.marginalia.crawling.common.WorkLog; +import nu.marginalia.crawling.common.plan.CrawlPlanLoader; +import nu.marginalia.crawling.common.plan.EdgeCrawlPlan; +import nu.marginalia.loading.loader.IndexLoadKeywords; +import nu.marginalia.loading.loader.Loader; +import nu.marginalia.loading.loader.LoaderFactory; +import nu.marginalia.converting.instruction.Instruction; +import nu.marginalia.service.module.DatabaseModule; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.IOException; import java.nio.file.Path; +import java.sql.SQLException; import java.util.List; import java.util.concurrent.LinkedBlockingQueue; import 
java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicInteger; public class LoaderMain { - - private static final Logger logger = LoggerFactory.getLogger(LoaderMain.class); private final EdgeCrawlPlan plan; private final ConvertedDomainReader instructionsReader; private final LoaderFactory loaderFactory; - private final EdgeIndexClient indexClient; + private final IndexLoadKeywords indexLoadKeywords; private volatile boolean running = true; final Thread processorThread = new Thread(this::processor, "Processor Thread"); @@ -41,10 +42,13 @@ public class LoaderMain { System.err.println("Arguments: crawl-plan.yaml"); System.exit(0); } + + new org.mariadb.jdbc.Driver(); + var plan = new CrawlPlanLoader().load(Path.of(args[0])); Injector injector = Guice.createInjector( - new ConverterModule(plan), + new LoaderModule(plan), new DatabaseModule() ); @@ -55,17 +59,40 @@ public class LoaderMain { @Inject public LoaderMain(EdgeCrawlPlan plan, ConvertedDomainReader instructionsReader, - LoaderFactory loaderFactory, - EdgeIndexClient indexClient) { + HikariDataSource dataSource, + LoaderFactory loaderFactory, IndexLoadKeywords indexLoadKeywords) { this.plan = plan; this.instructionsReader = instructionsReader; this.loaderFactory = loaderFactory; - this.indexClient = indexClient; + this.indexLoadKeywords = indexLoadKeywords; + nukeTables(dataSource); + + Runtime.getRuntime().addShutdownHook(new Thread(this::shutDownIndex)); processorThread.start(); } + private void nukeTables(HikariDataSource dataSource) { + try (var conn = dataSource.getConnection(); + var stmt = conn.createStatement()) { + stmt.execute("SET FOREIGN_KEY_CHECKS = 0"); + stmt.execute("TRUNCATE TABLE EC_PAGE_DATA"); + stmt.execute("TRUNCATE TABLE EC_URL"); + stmt.execute("TRUNCATE TABLE EC_DOMAIN_LINK"); + stmt.execute("SET FOREIGN_KEY_CHECKS = 1"); + } + catch (SQLException ex) { + throw new RuntimeException(ex); + } + } + + @SneakyThrows + private void shutDownIndex() { + // This must run 
otherwise the journal doesn't get a proper header + indexLoadKeywords.close(); + } + @SneakyThrows public void run() { var logFile = plan.process.getLogFile(); @@ -80,7 +107,6 @@ public class LoaderMain { running = false; processorThread.join(); - indexClient.close(); System.exit(0); } @@ -135,6 +161,7 @@ public class LoaderMain { } catch (InterruptedException e) { throw new RuntimeException(e); } + } } diff --git a/crawl/loading-process/src/main/java/nu/marginalia/loading/LoaderModule.java b/crawl/loading-process/src/main/java/nu/marginalia/loading/LoaderModule.java new file mode 100644 index 00000000..14a7389a --- /dev/null +++ b/crawl/loading-process/src/main/java/nu/marginalia/loading/LoaderModule.java @@ -0,0 +1,38 @@ +package nu.marginalia.loading; + +import com.google.gson.Gson; +import com.google.inject.AbstractModule; +import com.google.inject.name.Names; +import nu.marginalia.LanguageModels; +import nu.marginalia.WmsaHome; +import nu.marginalia.crawling.common.plan.EdgeCrawlPlan; +import nu.marginalia.model.gson.GsonFactory; +import nu.marginalia.service.SearchServiceDescriptors; +import nu.marginalia.service.descriptor.ServiceDescriptors; + +import java.nio.file.Path; + +public class LoaderModule extends AbstractModule { + + private final EdgeCrawlPlan plan; + + public LoaderModule(EdgeCrawlPlan plan) { + this.plan = plan; + } + + public void configure() { + bind(EdgeCrawlPlan.class).toInstance(plan); + + bind(ServiceDescriptors.class).toInstance(SearchServiceDescriptors.descriptors); + + bind(Gson.class).toInstance(createGson()); + + bind(Path.class).annotatedWith(Names.named("local-index-path")).toInstance(Path.of(System.getProperty("local-index-path", "/vol"))); + bind(LanguageModels.class).toInstance(WmsaHome.getLanguageModels()); + } + + private Gson createGson() { + return GsonFactory.get(); + } + +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/IndexLoadKeywords.java 
b/crawl/loading-process/src/main/java/nu/marginalia/loading/loader/IndexLoadKeywords.java similarity index 63% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/IndexLoadKeywords.java rename to crawl/loading-process/src/main/java/nu/marginalia/loading/loader/IndexLoadKeywords.java index d12aa7ea..e3f1c485 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/IndexLoadKeywords.java +++ b/crawl/loading-process/src/main/java/nu/marginalia/loading/loader/IndexLoadKeywords.java @@ -1,13 +1,12 @@ -package nu.marginalia.wmsa.edge.converting.loader; +package nu.marginalia.loading.loader; import com.google.inject.Inject; import lombok.SneakyThrows; -import nu.marginalia.wmsa.configuration.server.Context; -import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DocumentKeywords; -import nu.marginalia.wmsa.edge.index.client.EdgeIndexWriterClient; -import nu.marginalia.wmsa.edge.index.model.EdgePageDocumentsMetadata; -import nu.marginalia.wmsa.edge.model.EdgeUrl; -import nu.marginalia.wmsa.edge.model.id.EdgeId; +import nu.marginalia.model.idx.EdgePageDocumentsMetadata; +import nu.marginalia.client.Context; +import nu.marginalia.model.crawl.DocumentKeywords; +import nu.marginalia.model.EdgeUrl; +import nu.marginalia.model.id.EdgeId; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -18,17 +17,16 @@ public class IndexLoadKeywords implements Runnable { private static final Logger logger = LoggerFactory.getLogger(IndexLoadKeywords.class); private final LinkedBlockingQueue insertQueue = new LinkedBlockingQueue<>(32); - private final EdgeIndexWriterClient client; + private final LoaderIndexJournalWriter client; private record InsertTask(int urlId, int domainId, EdgePageDocumentsMetadata metadata, DocumentKeywords wordSet) {} private final Thread runThread; + private volatile boolean canceled = false; - private static final int index = Integer.getInteger("keyword-index", 1); - @Inject - public 
IndexLoadKeywords(EdgeIndexWriterClient client) { + public IndexLoadKeywords(LoaderIndexJournalWriter client) { this.client = client; runThread = new Thread(this, getClass().getSimpleName()); runThread.start(); @@ -39,14 +37,17 @@ public class IndexLoadKeywords implements Runnable { while (!canceled) { var data = insertQueue.poll(1, TimeUnit.SECONDS); if (data != null) { - client.putWords(Context.internal(), new EdgeId<>(data.domainId), new EdgeId<>(data.urlId), data.metadata(), data.wordSet, index); + client.putWords(new EdgeId<>(data.domainId), new EdgeId<>(data.urlId), data.metadata(), data.wordSet); } } } - public void close() throws InterruptedException { - canceled = true; - runThread.join(); + public void close() throws Exception { + if (!canceled) { + canceled = true; + runThread.join(); + client.close(); + } } public void load(LoaderData loaderData, EdgeUrl url, EdgePageDocumentsMetadata metadata, DocumentKeywords words) throws InterruptedException { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/Loader.java b/crawl/loading-process/src/main/java/nu/marginalia/loading/loader/Loader.java similarity index 84% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/Loader.java rename to crawl/loading-process/src/main/java/nu/marginalia/loading/loader/Loader.java index ba55ea10..e0a075f3 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/Loader.java +++ b/crawl/loading-process/src/main/java/nu/marginalia/loading/loader/Loader.java @@ -1,14 +1,15 @@ -package nu.marginalia.wmsa.edge.converting.loader; +package nu.marginalia.loading.loader; -import nu.marginalia.wmsa.edge.converting.interpreter.Interpreter; -import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DocumentKeywords; -import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DomainLink; -import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadProcessedDocument; -import 
nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadProcessedDocumentWithError; -import nu.marginalia.wmsa.edge.index.model.EdgePageDocumentsMetadata; -import nu.marginalia.wmsa.edge.model.EdgeDomain; -import nu.marginalia.wmsa.edge.model.EdgeUrl; -import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState; +import lombok.SneakyThrows; +import nu.marginalia.model.EdgeDomain; +import nu.marginalia.model.EdgeUrl; +import nu.marginalia.model.crawl.EdgeDomainIndexingState; +import nu.marginalia.model.idx.EdgePageDocumentsMetadata; +import nu.marginalia.converting.instruction.Interpreter; +import nu.marginalia.model.crawl.DocumentKeywords; +import nu.marginalia.converting.instruction.instructions.DomainLink; +import nu.marginalia.converting.instruction.instructions.LoadProcessedDocument; +import nu.marginalia.converting.instruction.instructions.LoadProcessedDocumentWithError; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -141,4 +142,13 @@ public class Loader implements Interpreter { sqlLoadProcessedDocument.load(data, processedDocumentList); sqlLoadProcessedDocument.loadWithError(data, processedDocumentWithErrorList); } + + public void close() { + try { + indexLoadKeywords.close(); + } + catch (Exception ex) { + logger.error("Error when closing the index loader", ex); + } + } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/LoaderData.java b/crawl/loading-process/src/main/java/nu/marginalia/loading/loader/LoaderData.java similarity index 86% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/LoaderData.java rename to crawl/loading-process/src/main/java/nu/marginalia/loading/loader/LoaderData.java index 5c9dc4a1..570cb579 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/LoaderData.java +++ b/crawl/loading-process/src/main/java/nu/marginalia/loading/loader/LoaderData.java @@ -1,8 +1,8 @@ -package nu.marginalia.wmsa.edge.converting.loader; 
+package nu.marginalia.loading.loader; import gnu.trove.map.hash.TObjectIntHashMap; -import nu.marginalia.wmsa.edge.model.EdgeDomain; -import nu.marginalia.wmsa.edge.model.EdgeUrl; +import nu.marginalia.model.EdgeDomain; +import nu.marginalia.model.EdgeUrl; public class LoaderData { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/LoaderFactory.java b/crawl/loading-process/src/main/java/nu/marginalia/loading/loader/LoaderFactory.java similarity index 96% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/LoaderFactory.java rename to crawl/loading-process/src/main/java/nu/marginalia/loading/loader/LoaderFactory.java index f92319aa..d4a24a9b 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/LoaderFactory.java +++ b/crawl/loading-process/src/main/java/nu/marginalia/loading/loader/LoaderFactory.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.converting.loader; +package nu.marginalia.loading.loader; import com.google.inject.Inject; diff --git a/crawl/loading-process/src/main/java/nu/marginalia/loading/loader/LoaderIndexJournalWriter.java b/crawl/loading-process/src/main/java/nu/marginalia/loading/loader/LoaderIndexJournalWriter.java new file mode 100644 index 00000000..68ef4f4e --- /dev/null +++ b/crawl/loading-process/src/main/java/nu/marginalia/loading/loader/LoaderIndexJournalWriter.java @@ -0,0 +1,87 @@ +package nu.marginalia.loading.loader; + +import com.google.inject.Inject; +import com.google.inject.Singleton; +import com.google.inject.name.Named; +import nu.marginalia.dict.DictionaryMap; +import nu.marginalia.dict.OffHeapDictionaryHashMap; +import nu.marginalia.index.journal.model.IndexJournalEntryData; +import nu.marginalia.index.journal.model.IndexJournalEntryHeader; +import nu.marginalia.index.journal.writer.IndexJournalWriterImpl; +import nu.marginalia.index.journal.writer.IndexJournalWriter; +import nu.marginalia.lexicon.KeywordLexicon; +import 
nu.marginalia.lexicon.journal.KeywordLexiconJournal; +import nu.marginalia.model.crawl.DocumentKeywords; +import nu.marginalia.util.KeywordListChunker; +import nu.marginalia.model.idx.EdgePageDocumentsMetadata; +import nu.marginalia.model.EdgeDomain; +import nu.marginalia.model.EdgeUrl; +import nu.marginalia.model.id.EdgeId; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.nio.file.Path; +import java.util.Arrays; + +@Singleton +public class LoaderIndexJournalWriter { + + private final KeywordLexicon lexicon; + private final IndexJournalWriter indexWriter; + private static final Logger logger = LoggerFactory.getLogger(LoaderIndexJournalWriter.class); + + @Inject + public LoaderIndexJournalWriter(@Named("local-index-path") Path path) throws IOException { + + var lexiconJournal = new KeywordLexiconJournal(path.resolve("dictionary.dat").toFile()); + lexicon = new KeywordLexicon(lexiconJournal); + indexWriter = new IndexJournalWriterImpl(lexicon, path.resolve("index.dat")); + } + + public void putWords(EdgeId domain, EdgeId url, + EdgePageDocumentsMetadata metadata, + DocumentKeywords wordSet) { + if (wordSet.keywords().length == 0) + return; + + if (domain.id() <= 0 || url.id() <= 0) { + logger.warn("Bad ID: {}:{}", domain, url); + return; + } + + for (var chunk : KeywordListChunker.chopList(wordSet, IndexJournalEntryData.MAX_LENGTH)) { + + var entry = new IndexJournalEntryData(getOrInsertWordIds(chunk.keywords(), chunk.metadata())); + var header = new IndexJournalEntryHeader(domain, url, metadata.encode()); + + indexWriter.put(header, entry); + } + + } + + private long[] getOrInsertWordIds(String[] words, long[] meta) { + long[] ids = new long[words.length*2]; + int putIdx = 0; + + for (int i = 0; i < words.length; i++) { + String word = words[i]; + + long id = lexicon.getOrInsert(word); + if (id != OffHeapDictionaryHashMap.NO_VALUE) { + ids[putIdx++] = id; + ids[putIdx++] = meta[i]; + } + } + + if (putIdx != 
words.length*2) { + ids = Arrays.copyOf(ids, putIdx); + } + return ids; + } + + public void close() throws Exception { + indexWriter.close(); + lexicon.close(); + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadDomainLinks.java b/crawl/loading-process/src/main/java/nu/marginalia/loading/loader/SqlLoadDomainLinks.java similarity index 95% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadDomainLinks.java rename to crawl/loading-process/src/main/java/nu/marginalia/loading/loader/SqlLoadDomainLinks.java index 1ce0035a..256b2712 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadDomainLinks.java +++ b/crawl/loading-process/src/main/java/nu/marginalia/loading/loader/SqlLoadDomainLinks.java @@ -1,8 +1,8 @@ -package nu.marginalia.wmsa.edge.converting.loader; +package nu.marginalia.loading.loader; import com.google.inject.Inject; import com.zaxxer.hikari.HikariDataSource; -import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DomainLink; +import nu.marginalia.converting.instruction.instructions.DomainLink; import org.slf4j.Logger; import org.slf4j.LoggerFactory; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadDomains.java b/crawl/loading-process/src/main/java/nu/marginalia/loading/loader/SqlLoadDomains.java similarity index 98% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadDomains.java rename to crawl/loading-process/src/main/java/nu/marginalia/loading/loader/SqlLoadDomains.java index b674b550..5c441c2f 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadDomains.java +++ b/crawl/loading-process/src/main/java/nu/marginalia/loading/loader/SqlLoadDomains.java @@ -1,8 +1,8 @@ -package nu.marginalia.wmsa.edge.converting.loader; +package nu.marginalia.loading.loader; import com.google.inject.Inject; import 
com.zaxxer.hikari.HikariDataSource; -import nu.marginalia.wmsa.edge.model.EdgeDomain; +import nu.marginalia.model.EdgeDomain; import org.slf4j.Logger; import org.slf4j.LoggerFactory; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDocument.java b/crawl/loading-process/src/main/java/nu/marginalia/loading/loader/SqlLoadProcessedDocument.java similarity index 95% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDocument.java rename to crawl/loading-process/src/main/java/nu/marginalia/loading/loader/SqlLoadProcessedDocument.java index fac60a74..3b910517 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDocument.java +++ b/crawl/loading-process/src/main/java/nu/marginalia/loading/loader/SqlLoadProcessedDocument.java @@ -1,9 +1,9 @@ -package nu.marginalia.wmsa.edge.converting.loader; +package nu.marginalia.loading.loader; import com.google.inject.Inject; import com.zaxxer.hikari.HikariDataSource; -import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadProcessedDocument; -import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadProcessedDocumentWithError; +import nu.marginalia.converting.instruction.instructions.LoadProcessedDocument; +import nu.marginalia.converting.instruction.instructions.LoadProcessedDocumentWithError; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -35,7 +35,7 @@ public class SqlLoadProcessedDocument { IN FEATURES INT, IN STANDARD VARCHAR(32), IN QUALITY DOUBLE, - IN HASH INT, + IN HASH BIGINT, IN PUB_YEAR SMALLINT) BEGIN SET FOREIGN_KEY_CHECKS=0; @@ -83,7 +83,7 @@ public class SqlLoadProcessedDocument { stmt.setInt(6, doc.htmlFeatures()); stmt.setString(7, doc.standard().name()); stmt.setDouble(8, doc.quality()); - stmt.setInt(9, (int) doc.hash()); + stmt.setLong(9, doc.hash()); if (doc.pubYear() != null) { stmt.setShort(10, (short) doc.pubYear().intValue()); } 
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDomain.java b/crawl/loading-process/src/main/java/nu/marginalia/loading/loader/SqlLoadProcessedDomain.java similarity index 93% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDomain.java rename to crawl/loading-process/src/main/java/nu/marginalia/loading/loader/SqlLoadProcessedDomain.java index 09a0cc0a..3b2304a9 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDomain.java +++ b/crawl/loading-process/src/main/java/nu/marginalia/loading/loader/SqlLoadProcessedDomain.java @@ -1,10 +1,10 @@ -package nu.marginalia.wmsa.edge.converting.loader; +package nu.marginalia.loading.loader; import com.google.inject.Inject; import com.zaxxer.hikari.HikariDataSource; -import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DomainLink; -import nu.marginalia.wmsa.edge.model.EdgeDomain; -import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState; +import nu.marginalia.model.crawl.EdgeDomainIndexingState; +import nu.marginalia.converting.instruction.instructions.DomainLink; +import nu.marginalia.model.EdgeDomain; import org.slf4j.Logger; import org.slf4j.LoggerFactory; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadUrls.java b/crawl/loading-process/src/main/java/nu/marginalia/loading/loader/SqlLoadUrls.java similarity index 96% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadUrls.java rename to crawl/loading-process/src/main/java/nu/marginalia/loading/loader/SqlLoadUrls.java index 6f835863..1cd191f6 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadUrls.java +++ b/crawl/loading-process/src/main/java/nu/marginalia/loading/loader/SqlLoadUrls.java @@ -1,11 +1,11 @@ -package nu.marginalia.wmsa.edge.converting.loader; +package 
nu.marginalia.loading.loader; import com.google.common.hash.HashFunction; import com.google.common.hash.Hashing; import com.google.inject.Inject; import com.zaxxer.hikari.HikariDataSource; -import nu.marginalia.wmsa.edge.model.EdgeDomain; -import nu.marginalia.wmsa.edge.model.EdgeUrl; +import nu.marginalia.model.EdgeDomain; +import nu.marginalia.model.EdgeUrl; import org.slf4j.Logger; import org.slf4j.LoggerFactory; diff --git a/marginalia_nu/src/test/java/nu/marginalia/util/TestUtil.java b/crawl/loading-process/src/test/java/nu/marginalia/loader/DbTestUtil.java similarity index 93% rename from marginalia_nu/src/test/java/nu/marginalia/util/TestUtil.java rename to crawl/loading-process/src/test/java/nu/marginalia/loader/DbTestUtil.java index ae759dcb..1c3a71d2 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/util/TestUtil.java +++ b/crawl/loading-process/src/test/java/nu/marginalia/loader/DbTestUtil.java @@ -1,4 +1,4 @@ -package nu.marginalia.util; +package nu.marginalia.loader; import com.zaxxer.hikari.HikariConfig; import com.zaxxer.hikari.HikariDataSource; @@ -6,11 +6,11 @@ import lombok.SneakyThrows; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -public class TestUtil { +public class DbTestUtil { private static final int TEST_PORT_BASE = 6000; private static final int TEST_PORT_RANGE = 2000; - private final static Logger logger = LoggerFactory.getLogger(TestUtil.class); + private final static Logger logger = LoggerFactory.getLogger(DbTestUtil.class); public static int getPort() { return TEST_PORT_BASE + (int)(TEST_PORT_RANGE * Math.random()); diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadDomainLinksTest.java b/crawl/loading-process/src/test/java/nu/marginalia/loader/SqlLoadDomainLinksTest.java similarity index 80% rename from marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadDomainLinksTest.java rename to 
crawl/loading-process/src/test/java/nu/marginalia/loader/SqlLoadDomainLinksTest.java index 249bb160..8c2bbe95 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadDomainLinksTest.java +++ b/crawl/loading-process/src/test/java/nu/marginalia/loader/SqlLoadDomainLinksTest.java @@ -1,9 +1,11 @@ -package nu.marginalia.wmsa.edge.converting.loader; +package nu.marginalia.loader; import com.zaxxer.hikari.HikariDataSource; -import nu.marginalia.util.TestUtil; -import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DomainLink; -import nu.marginalia.wmsa.edge.model.EdgeDomain; +import nu.marginalia.loading.loader.LoaderData; +import nu.marginalia.loading.loader.SqlLoadDomainLinks; +import nu.marginalia.loading.loader.SqlLoadDomains; +import nu.marginalia.converting.instruction.instructions.DomainLink; +import nu.marginalia.model.EdgeDomain; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Tag; @@ -11,6 +13,7 @@ import org.junit.jupiter.api.Test; import org.testcontainers.containers.MariaDBContainer; import org.testcontainers.junit.jupiter.Container; import org.testcontainers.junit.jupiter.Testcontainers; + @Tag("slow") @Testcontainers class SqlLoadDomainLinksTest { @@ -26,7 +29,7 @@ class SqlLoadDomainLinksTest { LoaderData loaderData; @BeforeEach public void setUp() { - dataSource = TestUtil.getConnection(mariaDBContainer.getJdbcUrl()); + dataSource = DbTestUtil.getConnection(mariaDBContainer.getJdbcUrl()); var loadDomains = new SqlLoadDomains(dataSource); loaderData = new LoaderData(10); diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadDomainsTest.java b/crawl/loading-process/src/test/java/nu/marginalia/loader/SqlLoadDomainsTest.java similarity index 82% rename from marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadDomainsTest.java rename to 
crawl/loading-process/src/test/java/nu/marginalia/loader/SqlLoadDomainsTest.java index c57c5706..90c534ad 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadDomainsTest.java +++ b/crawl/loading-process/src/test/java/nu/marginalia/loader/SqlLoadDomainsTest.java @@ -1,7 +1,8 @@ -package nu.marginalia.wmsa.edge.converting.loader; +package nu.marginalia.loader; -import nu.marginalia.util.TestUtil; -import nu.marginalia.wmsa.edge.model.EdgeDomain; +import nu.marginalia.loading.loader.LoaderData; +import nu.marginalia.loading.loader.SqlLoadDomains; +import nu.marginalia.model.EdgeDomain; import org.junit.jupiter.api.Tag; import org.junit.jupiter.api.Test; import org.testcontainers.containers.MariaDBContainer; @@ -24,7 +25,7 @@ class SqlLoadDomainsTest { @Test public void loadDomain() { - try (var dataSource = TestUtil.getConnection(mariaDBContainer.getJdbcUrl());) { + try (var dataSource = DbTestUtil.getConnection(mariaDBContainer.getJdbcUrl());) { var loadDomains = new SqlLoadDomains(dataSource); var loaderData = new LoaderData(10); @@ -39,7 +40,7 @@ class SqlLoadDomainsTest { @Test public void loadDomains() { - try (var dataSource = TestUtil.getConnection(mariaDBContainer.getJdbcUrl());) { + try (var dataSource = DbTestUtil.getConnection(mariaDBContainer.getJdbcUrl());) { var loadDomains = new SqlLoadDomains(dataSource); var loaderData = new LoaderData(10); diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDocumentTest.java b/crawl/loading-process/src/test/java/nu/marginalia/loader/SqlLoadProcessedDocumentTest.java similarity index 67% rename from marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDocumentTest.java rename to crawl/loading-process/src/test/java/nu/marginalia/loader/SqlLoadProcessedDocumentTest.java index 54e4eccb..51752127 100644 --- 
a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDocumentTest.java +++ b/crawl/loading-process/src/test/java/nu/marginalia/loader/SqlLoadProcessedDocumentTest.java @@ -1,19 +1,19 @@ -package nu.marginalia.wmsa.edge.converting.loader; +package nu.marginalia.loader; import com.zaxxer.hikari.HikariDataSource; -import nu.marginalia.util.TestUtil; -import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadProcessedDocument; -import nu.marginalia.wmsa.edge.converting.processor.logic.HtmlFeature; -import nu.marginalia.wmsa.edge.dbcommon.EdgeDataStoreDaoImpl; -import nu.marginalia.wmsa.edge.model.EdgeDomain; -import nu.marginalia.wmsa.edge.model.EdgeUrl; -import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard; -import nu.marginalia.wmsa.edge.model.crawl.EdgeUrlState; -import nu.marginalia.wmsa.edge.model.id.EdgeIdArray; -import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Tag; -import org.junit.jupiter.api.Test; +import nu.marginalia.converting.instruction.instructions.LoadProcessedDocument; +import nu.marginalia.search.db.DbUrlDetailsQuery; +import nu.marginalia.loading.loader.LoaderData; +import nu.marginalia.loading.loader.SqlLoadDomains; +import nu.marginalia.loading.loader.SqlLoadProcessedDocument; +import nu.marginalia.loading.loader.SqlLoadUrls; +import nu.marginalia.model.EdgeDomain; +import nu.marginalia.model.EdgeUrl; +import nu.marginalia.model.crawl.EdgeHtmlStandard; +import nu.marginalia.model.crawl.EdgeUrlState; +import nu.marginalia.model.crawl.HtmlFeature; +import nu.marginalia.model.id.EdgeIdArray; +import org.junit.jupiter.api.*; import org.testcontainers.containers.MariaDBContainer; import org.testcontainers.junit.jupiter.Container; import org.testcontainers.junit.jupiter.Testcontainers; @@ -38,12 +38,12 @@ class SqlLoadProcessedDocumentTest { HikariDataSource dataSource; LoaderData loaderData; - EdgeDataStoreDaoImpl dataStoreDao; + 
DbUrlDetailsQuery dbUrlDetailsQuery; @BeforeEach public void setUp() throws URISyntaxException { - dataSource = TestUtil.getConnection(mariaDBContainer.getJdbcUrl()); - dataStoreDao = new EdgeDataStoreDaoImpl(dataSource); + dataSource = DbTestUtil.getConnection(mariaDBContainer.getJdbcUrl()); + dbUrlDetailsQuery = new DbUrlDetailsQuery(dataSource); var loadDomains = new SqlLoadDomains(dataSource); var loadUrls = new SqlLoadUrls(dataSource); @@ -59,7 +59,6 @@ class SqlLoadProcessedDocumentTest { @AfterEach public void tearDown() { - dataStoreDao.clearCaches(); dataSource.close(); } @@ -81,8 +80,8 @@ class SqlLoadProcessedDocumentTest { null ))); - var details = dataStoreDao.getUrlDetailsMulti(new EdgeIdArray<>(loaderData.getUrlId(new EdgeUrl("https://www.marginalia.nu/")))); - assertEquals(1, details.size()); + var details = dbUrlDetailsQuery.getUrlDetailsMulti(new EdgeIdArray<>(loaderData.getUrlId(new EdgeUrl("https://www.marginalia.nu/")))); + Assertions.assertEquals(1, details.size()); var urlDetails = details.get(0); diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDomainTest.java b/crawl/loading-process/src/test/java/nu/marginalia/loader/SqlLoadProcessedDomainTest.java similarity index 80% rename from marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDomainTest.java rename to crawl/loading-process/src/test/java/nu/marginalia/loader/SqlLoadProcessedDomainTest.java index 000b0923..82f38c23 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDomainTest.java +++ b/crawl/loading-process/src/test/java/nu/marginalia/loader/SqlLoadProcessedDomainTest.java @@ -1,10 +1,12 @@ -package nu.marginalia.wmsa.edge.converting.loader; +package nu.marginalia.loader; import com.zaxxer.hikari.HikariDataSource; -import nu.marginalia.util.TestUtil; -import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DomainLink; -import 
nu.marginalia.wmsa.edge.model.EdgeDomain; -import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState; +import nu.marginalia.loading.loader.LoaderData; +import nu.marginalia.loading.loader.SqlLoadDomains; +import nu.marginalia.loading.loader.SqlLoadProcessedDomain; +import nu.marginalia.converting.instruction.instructions.DomainLink; +import nu.marginalia.model.EdgeDomain; +import nu.marginalia.model.crawl.EdgeDomainIndexingState; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Tag; @@ -29,7 +31,7 @@ class SqlLoadProcessedDomainTest { @BeforeEach public void setUp() { - dataSource = TestUtil.getConnection(mariaDBContainer.getJdbcUrl()); + dataSource = DbTestUtil.getConnection(mariaDBContainer.getJdbcUrl()); var loadDomains = new SqlLoadDomains(dataSource); loaderData = new LoaderData(10); diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadUrlsTest.java b/crawl/loading-process/src/test/java/nu/marginalia/loader/SqlLoadUrlsTest.java similarity index 81% rename from marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadUrlsTest.java rename to crawl/loading-process/src/test/java/nu/marginalia/loader/SqlLoadUrlsTest.java index 84d8d586..fe8c7847 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadUrlsTest.java +++ b/crawl/loading-process/src/test/java/nu/marginalia/loader/SqlLoadUrlsTest.java @@ -1,9 +1,11 @@ -package nu.marginalia.wmsa.edge.converting.loader; +package nu.marginalia.loader; import com.zaxxer.hikari.HikariDataSource; -import nu.marginalia.util.TestUtil; -import nu.marginalia.wmsa.edge.model.EdgeDomain; -import nu.marginalia.wmsa.edge.model.EdgeUrl; +import nu.marginalia.loading.loader.LoaderData; +import nu.marginalia.loading.loader.SqlLoadDomains; +import nu.marginalia.loading.loader.SqlLoadUrls; +import nu.marginalia.model.EdgeDomain; +import nu.marginalia.model.EdgeUrl; import 
org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Tag; @@ -29,7 +31,7 @@ class SqlLoadUrlsTest { LoaderData loaderData; @BeforeEach public void setUp() { - dataSource = TestUtil.getConnection(mariaDBContainer.getJdbcUrl()); + dataSource = DbTestUtil.getConnection(mariaDBContainer.getJdbcUrl()); var loadDomains = new SqlLoadDomains(dataSource); loaderData = new LoaderData(10); diff --git a/crawl/readme.md b/crawl/readme.md new file mode 100644 index 00000000..c5d3016e --- /dev/null +++ b/crawl/readme.md @@ -0,0 +1,74 @@ +# Crawl + +## 1. Crawl Job Extractor + +The [crawl-job-extractor-process](crawl-job-extractor-process/) creates a crawl job specification +based on the content in the database. + +## 2. Crawl Process + +The [crawling-process](crawling-process/) fetches website contents and saves them +as compressed JSON models described in [crawling-model](crawling-model/). + +## 3. Converting Process + +The [converting-process](converting-process/) reads crawl data from the crawling step and +processes them, extracting keywords and metadata and saves them as compressed JSON models +described in [converting-model](converting-model/). + +## 4. Loading Process + +The [loading-process](loading-process/) reads the processed data and creates an index journal +and lexicon, and loads domains and addresses into the MariaDB-database. + +## Overview + +Schematically the crawling and loading process looks like this: + +``` + //====================\\ + || Compressed JSON: || Specifications + || ID, Domain, Urls[] || File + || ID, Domain, Urls[] || + || ID, Domain, Urls[] || + || ... || + \\====================// + | + +-----------+ + | CRAWLING | Fetch each URL and + | STEP | output to file + +-----------+ + | + //========================\\ + || Compressed JSON: || Crawl + || Status, HTML[], ... || Files + || Status, HTML[], ... || + || Status, HTML[], ... || + || ... 
|| + \\========================// + | + +------------+ + | CONVERTING | Analyze HTML and + | STEP | extract keywords + +------------+ features, links, URLs + | + //==================\\ + || Compressed JSON: || Processed + || URLs[] || Files + || Domains[] || + || Links[] || + || Keywords[] || + || ... || + || URLs[] || + || Domains[] || + || Links[] || + || Keywords[] || + || ... || + \\==================// + | + +------------+ + | LOADING | Insert URLs in DB + | STEP | Insert keywords in Index + +------------+ + +``` \ No newline at end of file diff --git a/doc/language-models.md b/doc/language-models.md deleted file mode 100644 index c5803bc5..00000000 --- a/doc/language-models.md +++ /dev/null @@ -1,15 +0,0 @@ -# Language Models - -## For Tests - -Many tests require language models to work, -download them from [https://downloads.marginalia.nu/](https://downloads.marginalia.nu/), -and put them somewhere. Then set the environment -variable ```LANGUAGE_MODELS_HOME``` to point to this directory. - -Alternatively, patch ```nu.marginalia.util.TestLanguageModels``` to -default to where you've put them. 
- -## For Production - -TBW \ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 00000000..1e51ac22 --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,128 @@ +x-svc: &service + env_file: + - "env/service.env" + volumes: + - vol:/vol + - conf:/wmsa/conf:ro + - model:/wmsa/model + - logs:/var/log/wmsa + networks: + - wmsa + depends_on: + - mariadb + +services: + index-service: + <<: *service + image: "marginalia.nu/index-service" + container_name: "index-service" + ports: + - "127.0.0.1:5021:5021/tcp" + - "127.0.0.1:4021:5000" + - "127.0.0.1:7021:4000" + search-service: + <<: *service + image: "marginalia.nu/search-service" + container_name: "search-service" + ports: + - "127.0.0.1:5023:5023" + - "127.0.0.1:4023:5000" + - "127.0.0.1:7023:4000" + depends_on: + - index-service + assistant-service: + <<: *service + image: "marginalia.nu/assistant-service" + container_name: "assistant-service" + ports: + - "127.0.0.1:5025:5025" + - "127.0.0.1:4025:5000" + - "127.0.0.1:7025:4000" + depends_on: + - mariadb + api-service: + <<: *service + image: "marginalia.nu/api-service" + container_name: "api-service" + ports: + - "127.0.0.1:5004:5025" + - "127.0.0.1:4004:5000" + - "127.0.0.1:7004:4000" + depends_on: + - mariadb + dating-service: + <<: *service + image: "marginalia.nu/dating-service" + container_name: "dating-service" + ports: + - "127.0.0.1:5070:5070" + - "127.0.0.1:4070:5000" + - "127.0.0.1:7070:4000" + depends_on: + - mariadb + explorer-service: + <<: *service + image: "marginalia.nu/explorer-service" + container_name: "explorer-service" + ports: + - "127.0.0.1:5071:5071" + - "127.0.0.1:4071:5000" + - "127.0.0.1:7071:4000" + depends_on: + - mariadb + mariadb: + image: "mariadb/server:10.3" + container_name: "mariadb" + env_file: "env/mariadb.env" + command: ['mysqld', '--character-set-server=utf8mb4', '--collation-server=utf8mb4_unicode_ci'] + ports: + - "127.0.0.1:3306:3306/tcp" + volumes: + - db:/var/lib/mysql + 
- "./common/model/src/main/resources/sql/edge-crawler-cache.sql:/docker-entrypoint-initdb.d/init.sql" + networks: + - wmsa + nginx-gw: + image: "nginx" + container_name: "nginx-gw" + ports: + - "127.0.0.1:8080:80" + volumes: + - "./run/nginx-site.conf:/etc/nginx/conf.d/default.conf" + networks: + - wmsa + depends_on: + - search-service +networks: + wmsa: +volumes: + db: + driver: local + driver_opts: + type: none + o: bind + device: run/db + vol: + driver: local + driver_opts: + type: none + o: bind + device: run/vol + logs: + driver: local + driver_opts: + type: none + o: bind + device: run/logs + model: + driver: local + driver_opts: + type: none + o: bind + device: run/model + conf: + driver: local + driver_opts: + type: none + o: bind + device: run/conf \ No newline at end of file diff --git a/docker-service.gradle b/docker-service.gradle new file mode 100644 index 00000000..4f8605dc --- /dev/null +++ b/docker-service.gradle @@ -0,0 +1,41 @@ +import java.nio.file.Files + +ext { + dockerImage='openjdk:17-slim' + serviceJvmOpts='-Dservice-host=0.0.0.0 -ea -Dsmall-ram=true ${wmsa_jvm_param} -Dcom.sun.management.jmxremote -Dcom.sun.management.jmxremote.port=4000 -Dcom.sun.management.jmxremote.authenticate=false -Dcom.sun.management.jmxremote.ssl=false' + serviceToolOpts='-agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=*:5000' +} + +tasks.register('dockerFile') { + buildDir.mkdir() + + var df = new File(buildDir, "Dockerfile") + doLast { + df.text = """# +# I'm auto-generated, please don't make changes to me or commit me to git +# +# The template exists in docker-service.gradle +# +FROM ${dockerImage} + +ADD ${application.applicationName}.tar / +RUN mkdir /wmsa + +ENV JAVA_TOOL_OPTIONS="${serviceToolOpts}" +ENV JAVA_OPTS="${serviceJvmOpts} " + +ENTRYPOINT WMSA_HOME=/wmsa /${application.applicationName}/bin/${application.applicationName} \${arg0} \${arg1} +""" + } + it.outputs.file(df) +} + +docker { + dockerfile = 
tasks.dockerFile.outputs.files.singleFile + name = 'marginalia.nu/'+application.applicationName+':latest' + files tasks.distTar.outputs + tags 'latest' + + dependsOn tasks.distTar + dependsOn tasks.dockerFile +} diff --git a/env/mariadb.env b/env/mariadb.env new file mode 100644 index 00000000..b7fec6ea --- /dev/null +++ b/env/mariadb.env @@ -0,0 +1,4 @@ +MARIADB_RANDOM_ROOT_PASSWORD=1 +MARIADB_DATABASE=WMSA_prod +MARIADB_USER=wmsa +MARIADB_PASSWORD=wmsa \ No newline at end of file diff --git a/env/service.env b/env/service.env new file mode 100644 index 00000000..fef49dae --- /dev/null +++ b/env/service.env @@ -0,0 +1 @@ +WMSA_HOME=/home/vlofgren/Code/wmsa.local \ No newline at end of file diff --git a/features/domain-ranking/build.gradle b/features/domain-ranking/build.gradle new file mode 100644 index 00000000..ab6c8c40 --- /dev/null +++ b/features/domain-ranking/build.gradle @@ -0,0 +1,45 @@ +plugins { + id 'java' + id "io.freefair.lombok" version "5.3.3.3" + + id "de.undercouch.download" version "5.1.0" + + id 'jvm-test-suite' +} + +java { + toolchain { + languageVersion.set(JavaLanguageVersion.of(17)) + } +} + +dependencies { + implementation project(':common:model') + implementation project(':common:service') + + implementation libs.lombok + annotationProcessor libs.lombok + + implementation libs.bundles.slf4j + implementation libs.bundles.mariadb + implementation libs.guice + implementation libs.notnull + implementation libs.roaringbitmap + implementation libs.trove + implementation libs.fastutil + + testImplementation libs.bundles.slf4j.test + testImplementation libs.bundles.junit + testImplementation libs.mockito +} + + +test { + useJUnitPlatform() +} + +task fastTests(type: Test) { + useJUnitPlatform { + excludeTags "slow" + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/DomainRankings.java b/features/domain-ranking/src/main/java/nu/marginalia/ranking/DomainRankings.java similarity index 96% rename from 
marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/DomainRankings.java rename to features/domain-ranking/src/main/java/nu/marginalia/ranking/DomainRankings.java index d6ddcd62..b408f980 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/DomainRankings.java +++ b/features/domain-ranking/src/main/java/nu/marginalia/ranking/DomainRankings.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.index.postings; +package nu.marginalia.ranking; import it.unimi.dsi.fastutil.ints.Int2IntOpenHashMap; import it.unimi.dsi.fastutil.ints.Int2ShortOpenHashMap; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/RankingAlgorithm.java b/features/domain-ranking/src/main/java/nu/marginalia/ranking/RankingAlgorithm.java similarity index 96% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/RankingAlgorithm.java rename to features/domain-ranking/src/main/java/nu/marginalia/ranking/RankingAlgorithm.java index 2e8589e4..606f3e60 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/RankingAlgorithm.java +++ b/features/domain-ranking/src/main/java/nu/marginalia/ranking/RankingAlgorithm.java @@ -1,12 +1,12 @@ -package nu.marginalia.wmsa.edge.index.ranking; +package nu.marginalia.ranking; import gnu.trove.list.array.TIntArrayList; import gnu.trove.map.hash.TIntIntHashMap; import gnu.trove.map.hash.TIntObjectHashMap; import it.unimi.dsi.fastutil.ints.IntArrays; -import nu.marginalia.wmsa.edge.index.ranking.accumulator.RankingResultAccumulator; -import nu.marginalia.wmsa.edge.index.ranking.data.RankingDomainData; -import nu.marginalia.wmsa.edge.index.ranking.data.RankingDomainFetcher; +import nu.marginalia.ranking.accumulator.RankingResultAccumulator; +import nu.marginalia.ranking.data.RankingDomainFetcher; +import nu.marginalia.ranking.data.RankingDomainData; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -82,6 +82,7 @@ public abstract class RankingAlgorithm 
{ } }); } + logger.info("Origin Domains: {}", originDomainIds.size()); } @@ -248,7 +249,7 @@ public abstract class RankingAlgorithm { public RankingResultAccumulator getRanking(int numResults, Supplier> accumulatorP) { - if (numResults < 0) { + if (numResults <= 0) { numResults = domainIdToIndex.size(); } numResults = min(numResults, min(domainIdToIndex.size(), rank.length)); @@ -265,6 +266,7 @@ public abstract class RankingAlgorithm { return accumulator; } + private static int[] sortOrder(double[] values) { int[] ret = new int[values.length]; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/ReversePageRank.java b/features/domain-ranking/src/main/java/nu/marginalia/ranking/ReversePageRank.java similarity index 90% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/ReversePageRank.java rename to features/domain-ranking/src/main/java/nu/marginalia/ranking/ReversePageRank.java index 0c202958..76b138f9 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/ReversePageRank.java +++ b/features/domain-ranking/src/main/java/nu/marginalia/ranking/ReversePageRank.java @@ -1,7 +1,7 @@ -package nu.marginalia.wmsa.edge.index.ranking; +package nu.marginalia.ranking; -import nu.marginalia.wmsa.edge.index.ranking.data.RankingDomainFetcher; +import nu.marginalia.ranking.data.RankingDomainFetcher; public class ReversePageRank extends RankingAlgorithm { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/StandardPageRank.java b/features/domain-ranking/src/main/java/nu/marginalia/ranking/StandardPageRank.java similarity index 73% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/StandardPageRank.java rename to features/domain-ranking/src/main/java/nu/marginalia/ranking/StandardPageRank.java index d9302fd6..0c629c96 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/StandardPageRank.java +++ 
b/features/domain-ranking/src/main/java/nu/marginalia/ranking/StandardPageRank.java @@ -1,7 +1,7 @@ -package nu.marginalia.wmsa.edge.index.ranking; +package nu.marginalia.ranking; -import nu.marginalia.wmsa.edge.index.ranking.data.RankingDomainFetcher; +import nu.marginalia.ranking.data.RankingDomainFetcher; public class StandardPageRank extends RankingAlgorithm { @@ -22,10 +22,14 @@ public class StandardPageRank extends RankingAlgorithm { for (int j = 0; j < links.size(); j++) { int linkedDomain = links.getQuick(j); - int linkSize = 1; - var bl = linkDataSrc2Dest[linkedDomain]; - if (bl != null) { - linkSize = bl.size(); + final int linkSize; + var backLinks = linkDataSrc2Dest[linkedDomain]; + + if (backLinks == null) { + linkSize = 1; + } + else { + linkSize = backLinks.size(); } newRankValue += rank.get(linkedDomain) / linkSize; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/accumulator/RankingResultAccumulator.java b/features/domain-ranking/src/main/java/nu/marginalia/ranking/accumulator/RankingResultAccumulator.java similarity index 63% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/accumulator/RankingResultAccumulator.java rename to features/domain-ranking/src/main/java/nu/marginalia/ranking/accumulator/RankingResultAccumulator.java index fea37b00..e9055f6e 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/accumulator/RankingResultAccumulator.java +++ b/features/domain-ranking/src/main/java/nu/marginalia/ranking/accumulator/RankingResultAccumulator.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.index.ranking.accumulator; +package nu.marginalia.ranking.accumulator; public interface RankingResultAccumulator { void add(int domainId, int rank); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/accumulator/RankingResultBitSetAccumulator.java 
b/features/domain-ranking/src/main/java/nu/marginalia/ranking/accumulator/RankingResultBitSetAccumulator.java similarity index 86% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/accumulator/RankingResultBitSetAccumulator.java rename to features/domain-ranking/src/main/java/nu/marginalia/ranking/accumulator/RankingResultBitSetAccumulator.java index 26e72522..3a806d95 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/accumulator/RankingResultBitSetAccumulator.java +++ b/features/domain-ranking/src/main/java/nu/marginalia/ranking/accumulator/RankingResultBitSetAccumulator.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.index.ranking.accumulator; +package nu.marginalia.ranking.accumulator; import org.roaringbitmap.RoaringBitmap; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/accumulator/RankingResultHashMapAccumulator.java b/features/domain-ranking/src/main/java/nu/marginalia/ranking/accumulator/RankingResultHashMapAccumulator.java similarity index 89% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/accumulator/RankingResultHashMapAccumulator.java rename to features/domain-ranking/src/main/java/nu/marginalia/ranking/accumulator/RankingResultHashMapAccumulator.java index 653806ed..15365466 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/accumulator/RankingResultHashMapAccumulator.java +++ b/features/domain-ranking/src/main/java/nu/marginalia/ranking/accumulator/RankingResultHashMapAccumulator.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.index.ranking.accumulator; +package nu.marginalia.ranking.accumulator; import it.unimi.dsi.fastutil.ints.Int2IntOpenHashMap; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/accumulator/RankingResultListAccumulator.java b/features/domain-ranking/src/main/java/nu/marginalia/ranking/accumulator/RankingResultListAccumulator.java similarity index 
90% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/accumulator/RankingResultListAccumulator.java rename to features/domain-ranking/src/main/java/nu/marginalia/ranking/accumulator/RankingResultListAccumulator.java index 663483e4..ecfab27c 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/accumulator/RankingResultListAccumulator.java +++ b/features/domain-ranking/src/main/java/nu/marginalia/ranking/accumulator/RankingResultListAccumulator.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.index.ranking.accumulator; +package nu.marginalia.ranking.accumulator; import gnu.trove.list.array.TIntArrayList; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/data/RankingDomainData.java b/features/domain-ranking/src/main/java/nu/marginalia/ranking/data/RankingDomainData.java similarity index 84% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/data/RankingDomainData.java rename to features/domain-ranking/src/main/java/nu/marginalia/ranking/data/RankingDomainData.java index 4a59daf4..6d13fd09 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/data/RankingDomainData.java +++ b/features/domain-ranking/src/main/java/nu/marginalia/ranking/data/RankingDomainData.java @@ -1,8 +1,8 @@ -package nu.marginalia.wmsa.edge.index.ranking.data; +package nu.marginalia.ranking.data; import lombok.AllArgsConstructor; import lombok.Data; -import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState; +import nu.marginalia.model.crawl.EdgeDomainIndexingState; @Data @AllArgsConstructor diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/data/RankingDomainFetcher.java b/features/domain-ranking/src/main/java/nu/marginalia/ranking/data/RankingDomainFetcher.java similarity index 96% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/data/RankingDomainFetcher.java rename to 
features/domain-ranking/src/main/java/nu/marginalia/ranking/data/RankingDomainFetcher.java index ff2b7e18..a330ede7 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/data/RankingDomainFetcher.java +++ b/features/domain-ranking/src/main/java/nu/marginalia/ranking/data/RankingDomainFetcher.java @@ -1,10 +1,10 @@ -package nu.marginalia.wmsa.edge.index.ranking.data; +package nu.marginalia.ranking.data; import com.google.inject.Inject; import com.google.inject.Singleton; import com.zaxxer.hikari.HikariDataSource; -import nu.marginalia.wmsa.edge.dbcommon.EdgeDomainBlacklistImpl; -import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState; +import nu.marginalia.model.dbcommon.EdgeDomainBlacklistImpl; +import nu.marginalia.model.crawl.EdgeDomainIndexingState; import org.slf4j.Logger; import org.slf4j.LoggerFactory; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/data/RankingDomainFetcherForSimilarityData.java b/features/domain-ranking/src/main/java/nu/marginalia/ranking/data/RankingDomainFetcherForSimilarityData.java similarity index 96% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/data/RankingDomainFetcherForSimilarityData.java rename to features/domain-ranking/src/main/java/nu/marginalia/ranking/data/RankingDomainFetcherForSimilarityData.java index dddaeebb..738ecb55 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/data/RankingDomainFetcherForSimilarityData.java +++ b/features/domain-ranking/src/main/java/nu/marginalia/ranking/data/RankingDomainFetcherForSimilarityData.java @@ -1,9 +1,9 @@ -package nu.marginalia.wmsa.edge.index.ranking.data; +package nu.marginalia.ranking.data; import com.google.inject.Inject; import com.google.inject.Singleton; import com.zaxxer.hikari.HikariDataSource; -import nu.marginalia.wmsa.edge.dbcommon.EdgeDomainBlacklistImpl; +import nu.marginalia.model.dbcommon.EdgeDomainBlacklistImpl; import 
org.slf4j.LoggerFactory; import java.sql.SQLException; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/tool/CreateBrowseDomainRanksTool.java b/features/domain-ranking/src/main/java/nu/marginalia/ranking/tool/CreateBrowseDomainRanksTool.java similarity index 83% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/tool/CreateBrowseDomainRanksTool.java rename to features/domain-ranking/src/main/java/nu/marginalia/ranking/tool/CreateBrowseDomainRanksTool.java index f4cb6197..4ff472cc 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/tool/CreateBrowseDomainRanksTool.java +++ b/features/domain-ranking/src/main/java/nu/marginalia/ranking/tool/CreateBrowseDomainRanksTool.java @@ -1,12 +1,12 @@ -package nu.marginalia.wmsa.edge.index.ranking.tool; +package nu.marginalia.ranking.tool; import com.zaxxer.hikari.HikariDataSource; import lombok.SneakyThrows; -import nu.marginalia.wmsa.configuration.module.DatabaseModule; -import nu.marginalia.wmsa.edge.dbcommon.EdgeDomainBlacklistImpl; -import nu.marginalia.wmsa.edge.index.ranking.StandardPageRank; -import nu.marginalia.wmsa.edge.index.ranking.accumulator.RankingResultListAccumulator; -import nu.marginalia.wmsa.edge.index.ranking.data.RankingDomainFetcherForSimilarityData; +import nu.marginalia.model.dbcommon.EdgeDomainBlacklistImpl; +import nu.marginalia.ranking.StandardPageRank; +import nu.marginalia.ranking.accumulator.RankingResultListAccumulator; +import nu.marginalia.ranking.data.RankingDomainFetcherForSimilarityData; +import nu.marginalia.service.module.DatabaseModule; import org.mariadb.jdbc.Driver; import org.slf4j.Logger; import org.slf4j.LoggerFactory; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/tool/PerusePageRankV2.java b/features/domain-ranking/src/main/java/nu/marginalia/ranking/tool/PerusePageRankV2.java similarity index 95% rename from 
marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/tool/PerusePageRankV2.java rename to features/domain-ranking/src/main/java/nu/marginalia/ranking/tool/PerusePageRankV2.java index 4fbdd08b..0e615552 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/tool/PerusePageRankV2.java +++ b/features/domain-ranking/src/main/java/nu/marginalia/ranking/tool/PerusePageRankV2.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.index.ranking.tool; +package nu.marginalia.ranking.tool; import com.zaxxer.hikari.HikariDataSource; @@ -10,11 +10,11 @@ import it.unimi.dsi.fastutil.ints.IntArrays; import it.unimi.dsi.fastutil.ints.IntComparator; import lombok.AllArgsConstructor; import lombok.SneakyThrows; -import nu.marginalia.wmsa.edge.index.ranking.RankingAlgorithm; -import nu.marginalia.wmsa.edge.index.ranking.data.RankingDomainData; -import nu.marginalia.wmsa.edge.index.ranking.data.RankingDomainFetcher; -import nu.marginalia.wmsa.configuration.module.DatabaseModule; -import nu.marginalia.wmsa.edge.dbcommon.EdgeDomainBlacklistImpl; +import nu.marginalia.ranking.RankingAlgorithm; +import nu.marginalia.ranking.data.RankingDomainData; +import nu.marginalia.ranking.data.RankingDomainFetcher; +import nu.marginalia.model.dbcommon.EdgeDomainBlacklistImpl; +import nu.marginalia.service.module.DatabaseModule; import org.jetbrains.annotations.NotNull; import org.slf4j.Logger; import org.slf4j.LoggerFactory; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/tool/PrintDomainRanksTool.java b/features/domain-ranking/src/main/java/nu/marginalia/ranking/tool/PrintDomainRanksTool.java similarity index 78% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/tool/PrintDomainRanksTool.java rename to features/domain-ranking/src/main/java/nu/marginalia/ranking/tool/PrintDomainRanksTool.java index 60f12008..d608abad 100644 --- 
a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/tool/PrintDomainRanksTool.java +++ b/features/domain-ranking/src/main/java/nu/marginalia/ranking/tool/PrintDomainRanksTool.java @@ -1,12 +1,12 @@ -package nu.marginalia.wmsa.edge.index.ranking.tool; +package nu.marginalia.ranking.tool; import lombok.SneakyThrows; -import nu.marginalia.wmsa.configuration.module.DatabaseModule; -import nu.marginalia.wmsa.edge.dbcommon.EdgeDomainBlacklistImpl; -import nu.marginalia.wmsa.edge.index.ranking.StandardPageRank; -import nu.marginalia.wmsa.edge.index.ranking.accumulator.RankingResultListAccumulator; -import nu.marginalia.wmsa.edge.index.ranking.data.RankingDomainFetcher; -import nu.marginalia.wmsa.edge.index.ranking.data.RankingDomainFetcherForSimilarityData; +import nu.marginalia.ranking.accumulator.RankingResultListAccumulator; +import nu.marginalia.ranking.data.RankingDomainFetcher; +import nu.marginalia.model.dbcommon.EdgeDomainBlacklistImpl; +import nu.marginalia.ranking.StandardPageRank; +import nu.marginalia.ranking.data.RankingDomainFetcherForSimilarityData; +import nu.marginalia.service.module.DatabaseModule; import org.mariadb.jdbc.Driver; import org.slf4j.Logger; import org.slf4j.LoggerFactory; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/tool/UpdateDomainRanksTool.java b/features/domain-ranking/src/main/java/nu/marginalia/ranking/tool/UpdateDomainRanksTool.java similarity index 83% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/tool/UpdateDomainRanksTool.java rename to features/domain-ranking/src/main/java/nu/marginalia/ranking/tool/UpdateDomainRanksTool.java index 714e3028..804df19e 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/tool/UpdateDomainRanksTool.java +++ b/features/domain-ranking/src/main/java/nu/marginalia/ranking/tool/UpdateDomainRanksTool.java @@ -1,12 +1,13 @@ -package nu.marginalia.wmsa.edge.index.ranking.tool; +package 
nu.marginalia.ranking.tool; import com.zaxxer.hikari.HikariDataSource; import lombok.SneakyThrows; -import nu.marginalia.wmsa.edge.index.ranking.StandardPageRank; -import nu.marginalia.wmsa.edge.index.ranking.accumulator.RankingResultListAccumulator; -import nu.marginalia.wmsa.configuration.module.DatabaseModule; -import nu.marginalia.wmsa.edge.dbcommon.EdgeDomainBlacklistImpl; -import nu.marginalia.wmsa.edge.index.ranking.data.RankingDomainFetcherForSimilarityData; +import nu.marginalia.ranking.StandardPageRank; +import nu.marginalia.ranking.accumulator.RankingResultListAccumulator; +import nu.marginalia.ranking.data.RankingDomainFetcherForSimilarityData; + +import nu.marginalia.model.dbcommon.EdgeDomainBlacklistImpl; +import nu.marginalia.service.module.DatabaseModule; import org.mariadb.jdbc.Driver; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -32,8 +33,7 @@ public class UpdateDomainRanksTool { var uploader = new Thread(() -> uploadThread(conn), "Uploader"); logger.info("Ranking"); - var ds = new DatabaseModule().provideConnection(); - var domains = new RankingDomainFetcherForSimilarityData(ds, new EdgeDomainBlacklistImpl(ds)); + var domains = new RankingDomainFetcherForSimilarityData(conn, new EdgeDomainBlacklistImpl(conn)); var rpr = new StandardPageRank(domains, "memex.marginalia.nu", "bikobatanari.art", "sadgrl.online", "wiki.xxiivv.com"); rankMax = rpr.size(); diff --git a/features/query-parser/build.gradle b/features/query-parser/build.gradle new file mode 100644 index 00000000..422170ff --- /dev/null +++ b/features/query-parser/build.gradle @@ -0,0 +1,43 @@ +plugins { + id 'java' + id "io.freefair.lombok" version "5.3.3.3" + + id 'jvm-test-suite' +} + + +java { + toolchain { + languageVersion.set(JavaLanguageVersion.of(17)) + } +} +dependencies { + implementation project(':libraries:language-processing') + implementation project(':libraries:misc') + implementation project(':common:config') + implementation project(':common:model') + + 
implementation libs.lombok + annotationProcessor libs.lombok + implementation libs.bundles.slf4j + implementation libs.bundles.nlp + + implementation libs.bundles.handlebars + + testImplementation libs.bundles.slf4j.test + testImplementation libs.bundles.junit + testImplementation libs.mockito +} + +test { + maxHeapSize = "8G" + useJUnitPlatform() +} + +task fastTests(type: Test) { + maxHeapSize = "8G" + useJUnitPlatform { + excludeTags "slow" + } +} + diff --git a/features/query-parser/src/main/java/nu/marginalia/query_parser/QueryParser.java b/features/query-parser/src/main/java/nu/marginalia/query_parser/QueryParser.java new file mode 100644 index 00000000..eebf2daa --- /dev/null +++ b/features/query-parser/src/main/java/nu/marginalia/query_parser/QueryParser.java @@ -0,0 +1,105 @@ +package nu.marginalia.query_parser; + +import nu.marginalia.language.WordPatterns; +import nu.marginalia.query_parser.token.Token; +import nu.marginalia.query_parser.token.TokenType; +import nu.marginalia.util.TransformList; + +import java.util.List; + +public class QueryParser { + + private final QueryTokenizer tokenizer = new QueryTokenizer(); + + public List parse(String query) { + List basicTokens = tokenizer.tokenizeQuery(query); + + TransformList list = new TransformList<>(basicTokens); + + list.transformEach(QueryParser::handleQuoteTokens); + list.transformEach(QueryParser::trimLiterals); + list.transformEachPair(QueryParser::createNegatedTerms); + list.transformEachPair(QueryParser::createPriorityTerms); + list.transformEach(QueryParser::handleSpecialOperations); + list.scanAndTransform(TokenType.LPAREN, TokenType.RPAREN, QueryParser::handleAdvisoryTerms); + + return list.getBackingList(); + } + + private static void handleQuoteTokens(TransformList.Entity entity) { + var t = entity.value(); + if (t.type == TokenType.QUOT) { + entity.replace(new Token(TokenType.QUOT_TERM, + t.str.replaceAll("\\s+", WordPatterns.WORD_TOKEN_JOINER), + t.displayStr)); + } + } + + private static 
void trimLiterals(TransformList.Entity entity) { + var t = entity.value(); + + if (t.type == TokenType.LITERAL_TERM + && (t.str.endsWith(":") || t.str.endsWith(".")) + && t.str.length() > 1) { + entity.replace(new Token(TokenType.LITERAL_TERM, t.str.substring(0, t.str.length() - 1), t.displayStr)); + } + + } + + private static void createNegatedTerms(TransformList.Entity first, TransformList.Entity second) { + var t = first.value(); + var tn = second.value(); + + if (t.type == TokenType.MINUS && tn.type == TokenType.LITERAL_TERM) { + first.remove(); + second.replace(new Token(TokenType.EXCLUDE_TERM, tn.str, "-" + tn.str)); + } + } + + private static void createPriorityTerms(TransformList.Entity first, TransformList.Entity second) { + var t = first.value(); + var tn = second.value(); + + if (t.type == TokenType.QMARK && tn.type == TokenType.LITERAL_TERM) { + first.remove(); + second.replace(new Token(TokenType.PRIORTY_TERM, tn.str, "?" + tn.str)); + } + } + + private static void handleSpecialOperations(TransformList.Entity entity) { + var t = entity.value(); + if (t.type != TokenType.LITERAL_TERM) { + return; + } + + if (t.str.startsWith("q") && t.str.matches("q[=><]\\d+")) { + entity.replace(new Token(TokenType.QUALITY_TERM, t.str.substring(1), t.displayStr)); + } else if (t.str.startsWith("near:")) { + entity.replace(new Token(TokenType.NEAR_TERM, t.str.substring(5), t.displayStr)); + } else if (t.str.startsWith("year") && t.str.matches("year[=><]\\d{4}")) { + entity.replace(new Token(TokenType.YEAR_TERM, t.str.substring(4), t.displayStr)); + } else if (t.str.startsWith("size") && t.str.matches("size[=><]\\d+")) { + entity.replace(new Token(TokenType.SIZE_TERM, t.str.substring(4), t.displayStr)); + } else if (t.str.startsWith("rank") && t.str.matches("rank[=><]\\d+")) { + entity.replace(new Token(TokenType.RANK_TERM, t.str.substring(4), t.displayStr)); + } else if (t.str.startsWith("qs=")) { + entity.replace(new Token(TokenType.QS_TERM, t.str.substring(3), 
t.displayStr)); + } else if (t.str.contains(":")) { + entity.replace(new Token(TokenType.ADVICE_TERM, t.str, t.displayStr)); + } + } + + private static void handleAdvisoryTerms(TransformList.Entity entity) { + var t = entity.value(); + if (t.type == TokenType.LPAREN) { + entity.remove(); + } else if (t.type == TokenType.RPAREN) { + entity.remove(); + } else if (t.type == TokenType.LITERAL_TERM) { + entity.replace(new Token(TokenType.ADVICE_TERM, t.str, "(" + t.str + ")")); + } + } + + +} + diff --git a/features/query-parser/src/main/java/nu/marginalia/query_parser/QueryPermutation.java b/features/query-parser/src/main/java/nu/marginalia/query_parser/QueryPermutation.java new file mode 100644 index 00000000..1a51a5b8 --- /dev/null +++ b/features/query-parser/src/main/java/nu/marginalia/query_parser/QueryPermutation.java @@ -0,0 +1,220 @@ +package nu.marginalia.query_parser; + +import nu.marginalia.language.WordPatterns; +import nu.marginalia.query_parser.token.Token; +import nu.marginalia.query_parser.token.TokenType; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; +import java.util.List; +import java.util.stream.Collectors; + +import static java.util.stream.Stream.concat; + +public class QueryPermutation { + private final Logger logger = LoggerFactory.getLogger(getClass()); + private final QueryVariants queryVariants; + + + public QueryPermutation(QueryVariants queryVariants) { + this.queryVariants = queryVariants; + } + + public List> permuteQueries(List items) { + int start = -1; + int end = items.size(); + + for (int i = 0; i < items.size(); i++) { + var token = items.get(i); + + if (start < 0) { + if (token.type == TokenType.LITERAL_TERM && WordPatterns.wordQualitiesPredicate.test(token.str)) { + start = i; + } + } + else { + if (token.type != TokenType.LITERAL_TERM || !WordPatterns.wordPredicateEither.test(token.str)) { + end = i; + break; + } + } + } + + if 
(start >= 0 && end - start > 1) { + List> permuteParts = combineSearchTerms(items.subList(start, end)); + int s = start; + int e = end; + return permuteParts.stream().map(part -> + concat(items.subList(0, s).stream(), concat(part.stream(), items.subList(e, items.size()).stream())) + .collect(Collectors.toList())) + .peek(lst -> lst.removeIf(this::isJunkWord)) + .limit(24) + .collect(Collectors.toList()); + } + else { + return List.of(items); + } + } + + + public List> permuteQueriesNew(List items) { + int start = -1; + int end = items.size(); + + for (int i = 0; i < items.size(); i++) { + var token = items.get(i); + + if (start < 0) { + if (token.type == TokenType.LITERAL_TERM && WordPatterns.wordQualitiesPredicate.test(token.str)) { + start = i; + } + } + else { + if (token.type != TokenType.LITERAL_TERM || !WordPatterns.wordPredicateEither.test(token.str)) { + end = i; + break; + } + } + } + + if (start >= 0 && end - start >= 1) { + var result = queryVariants.getQueryVariants(items.subList(start, end)); + + logger.debug("{}", result); + + if (result.isEmpty()) { + logger.warn("Empty variants result, falling back on old code"); + return permuteQueries(items); + } + + List> queryVariants = new ArrayList<>(); + for (var query : result.faithful) { + var tokens = query.terms.stream().map(term -> new Token(TokenType.LITERAL_TERM, term)).collect(Collectors.toList()); + tokens.addAll(result.nonLiterals); + + queryVariants.add(tokens); + } + for (var query : result.alternative) { + if (queryVariants.size() >= 6) + break; + + var tokens = query.terms.stream().map(term -> new Token(TokenType.LITERAL_TERM, term)).collect(Collectors.toList()); + tokens.addAll(result.nonLiterals); + + queryVariants.add(tokens); + } + + List> returnValue = new ArrayList<>(queryVariants.size()); + for (var variant: queryVariants) { + List r = new ArrayList<>(start + variant.size() + (items.size() - end)); + r.addAll(items.subList(0, start)); + r.addAll(variant); + r.addAll(items.subList(end, 
items.size())); + returnValue.add(r); + } + + return returnValue; + + } + else { + return List.of(items); + } + } + + private boolean isJunkWord(Token token) { + if (WordPatterns.isStopWord(token.str) && + !token.str.matches("^(\\d+|([a-z]+:.*))$")) { + return true; + } + return switch (token.str) { + case "vs", "versus", "or", "and" -> true; + default -> false; + }; + } + + private List> combineSearchTerms(List subList) { + int size = subList.size(); + if (size < 1) { + return Collections.emptyList(); + } + else if (size == 1) { + if (WordPatterns.isStopWord(subList.get(0).str)) { + return Collections.emptyList(); + } + return List.of(subList); + } + + List> results = new ArrayList<>(size*(size+1)/2); + + if (subList.size() <= 4 && subList.get(0).str.length() >= 2 && !isPrefixWord(subList.get(subList.size()-1).str)) { + results.add(List.of(joinTokens(subList))); + } + outer: for (int i = size - 1; i >= 1; i--) { + + var left = combineSearchTerms(subList.subList(0, i)); + var right = combineSearchTerms(subList.subList(i, size)); + + for (var l : left) { + if (results.size() > 48) { + break outer; + } + + for (var r : right) { + if (results.size() > 48) { + break outer; + } + + List combined = new ArrayList<>(l.size() + r.size()); + combined.addAll(l); + combined.addAll(r); + if (!results.contains(combined)) { + results.add(combined); + } + } + } + } + if (!results.contains(subList)) { + results.add(subList); + } + Comparator> tc = (o1, o2) -> { + int dJoininess = o2.stream().mapToInt(s->(int)Math.pow(joininess(s.str), 2)).sum() - + o1.stream().mapToInt(s->(int)Math.pow(joininess(s.str), 2)).sum(); + if (dJoininess == 0) { + return (o2.stream().mapToInt(s->(int)Math.pow(rightiness(s.str), 2)).sum() - + o1.stream().mapToInt(s->(int)Math.pow(rightiness(s.str), 2)).sum()); + } + return (int) Math.signum(dJoininess); + }; + results.sort(tc); + return results; + } + + private boolean isPrefixWord(String str) { + return switch (str) { + case "the", "of", "when" -> true; + 
default -> false; + }; + } + + int joininess(String s) { + return (int) s.chars().filter(c -> c == '_').count(); + } + int rightiness(String s) { + int rightiness = 0; + for (int i = 0; i < s.length(); i++) { + if (s.charAt(i) == '_') { + rightiness+=i; + } + } + return rightiness; + } + + private Token joinTokens(List subList) { + return new Token(TokenType.LITERAL_TERM, + subList.stream().map(t -> t.str).collect(Collectors.joining("_")), + subList.stream().map(t -> t.str).collect(Collectors.joining(" "))); + } +} diff --git a/features/query-parser/src/main/java/nu/marginalia/query_parser/QueryTokenizer.java b/features/query-parser/src/main/java/nu/marginalia/query_parser/QueryTokenizer.java new file mode 100644 index 00000000..8ca580db --- /dev/null +++ b/features/query-parser/src/main/java/nu/marginalia/query_parser/QueryTokenizer.java @@ -0,0 +1,65 @@ +package nu.marginalia.query_parser; + +import nu.marginalia.language.encoding.AsciiFlattener; +import nu.marginalia.query_parser.token.Token; +import nu.marginalia.query_parser.token.TokenType; + +import java.util.ArrayList; +import java.util.List; +import java.util.regex.Pattern; + +public class QueryTokenizer { + private static final Pattern noisePattern = Pattern.compile("[,]"); + + public List tokenizeQuery(String rawQuery) { + List tokens = new ArrayList<>(); + + String query = AsciiFlattener.flattenUnicode(rawQuery); + query = noisePattern.matcher(query).replaceAll(" "); + + for (int i = 0; i < query.length(); i++) { + int chr = query.charAt(i); + + if ('(' == chr) { + tokens.add(new Token(TokenType.LPAREN, "(", "(")); + } + else if (')' == chr) { + tokens.add(new Token(TokenType.RPAREN, ")", ")")); + } + else if ('"' == chr) { + int end = query.indexOf('"', i+1); + if (end == -1) { + end = query.length(); + } + tokens.add(new Token(TokenType.QUOT, + query.substring(i+1, end).toLowerCase(), + query.substring(i, Math.min(query.length(), end+1)))); + i = end; + } + else if ('-' == chr) { + tokens.add(new 
Token(TokenType.MINUS, "-")); + } + else if ('?' == chr) { + tokens.add(new Token(TokenType.QMARK, "?")); + } + else if (Character.isSpaceChar(chr)) { + // + } + else { + + int end = i+1; + for (; end < query.length(); end++) { + if (query.charAt(end) == ' ' || query.charAt(end) == ')') + break; + } + tokens.add(new Token(TokenType.LITERAL_TERM, + query.substring(i, end).toLowerCase(), + query.substring(i, end))); + i = end-1; + } + } + return tokens; + } + + +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/QueryVariants.java b/features/query-parser/src/main/java/nu/marginalia/query_parser/QueryVariants.java similarity index 94% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/QueryVariants.java rename to features/query-parser/src/main/java/nu/marginalia/query_parser/QueryVariants.java index 6d42b599..cef1be5b 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/QueryVariants.java +++ b/features/query-parser/src/main/java/nu/marginalia/query_parser/QueryVariants.java @@ -1,29 +1,25 @@ -package nu.marginalia.wmsa.edge.search.query; +package nu.marginalia.query_parser; -import com.google.inject.Inject; -import com.google.inject.Singleton; import lombok.AllArgsConstructor; import lombok.EqualsAndHashCode; import lombok.Getter; import lombok.ToString; -import nu.marginalia.util.language.conf.LanguageModels; -import nu.marginalia.util.language.processing.KeywordExtractor; -import nu.marginalia.util.language.processing.sentence.SentenceExtractor; -import nu.marginalia.util.language.processing.model.DocumentSentence; -import nu.marginalia.util.language.processing.model.WordSpan; -import nu.marginalia.wmsa.edge.assistant.dict.NGramBloomFilter; -import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict; +import nu.marginalia.LanguageModels; +import nu.marginalia.language.statistics.EnglishDictionary; +import nu.marginalia.language.keywords.KeywordExtractor; +import 
nu.marginalia.language.sentence.SentenceExtractor; +import nu.marginalia.language.statistics.NGramBloomFilter; +import nu.marginalia.language.statistics.TermFrequencyDict; +import nu.marginalia.language.model.DocumentSentence; +import nu.marginalia.language.model.WordSpan; +import nu.marginalia.query_parser.token.Token; +import nu.marginalia.query_parser.token.TokenType; import opennlp.tools.stemmer.PorterStemmer; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; import java.util.*; import java.util.regex.Pattern; -@Singleton public class QueryVariants { - - private final Logger logger = LoggerFactory.getLogger(getClass()); private final KeywordExtractor keywordExtractor; private final TermFrequencyDict dict; private final PorterStemmer ps = new PorterStemmer(); @@ -32,7 +28,6 @@ public class QueryVariants { private final EnglishDictionary englishDictionary; private final ThreadLocal sentenceExtractor; - @Inject public QueryVariants(LanguageModels lm, TermFrequencyDict dict, NGramBloomFilter nGramBloomFilter, diff --git a/features/query-parser/src/main/java/nu/marginalia/query_parser/token/Token.java b/features/query-parser/src/main/java/nu/marginalia/query_parser/token/Token.java new file mode 100644 index 00000000..47290632 --- /dev/null +++ b/features/query-parser/src/main/java/nu/marginalia/query_parser/token/Token.java @@ -0,0 +1,32 @@ +package nu.marginalia.query_parser.token; + +import lombok.EqualsAndHashCode; +import lombok.ToString; +import lombok.With; + +@ToString +@EqualsAndHashCode +@With +public class Token { + public TokenType type; + public String str; + public final String displayStr; + + public Token(TokenType type, String str, String displayStr) { + this.type = type; + this.str = str; + this.displayStr = safeString(displayStr); + } + + + public Token(TokenType type, String str) { + this.type = type; + this.str = str; + this.displayStr = safeString(str); + } + + private static String safeString(String s) { + return s.replaceAll("<", "<") 
+ .replaceAll(">", ">"); + } +} diff --git a/features/query-parser/src/main/java/nu/marginalia/query_parser/token/TokenType.java b/features/query-parser/src/main/java/nu/marginalia/query_parser/token/TokenType.java new file mode 100644 index 00000000..dc25b332 --- /dev/null +++ b/features/query-parser/src/main/java/nu/marginalia/query_parser/token/TokenType.java @@ -0,0 +1,34 @@ +package nu.marginalia.query_parser.token; + +import java.util.function.Predicate; + +public enum TokenType implements Predicate { + TERM, + + + LITERAL_TERM, + QUOT_TERM, + EXCLUDE_TERM, + ADVICE_TERM, + PRIORTY_TERM, + + QUALITY_TERM, + YEAR_TERM, + SIZE_TERM, + RANK_TERM, + NEAR_TERM, + + QS_TERM, + + QUOT, + MINUS, + QMARK, + LPAREN, + RPAREN, + + IGNORE; + + public boolean test(Token t) { + return t.type == this; + } +} diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/search/query/BodyQueryParserTest.java b/features/query-parser/src/test/java/nu/marginalia/query_parser/BodyQueryParserTest.java similarity index 78% rename from marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/search/query/BodyQueryParserTest.java rename to features/query-parser/src/test/java/nu/marginalia/query_parser/BodyQueryParserTest.java index 4b040983..27f282b4 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/search/query/BodyQueryParserTest.java +++ b/features/query-parser/src/test/java/nu/marginalia/query_parser/BodyQueryParserTest.java @@ -1,9 +1,11 @@ -package nu.marginalia.wmsa.edge.search.query; +package nu.marginalia.query_parser; +import nu.marginalia.LanguageModels; +import nu.marginalia.language.statistics.EnglishDictionary; +import nu.marginalia.language.statistics.NGramBloomFilter; +import nu.marginalia.language.statistics.TermFrequencyDict; +import nu.marginalia.query_parser.token.TokenType; import nu.marginalia.util.TestLanguageModels; -import nu.marginalia.util.language.conf.LanguageModels; -import nu.marginalia.wmsa.edge.assistant.dict.NGramBloomFilter; -import 
nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; @@ -19,6 +21,7 @@ class BodyQueryParserTest { private static EnglishDictionary englishDictionary; private static NGramBloomFilter nGramBloomFilter; private static final LanguageModels lm = TestLanguageModels.getLanguageModels(); + private QueryPermutation permutation; @BeforeAll public static void init() throws IOException { @@ -29,7 +32,8 @@ class BodyQueryParserTest { @BeforeEach public void setUp() { - parser = new QueryParser(englishDictionary, new QueryVariants(lm, dict, nGramBloomFilter, englishDictionary)); + parser = new QueryParser(); + permutation = new QueryPermutation(new QueryVariants(lm, dict, nGramBloomFilter, englishDictionary)); } @Test @@ -52,7 +56,7 @@ class BodyQueryParserTest { results.forEach(System.out::println); assertEquals(TokenType.QUOT_TERM, results.get(0).type); assertEquals("hello_world", results.get(0).str); - assertEquals("\u201Chello world\u201D", results.get(0).displayStr); + assertEquals("\"hello world\"", results.get(0).displayStr); } @Test @@ -75,7 +79,7 @@ class BodyQueryParserTest { @Test void parseCombined() { - for (var list : parser.permuteQueries(parser.parse("dune 2 remake"))) { + for (var list : permutation.permuteQueries(parser.parse("dune 2 remake"))) { for (var t: list) { System.out.printf("%s ", t.str); } @@ -84,7 +88,7 @@ class BodyQueryParserTest { } @Test void parseCombinedDOS() { - for (var list : parser.permuteQueries(parser.parse("ab ba baa abba baba ab ba"))) { + for (var list : permutation.permuteQueries(parser.parse("ab ba baa abba baba ab ba"))) { for (var t: list) { System.out.printf("%s ", t.str); } @@ -94,7 +98,7 @@ class BodyQueryParserTest { @Test void parseCombinedSuperman() { - for (var list : parser.permuteQueries(parser.parse("wizardry proving grounds of the mad overlord"))) { + for (var list : 
permutation.permuteQueries(parser.parse("wizardry proving grounds of the mad overlord"))) { for (var t: list) { System.out.printf("%s ", t.str); } diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/search/query/QueryParserTest.java b/features/query-parser/src/test/java/nu/marginalia/query_parser/QueryParserTest.java similarity index 62% rename from marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/search/query/QueryParserTest.java rename to features/query-parser/src/test/java/nu/marginalia/query_parser/QueryParserTest.java index 734415e9..dbeaeb7b 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/search/query/QueryParserTest.java +++ b/features/query-parser/src/test/java/nu/marginalia/query_parser/QueryParserTest.java @@ -1,15 +1,20 @@ -package nu.marginalia.wmsa.edge.search.query; +package nu.marginalia.query_parser; +import nu.marginalia.LanguageModels; +import nu.marginalia.language.statistics.EnglishDictionary; +import nu.marginalia.language.statistics.NGramBloomFilter; +import nu.marginalia.language.statistics.TermFrequencyDict; +import nu.marginalia.query_parser.token.Token; +import nu.marginalia.query_parser.token.TokenType; import nu.marginalia.util.TestLanguageModels; -import nu.marginalia.util.language.conf.LanguageModels; -import nu.marginalia.wmsa.edge.assistant.dict.NGramBloomFilter; -import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.Test; import java.io.IOException; +import java.util.List; import java.util.stream.Collectors; +import static org.junit.jupiter.api.Assertions.assertArrayEquals; import static org.junit.jupiter.api.Assertions.assertEquals; class QueryParserTest { @@ -25,7 +30,7 @@ class QueryParserTest { nGramBloomFilter = new NGramBloomFilter(lm); englishDictionary = new EnglishDictionary(dict); - parser = new QueryParser(englishDictionary, new QueryVariants(lm, dict, nGramBloomFilter, englishDictionary)); + parser = new 
QueryParser(); } @Test @@ -62,10 +67,19 @@ class QueryParserTest { } @Test - void variantQueries() { - var r = parser.parse("car stemming"); - parser.variantQueries(r).forEach(query -> { - System.out.println(query.stream().map(t -> t.str).collect(Collectors.joining(", "))); - }); + public void testNonAsciiNames() { + verifyParseResult("André the Giant", "andre", "the", "giant"); + verifyParseResult("Stanisław Lem", "stanislaw", "lem"); + verifyParseResult("Nicolae Ceaușescu", "nicolae", "ceausescu"); + verifyParseResult("Þorrablót", "thorrablot"); + verifyParseResult("Karolis Koncevičius", "karolis", "koncevicius"); } + + private void verifyParseResult(String query, String... expectedTokens) { + assertArrayEquals(expectedTokens, getTokenStrings(parser.parse(query))); + } + private String[] getTokenStrings(List tokens) { + return tokens.stream().map(t -> t.str).toArray(String[]::new); + } + } \ No newline at end of file diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/search/query/QueryVariantsTest.java b/features/query-parser/src/test/java/nu/marginalia/query_parser/QueryVariantsTest.java similarity index 83% rename from marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/search/query/QueryVariantsTest.java rename to features/query-parser/src/test/java/nu/marginalia/query_parser/QueryVariantsTest.java index b2477ca8..0abd0cc1 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/search/query/QueryVariantsTest.java +++ b/features/query-parser/src/test/java/nu/marginalia/query_parser/QueryVariantsTest.java @@ -1,10 +1,11 @@ -package nu.marginalia.wmsa.edge.search.query; +package nu.marginalia.query_parser; +import nu.marginalia.LanguageModels; +import nu.marginalia.language.statistics.EnglishDictionary; +import nu.marginalia.language.statistics.NGramBloomFilter; +import nu.marginalia.language.statistics.TermFrequencyDict; import nu.marginalia.util.TestLanguageModels; -import nu.marginalia.util.language.conf.LanguageModels; -import 
nu.marginalia.util.language.processing.sentence.SentenceExtractor; -import nu.marginalia.wmsa.edge.assistant.dict.NGramBloomFilter; -import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict; +import nu.marginalia.language.sentence.SentenceExtractor; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.Test; @@ -24,7 +25,7 @@ class QueryVariantsTest { var dict = new TermFrequencyDict(lm); var ngrams = new NGramBloomFilter(lm); variants = new QueryVariants(lm, dict, ngrams, new EnglishDictionary(dict)); - parser = new QueryParser(new EnglishDictionary(dict), variants); + parser = new QueryParser(); } @Test @@ -64,6 +65,7 @@ class QueryVariantsTest { testCase("Knitting"); testCase("capcom"); testCase("the man of tomorrow"); + } private void testCase(String input) { diff --git a/marginalia_nu/src/test/java/nu/marginalia/util/TestLanguageModels.java b/features/query-parser/src/test/java/nu/marginalia/util/TestLanguageModels.java similarity index 92% rename from marginalia_nu/src/test/java/nu/marginalia/util/TestLanguageModels.java rename to features/query-parser/src/test/java/nu/marginalia/util/TestLanguageModels.java index cdd23c4f..81df1ed9 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/util/TestLanguageModels.java +++ b/features/query-parser/src/test/java/nu/marginalia/util/TestLanguageModels.java @@ -1,7 +1,7 @@ package nu.marginalia.util; -import nu.marginalia.util.language.conf.LanguageModels; -import nu.marginalia.wmsa.configuration.WmsaHome; +import nu.marginalia.LanguageModels; +import nu.marginalia.WmsaHome; import java.nio.file.Files; import java.nio.file.Path; diff --git a/features/random-websites/build.gradle b/features/random-websites/build.gradle new file mode 100644 index 00000000..b3fb2692 --- /dev/null +++ b/features/random-websites/build.gradle @@ -0,0 +1,55 @@ +plugins { + id 'java' + id "io.freefair.lombok" version "5.3.3.3" + + id "me.champeau.jmh" version "0.6.6" + id "de.undercouch.download" version "5.1.0" + + id 
'jvm-test-suite' +} + +java { + toolchain { + languageVersion.set(JavaLanguageVersion.of(17)) + } +} + +dependencies { + implementation project(':third-party') + implementation project(':protocol') + implementation project(':common:model') + implementation project(':common:service') + + implementation libs.lombok + annotationProcessor libs.lombok + implementation libs.bundles.slf4j + + implementation libs.guice + implementation libs.roaringbitmap + implementation libs.trove + implementation libs.fastutil + implementation libs.bundles.mariadb + + testImplementation libs.bundles.slf4j.test + testImplementation libs.bundles.junit + testImplementation libs.mockito +} + +configurations { + e2eTestImplementation.extendsFrom(testImplementation) + +} + +test { + maxParallelForks = Runtime.runtime.availableProcessors().intdiv(2) ?: 1 + maxHeapSize = "8G" + useJUnitPlatform() +} + +task fastTests(type: Test) { + maxParallelForks = Runtime.runtime.availableProcessors().intdiv(2) ?: 1 + maxHeapSize = "8G" + useJUnitPlatform { + excludeTags "slow" + } +} diff --git a/features/random-websites/src/main/java/nu/marginalia/browse/DbBrowseDomainsFromUrlId.java b/features/random-websites/src/main/java/nu/marginalia/browse/DbBrowseDomainsFromUrlId.java new file mode 100644 index 00000000..27be5967 --- /dev/null +++ b/features/random-websites/src/main/java/nu/marginalia/browse/DbBrowseDomainsFromUrlId.java @@ -0,0 +1,71 @@ +package nu.marginalia.browse; + +import com.google.inject.Inject; +import com.google.inject.Singleton; +import com.zaxxer.hikari.HikariDataSource; +import nu.marginalia.browse.model.BrowseResult; +import nu.marginalia.model.EdgeDomain; +import nu.marginalia.model.EdgeUrl; +import nu.marginalia.model.id.EdgeIdCollection; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.sql.SQLException; +import java.util.*; + +@Singleton +public class DbBrowseDomainsFromUrlId { + + private final Logger logger = LoggerFactory.getLogger(getClass()); + private 
final HikariDataSource dataSource; + + @Inject + public DbBrowseDomainsFromUrlId(HikariDataSource dataSource) { + this.dataSource = dataSource; + } + + private String idList(EdgeIdCollection ids) { + StringJoiner j = new StringJoiner(",", "(", ")"); + for (var id : ids.values()) { + j.add(Integer.toString(id)); + } + return j.toString(); + } + + public List getBrowseResultFromUrlIds(EdgeIdCollection urlIds) { + if (urlIds.isEmpty()) + return Collections.emptyList(); + + List ret = new ArrayList<>(urlIds.size()); + + try (var conn = dataSource.getConnection()) { + try (var stmt = conn.createStatement()) { + + String inStmt = idList(urlIds); + + var rsp = stmt.executeQuery(""" + SELECT DOMAIN_ID, DOMAIN_NAME + FROM EC_URL_VIEW + INNER JOIN DOMAIN_METADATA ON EC_URL_VIEW.DOMAIN_ID=DOMAIN_METADATA.ID + WHERE + KNOWN_URLS<5000 + AND QUALITY>-10 + AND EC_URL_VIEW.ID IN + """ + inStmt); // this injection is safe, inStmt is derived from concatenating a list of integers + while (rsp.next()) { + int id = rsp.getInt(1); + String domain = rsp.getString(2); + + ret.add(new BrowseResult(new EdgeDomain(domain).toRootUrl(), id, 0)); + } + } + } + catch (SQLException ex) { + logger.error("SQL error", ex); + } + + return ret; + } + + +} diff --git a/features/random-websites/src/main/java/nu/marginalia/browse/DbBrowseDomainsRandom.java b/features/random-websites/src/main/java/nu/marginalia/browse/DbBrowseDomainsRandom.java new file mode 100644 index 00000000..2f0b4cc0 --- /dev/null +++ b/features/random-websites/src/main/java/nu/marginalia/browse/DbBrowseDomainsRandom.java @@ -0,0 +1,60 @@ +package nu.marginalia.browse; + +import com.google.inject.Inject; +import com.google.inject.Singleton; +import com.zaxxer.hikari.HikariDataSource; +import nu.marginalia.browse.model.BrowseResult; +import nu.marginalia.model.EdgeDomain; +import nu.marginalia.model.dbcommon.EdgeDomainBlacklist; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.sql.SQLException; +import 
java.util.*; + +@Singleton +public class DbBrowseDomainsRandom { + + private final Logger logger = LoggerFactory.getLogger(getClass()); + private final HikariDataSource dataSource; + + @Inject + public DbBrowseDomainsRandom(HikariDataSource dataSource) { + this.dataSource = dataSource; + } + + public List getRandomDomains(int count, EdgeDomainBlacklist blacklist, int set) { + + final String q = """ + SELECT DOMAIN_ID, DOMAIN_NAME + FROM EC_RANDOM_DOMAINS + INNER JOIN EC_DOMAIN ON EC_DOMAIN.ID=DOMAIN_ID + WHERE STATE<2 + AND DOMAIN_SET=? + AND DOMAIN_ALIAS IS NULL + ORDER BY RAND() + LIMIT ? + """; + List domains = new ArrayList<>(count); + try (var conn = dataSource.getConnection()) { + try (var stmt = conn.prepareStatement(q)) { + stmt.setInt(1, set);; + stmt.setInt(2, count); + var rsp = stmt.executeQuery(); + while (rsp.next()) { + int id = rsp.getInt(1); + String domain = rsp.getString(2); + + if (!blacklist.isBlacklisted(id)) { + domains.add(new BrowseResult(new EdgeDomain(domain).toRootUrl(), id, 0)); + } + } + } + } + catch (SQLException ex) { + logger.error("SQL error", ex); + } + return domains; + } + +} diff --git a/features/random-websites/src/main/java/nu/marginalia/browse/DbBrowseDomainsSimilarCosine.java b/features/random-websites/src/main/java/nu/marginalia/browse/DbBrowseDomainsSimilarCosine.java new file mode 100644 index 00000000..6f3d9bd8 --- /dev/null +++ b/features/random-websites/src/main/java/nu/marginalia/browse/DbBrowseDomainsSimilarCosine.java @@ -0,0 +1,66 @@ +package nu.marginalia.browse; + +import com.google.inject.Inject; +import com.google.inject.Singleton; +import com.zaxxer.hikari.HikariDataSource; +import nu.marginalia.browse.model.BrowseResult; +import nu.marginalia.model.EdgeDomain; +import nu.marginalia.model.dbcommon.EdgeDomainBlacklist; +import nu.marginalia.model.id.EdgeId; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.sql.SQLException; +import java.util.*; + +@Singleton +public class 
DbBrowseDomainsSimilarCosine { + + private final Logger logger = LoggerFactory.getLogger(getClass()); + private final HikariDataSource dataSource; + + @Inject + public DbBrowseDomainsSimilarCosine(HikariDataSource dataSource) { + this.dataSource = dataSource; + } + + public List getDomainNeighborsAdjacentCosine(EdgeId domainId, EdgeDomainBlacklist blacklist, int count) { + List domains = new ArrayList<>(count); + + String q = """ + SELECT + EC_DOMAIN.ID, + NV.NEIGHBOR_NAME, + NV.RELATEDNESS + FROM EC_NEIGHBORS_VIEW NV + INNER JOIN DATA_DOMAIN_SCREENSHOT ON DATA_DOMAIN_SCREENSHOT.DOMAIN_NAME=NV.NEIGHBOR_NAME + INNER JOIN EC_DOMAIN ON EC_DOMAIN.ID=NV.NEIGHBOR_ID + WHERE NV.DOMAIN_ID=? + GROUP BY NV.NEIGHBOR_ID + ORDER BY NV.RELATEDNESS DESC + """; + + try (var connection = dataSource.getConnection()) { + try (var stmt = connection.prepareStatement(q)) { + stmt.setFetchSize(count); + stmt.setInt(1, domainId.id()); + stmt.setInt(2, count); + var rsp = stmt.executeQuery(); + while (rsp.next() && domains.size() < count) { + int id = rsp.getInt(1); + String domain = rsp.getString(2); + double relatedness = rsp.getDouble(3); + + if (!blacklist.isBlacklisted(id)) { + domains.add(new BrowseResult(new EdgeDomain(domain).toRootUrl(), id, relatedness)); + } + } + } + } catch (SQLException throwables) { + throwables.printStackTrace(); + } + + return domains; + } + +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/dbcommon/EdgeDataStoreDaoImpl.java b/features/random-websites/src/main/java/nu/marginalia/browse/DbBrowseDomainsSimilarOldAlgo.java similarity index 53% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/dbcommon/EdgeDataStoreDaoImpl.java rename to features/random-websites/src/main/java/nu/marginalia/browse/DbBrowseDomainsSimilarOldAlgo.java index 1fdf93ef..01f43060 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/dbcommon/EdgeDataStoreDaoImpl.java +++ 
b/features/random-websites/src/main/java/nu/marginalia/browse/DbBrowseDomainsSimilarOldAlgo.java @@ -1,179 +1,31 @@ -package nu.marginalia.wmsa.edge.dbcommon; +package nu.marginalia.browse; -import com.google.common.base.Strings; -import com.google.common.cache.Cache; -import com.google.common.cache.CacheBuilder; -import com.google.common.util.concurrent.UncheckedExecutionException; import com.google.inject.Inject; +import com.google.inject.Singleton; import com.zaxxer.hikari.HikariDataSource; -import lombok.SneakyThrows; -import nu.marginalia.wmsa.edge.model.EdgeDomain; -import nu.marginalia.wmsa.edge.model.EdgeUrl; -import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState; -import nu.marginalia.wmsa.edge.model.id.EdgeId; -import nu.marginalia.wmsa.edge.model.id.EdgeIdCollection; -import nu.marginalia.wmsa.edge.model.search.EdgePageScoreAdjustment; -import nu.marginalia.wmsa.edge.model.search.EdgeUrlDetails; -import nu.marginalia.wmsa.edge.search.model.BrowseResult; +import nu.marginalia.browse.model.BrowseResult; +import nu.marginalia.model.EdgeDomain; +import nu.marginalia.model.EdgeUrl; +import nu.marginalia.model.dbcommon.EdgeDomainBlacklist; +import nu.marginalia.model.id.EdgeId; +import nu.marginalia.model.id.EdgeIdCollection; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.sql.SQLException; import java.util.*; +@Singleton +public class DbBrowseDomainsSimilarOldAlgo { -public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao { - private final HikariDataSource dataSource; private final Logger logger = LoggerFactory.getLogger(getClass()); + private final HikariDataSource dataSource; - private final Cache> urlIdCache = CacheBuilder.newBuilder().maximumSize(100_000).build(); - private final Cache> domainIdCache = CacheBuilder.newBuilder().maximumSize(10_000).build(); - - public static double QUALITY_LOWER_BOUND_CUTOFF = -15.; @Inject - public EdgeDataStoreDaoImpl(HikariDataSource dataSource) - { + public 
DbBrowseDomainsSimilarOldAlgo(HikariDataSource dataSource) { this.dataSource = dataSource; } - - public synchronized void clearCaches() - { - urlIdCache.invalidateAll(); - domainIdCache.invalidateAll(); - } - - @SneakyThrows - @Override - public EdgeId getDomainId(EdgeDomain domain) { - try (var connection = dataSource.getConnection()) { - - return domainIdCache.get(domain, () -> { - try (var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME=?")) { - stmt.setString(1, domain.toString()); - var rsp = stmt.executeQuery(); - if (rsp.next()) { - return new EdgeId<>(rsp.getInt(1)); - } - } - throw new NoSuchElementException(); - }); - } - catch (UncheckedExecutionException ex) { - throw ex.getCause(); - } - } - - private String idList(EdgeIdCollection ids) { - StringJoiner j = new StringJoiner(",", "(", ")"); - for (var id : ids.values()) { - j.add(Integer.toString(id)); - } - return j.toString(); - } - - @SneakyThrows - @Override - public List getUrlDetailsMulti(EdgeIdCollection ids) { - if (ids.isEmpty()) { - return Collections.emptyList(); - } - List result = new ArrayList<>(ids.size()); - - try (var connection = dataSource.getConnection()) { - - String idString = idList(ids); - - try (var stmt = connection.prepareStatement( - """ - SELECT ID, URL, - TITLE, DESCRIPTION, - QUALITY, - WORDS_TOTAL, FORMAT, FEATURES, - IP, DOMAIN_STATE, - DATA_HASH - FROM EC_URL_VIEW - WHERE TITLE IS NOT NULL - AND ID IN - """ + idString)) { - stmt.setFetchSize(ids.size()); - - var rsp = stmt.executeQuery(); - while (rsp.next()) { - EdgeUrl url = new EdgeUrl(rsp.getString(2)); - var val = new EdgeUrlDetails(rsp.getInt(1), url, - rsp.getString(3), // title - rsp.getString(4), // description - rsp.getDouble(5), // quality - rsp.getInt(6), // wordsTotal - rsp.getString(7), // format - rsp.getInt(8), // features - rsp.getString(9), // ip - EdgeDomainIndexingState.valueOf(rsp.getString(10)), // domainState - rsp.getInt(11), // dataHash - 
EdgePageScoreAdjustment.zero(), // urlQualityAdjustment - Integer.MAX_VALUE, // rankingId - Double.MAX_VALUE, // termScore - 1, // resultsFromSameDomain - "", // positions - null // result item - ); - if (val.urlQuality <= QUALITY_LOWER_BOUND_CUTOFF - && Strings.isNullOrEmpty(val.description) - && val.url.path.length() > 1) { - continue; - } - result.add(val); - - } - } - } - - return result; - } - - - - public List getDomainNeighborsAdjacentCosine(EdgeId domainId, EdgeDomainBlacklist blacklist, int count) { - List domains = new ArrayList<>(count); - - String q = """ - SELECT - EC_DOMAIN.ID, - NV.NEIGHBOR_NAME, - NV.RELATEDNESS - FROM EC_NEIGHBORS_VIEW NV - INNER JOIN DATA_DOMAIN_SCREENSHOT ON DATA_DOMAIN_SCREENSHOT.DOMAIN_NAME=NV.NEIGHBOR_NAME - INNER JOIN EC_DOMAIN ON EC_DOMAIN.ID=NV.NEIGHBOR_ID - WHERE NV.DOMAIN_ID=? - GROUP BY NV.NEIGHBOR_ID - ORDER BY NV.RELATEDNESS DESC - """; - - try (var connection = dataSource.getConnection()) { - try (var stmt = connection.prepareStatement(q)) { - stmt.setFetchSize(count); - stmt.setInt(1, domainId.id()); - stmt.setInt(2, count); - var rsp = stmt.executeQuery(); - while (rsp.next() && domains.size() < count) { - int id = rsp.getInt(1); - String domain = rsp.getString(2); - double relatedness = rsp.getDouble(3); - - if (!blacklist.isBlacklisted(id)) { - domains.add(new BrowseResult(new EdgeDomain(domain).toRootUrl(), id, relatedness)); - } - } - } - } catch (SQLException throwables) { - throwables.printStackTrace(); - } - - return domains; - } - - @Override public List getDomainNeighborsAdjacent(EdgeId domainId, EdgeDomainBlacklist blacklist, int count) { final Set domains = new HashSet<>(count*3); @@ -279,7 +131,6 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao { return new ArrayList<>(domains); } - @Override public List getRandomDomains(int count, EdgeDomainBlacklist blacklist, int set) { final String q = """ @@ -305,7 +156,7 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao { if 
(!blacklist.isBlacklisted(id)) { domains.add(new BrowseResult(new EdgeDomain(domain).toRootUrl(), id, 0)); } - } + } } } catch (SQLException ex) { @@ -314,7 +165,15 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao { return domains; } - @Override + + private String idList(EdgeIdCollection ids) { + StringJoiner j = new StringJoiner(",", "(", ")"); + for (var id : ids.values()) { + j.add(Integer.toString(id)); + } + return j.toString(); + } + public List getBrowseResultFromUrlIds(EdgeIdCollection urlIds) { if (urlIds.isEmpty()) return Collections.emptyList(); @@ -350,19 +209,5 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao { return ret; } - @Override - @SneakyThrows - public Optional getDomain(EdgeId id) { - try (var connection = dataSource.getConnection()) { - try (var stmt = connection.prepareStatement("SELECT DOMAIN_NAME FROM EC_DOMAIN WHERE ID=?")) { - stmt.setInt(1, id.id()); - var rsp = stmt.executeQuery(); - if (rsp.next()) { - return Optional.of(new EdgeDomain(rsp.getString(1))); - } - return Optional.empty(); - } - } - } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/AndCardIntSet.java b/features/random-websites/src/main/java/nu/marginalia/browse/experimental/AndCardIntSet.java similarity index 97% rename from marginalia_nu/src/main/java/nu/marginalia/util/AndCardIntSet.java rename to features/random-websites/src/main/java/nu/marginalia/browse/experimental/AndCardIntSet.java index 08caa671..645618aa 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/AndCardIntSet.java +++ b/features/random-websites/src/main/java/nu/marginalia/browse/experimental/AndCardIntSet.java @@ -1,4 +1,4 @@ -package nu.marginalia.util; +package nu.marginalia.browse.experimental; import com.google.common.hash.HashFunction; import com.google.common.hash.Hashing; @@ -82,10 +82,6 @@ public class AndCardIntSet { if (!testHash(a,b)) { return 0; } -// -// if (a.getCardinality() + b.getCardinality() < 10) { -// return andLinearSmall(a, 
b); -// } return andLinear(a,b); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/tool/EdgeDomainLinkConsineSimilarityMain.java b/features/random-websites/src/main/java/nu/marginalia/browse/experimental/EdgeDomainLinkConsineSimilarityMain.java similarity index 96% rename from marginalia_nu/src/main/java/nu/marginalia/util/tool/EdgeDomainLinkConsineSimilarityMain.java rename to features/random-websites/src/main/java/nu/marginalia/browse/experimental/EdgeDomainLinkConsineSimilarityMain.java index 56c9c65b..c4d4e0b2 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/tool/EdgeDomainLinkConsineSimilarityMain.java +++ b/features/random-websites/src/main/java/nu/marginalia/browse/experimental/EdgeDomainLinkConsineSimilarityMain.java @@ -1,15 +1,14 @@ -package nu.marginalia.util.tool; +package nu.marginalia.browse.experimental; import com.zaxxer.hikari.HikariDataSource; import gnu.trove.map.hash.TIntIntHashMap; import gnu.trove.map.hash.TIntObjectHashMap; import gnu.trove.set.hash.TIntHashSet; import lombok.SneakyThrows; -import nu.marginalia.util.AndCardIntSet; -import nu.marginalia.wmsa.configuration.module.DatabaseModule; -import nu.marginalia.wmsa.edge.dbcommon.EdgeDataStoreDaoImpl; -import nu.marginalia.wmsa.edge.model.EdgeDomain; -import nu.marginalia.wmsa.edge.model.id.EdgeId; +import nu.marginalia.model.dbcommon.DbDomainQueries; +import nu.marginalia.model.EdgeDomain; +import nu.marginalia.model.id.EdgeId; +import nu.marginalia.service.module.DatabaseModule; import org.roaringbitmap.RoaringBitmap; import java.sql.ResultSet; @@ -19,7 +18,7 @@ import java.util.concurrent.LinkedBlockingDeque; import java.util.concurrent.TimeUnit; import java.util.function.Consumer; -import static nu.marginalia.util.AndCardIntSet.*; +import static nu.marginalia.browse.experimental.AndCardIntSet.*; public class EdgeDomainLinkConsineSimilarityMain { ArrayList idsList = new ArrayList<>(100_000); @@ -98,7 +97,7 @@ public class EdgeDomainLinkConsineSimilarityMain { 
@SneakyThrows public void tryDomains(String... domainName) { - var dataStoreDao = new EdgeDataStoreDaoImpl(dataSource); + var dataStoreDao = new DbDomainQueries(dataSource); System.out.println(Arrays.toString(domainName)); diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/tool/EdgeWordWordConsineSimilarityMain.java b/features/random-websites/src/main/java/nu/marginalia/browse/experimental/EdgeWordWordConsineSimilarityMain.java similarity index 97% rename from marginalia_nu/src/main/java/nu/marginalia/util/tool/EdgeWordWordConsineSimilarityMain.java rename to features/random-websites/src/main/java/nu/marginalia/browse/experimental/EdgeWordWordConsineSimilarityMain.java index 8e71b26d..d05be9b5 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/tool/EdgeWordWordConsineSimilarityMain.java +++ b/features/random-websites/src/main/java/nu/marginalia/browse/experimental/EdgeWordWordConsineSimilarityMain.java @@ -1,10 +1,9 @@ -package nu.marginalia.util.tool; +package nu.marginalia.browse.experimental; import it.unimi.dsi.fastutil.ints.IntOpenHashSet; import it.unimi.dsi.fastutil.ints.IntSet; import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap; import lombok.SneakyThrows; -import nu.marginalia.util.AndCardIntSet; import org.roaringbitmap.RoaringBitmap; import java.io.IOException; @@ -16,8 +15,8 @@ import java.util.function.Consumer; import java.util.stream.Collectors; import java.util.stream.IntStream; -import static nu.marginalia.util.AndCardIntSet.andCardinality; -import static nu.marginalia.util.AndCardIntSet.weightedProduct; +import static nu.marginalia.browse.experimental.AndCardIntSet.andCardinality; +import static nu.marginalia.browse.experimental.AndCardIntSet.weightedProduct; public class EdgeWordWordConsineSimilarityMain { final Object2IntOpenHashMap stringIds; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/model/BrowseResult.java 
b/features/random-websites/src/main/java/nu/marginalia/browse/model/BrowseResult.java similarity index 75% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/model/BrowseResult.java rename to features/random-websites/src/main/java/nu/marginalia/browse/model/BrowseResult.java index 3a65ac47..41ab01d2 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/model/BrowseResult.java +++ b/features/random-websites/src/main/java/nu/marginalia/browse/model/BrowseResult.java @@ -1,6 +1,6 @@ -package nu.marginalia.wmsa.edge.search.model; +package nu.marginalia.browse.model; -import nu.marginalia.wmsa.edge.model.EdgeUrl; +import nu.marginalia.model.EdgeUrl; public record BrowseResult (EdgeUrl url, int domainId, double relatedness) { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/model/BrowseResultSet.java b/features/random-websites/src/main/java/nu/marginalia/browse/model/BrowseResultSet.java similarity index 80% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/model/BrowseResultSet.java rename to features/random-websites/src/main/java/nu/marginalia/browse/model/BrowseResultSet.java index da4af52c..75758593 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/model/BrowseResultSet.java +++ b/features/random-websites/src/main/java/nu/marginalia/browse/model/BrowseResultSet.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.search.model; +package nu.marginalia.browse.model; import lombok.AllArgsConstructor; import lombok.Getter; diff --git a/marginalia_nu/src/test/java/nu/marginalia/util/AndCardIntSetTest.java b/features/random-websites/src/test/java/nu/marginalia/experimental/AndCardIntSetTest.java similarity index 87% rename from marginalia_nu/src/test/java/nu/marginalia/util/AndCardIntSetTest.java rename to features/random-websites/src/test/java/nu/marginalia/experimental/AndCardIntSetTest.java index 8f0e7d11..65f83952 100644 --- 
a/marginalia_nu/src/test/java/nu/marginalia/util/AndCardIntSetTest.java +++ b/features/random-websites/src/test/java/nu/marginalia/experimental/AndCardIntSetTest.java @@ -1,5 +1,7 @@ -package nu.marginalia.util; +package nu.marginalia.experimental; +import nu.marginalia.browse.experimental.AndCardIntSet; +import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Test; import static org.junit.jupiter.api.Assertions.assertEquals; diff --git a/features/renderer/build.gradle b/features/renderer/build.gradle new file mode 100644 index 00000000..1f1790da --- /dev/null +++ b/features/renderer/build.gradle @@ -0,0 +1,35 @@ +plugins { + id 'java' + id "io.freefair.lombok" version "5.3.3.3" + + id 'jvm-test-suite' +} + + +java { + toolchain { + languageVersion.set(JavaLanguageVersion.of(17)) + } +} +dependencies { + implementation libs.lombok + annotationProcessor libs.lombok + implementation libs.bundles.slf4j + + implementation libs.bundles.handlebars + + testImplementation libs.bundles.slf4j.test + testImplementation libs.bundles.junit + testImplementation libs.mockito +} + +test { + useJUnitPlatform() +} + +task fastTests(type: Test) { + useJUnitPlatform { + excludeTags "slow" + } +} + diff --git a/features/renderer/src/main/java/nu/marginalia/renderer/MustacheRenderer.java b/features/renderer/src/main/java/nu/marginalia/renderer/MustacheRenderer.java new file mode 100644 index 00000000..4060a15b --- /dev/null +++ b/features/renderer/src/main/java/nu/marginalia/renderer/MustacheRenderer.java @@ -0,0 +1,60 @@ +package nu.marginalia.renderer; + +import com.github.jknack.handlebars.*; +import com.github.jknack.handlebars.helper.ConditionalHelpers; +import com.github.jknack.handlebars.io.ClassPathTemplateLoader; +import com.github.jknack.handlebars.io.TemplateLoader; +import lombok.SneakyThrows; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.FileNotFoundException; +import java.io.IOException; +import java.util.List; +import 
java.util.Map; + +public class MustacheRenderer { + private final Template template; + private final Logger logger = LoggerFactory.getLogger(getClass()); + + MustacheRenderer(String templateFile) throws IOException { + + TemplateLoader loader = new ClassPathTemplateLoader(); + loader.setPrefix("/templates"); + loader.setSuffix(".hdb"); + + var handlebars = new Handlebars(loader); + handlebars.registerHelpers(ConditionalHelpers.class); + handlebars.registerHelper("md", new MarkdownHelper()); + + try { + template = handlebars.compile(templateFile); + logger.info("Loaded template " + templateFile); + } + catch (FileNotFoundException ex) { + throw new RenderingException("Could not find template " + templateFile); + } + catch (HandlebarsException ex) { + throw new RenderingException("Failed to load template " + templateFile, ex); + } + } + + @SneakyThrows + public String render(T model) { + return template.apply(model); + } + + @SneakyThrows + public String render(T model, String name, List children) { + Context ctx = Context.newBuilder(model).combine(name, children).build(); + + return template.apply(ctx); + } + + @SneakyThrows + public String render(T model, Map children) { + Context ctx = Context.newBuilder(model).combine(children).build(); + return template.apply(ctx); + } + +} diff --git a/features/renderer/src/main/java/nu/marginalia/renderer/RendererFactory.java b/features/renderer/src/main/java/nu/marginalia/renderer/RendererFactory.java new file mode 100644 index 00000000..8d191881 --- /dev/null +++ b/features/renderer/src/main/java/nu/marginalia/renderer/RendererFactory.java @@ -0,0 +1,13 @@ +package nu.marginalia.renderer; + +import java.io.IOException; + +public class RendererFactory { + + public RendererFactory() { + } + + public MustacheRenderer renderer(String template) throws IOException { + return new MustacheRenderer<>(template); + } +} diff --git a/features/renderer/src/main/java/nu/marginalia/renderer/RenderingException.java 
/**
 * Signals that a template could not be located or loaded.
 *
 * <p>Extends {@link IOException}, so callers that already handle I/O
 * failures handle this one as well.
 */
public class RenderingException extends IOException {

    /**
     * Creates an exception that wraps the underlying failure.
     *
     * @param message description of what went wrong
     * @param cause   the failure that triggered this one
     */
    public RenderingException(String message, Throwable cause) {
        super(message, cause);
    }

    /**
     * Creates an exception with a message and no cause.
     *
     * @param message description of what went wrong
     */
    public RenderingException(String message) {
        super(message);
    }
}
features/screenshots/src/main/java/nu/marginalia/screenshot/ScreenshotService.java index 46324c1c..50769aa6 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/screenshot/ScreenshotService.java +++ b/features/screenshots/src/main/java/nu/marginalia/screenshot/ScreenshotService.java @@ -1,12 +1,12 @@ -package nu.marginalia.wmsa.edge.assistant.screenshot; +package nu.marginalia.screenshot; import com.google.common.base.Strings; import com.google.inject.Inject; import com.zaxxer.hikari.HikariDataSource; import lombok.SneakyThrows; -import nu.marginalia.wmsa.edge.dbcommon.EdgeDataStoreDao; -import nu.marginalia.wmsa.edge.model.EdgeDomain; -import nu.marginalia.wmsa.edge.model.id.EdgeId; +import nu.marginalia.model.EdgeDomain; +import nu.marginalia.model.dbcommon.DbDomainQueries; +import nu.marginalia.model.id.EdgeId; import org.apache.commons.io.IOUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -20,14 +20,14 @@ import static java.lang.Integer.parseInt; public class ScreenshotService { - private final EdgeDataStoreDao edgeDataStoreDao; + private final DbDomainQueries domainQueries; private final HikariDataSource dataSource; private final Logger logger = LoggerFactory.getLogger(getClass()); @Inject - public ScreenshotService(EdgeDataStoreDao edgeDataStoreDao, HikariDataSource dataSource) { - this.edgeDataStoreDao = edgeDataStoreDao; + public ScreenshotService(DbDomainQueries dbDomainQueries, HikariDataSource dataSource) { + this.domainQueries = dbDomainQueries; this.dataSource = dataSource; } @@ -87,7 +87,7 @@ public class ScreenshotService { private Object serveSvgPlaceholder(Response response, int id) { - var domainName = edgeDataStoreDao.getDomain(new EdgeId<>(id)).map(Object::toString); + var domainName = domainQueries.getDomain(new EdgeId<>(id)).map(Object::toString); if (domainName.isEmpty()) { Spark.halt(404); } diff --git a/gradle.properties b/gradle.properties new file mode 100644 index 00000000..24759993 --- /dev/null 
+++ b/gradle.properties @@ -0,0 +1,2 @@ +org.gradle.parallel=true +org.gradle.caching=false \ No newline at end of file diff --git a/index/index-forward/build.gradle b/index/index-forward/build.gradle new file mode 100644 index 00000000..371df73a --- /dev/null +++ b/index/index-forward/build.gradle @@ -0,0 +1,46 @@ +plugins { + id 'java' + id "io.freefair.lombok" version "5.3.3.3" + id 'jvm-test-suite' +} + +java { + toolchain { + languageVersion.set(JavaLanguageVersion.of(17)) + } +} + +dependencies { + implementation project(':libraries:array') + implementation project(':libraries:btree') + implementation project(':libraries:misc') + implementation project(':features:domain-ranking') + implementation project(':index:index-query') + implementation project(':index:index-journal') + implementation project(':index:lexicon') + implementation project(':common:model') + implementation project(':third-party') + + implementation libs.lombok + annotationProcessor libs.lombok + implementation libs.bundles.slf4j + + implementation libs.prometheus + implementation libs.roaringbitmap + implementation libs.fastutil + implementation libs.trove + + testImplementation libs.bundles.slf4j.test + testImplementation libs.bundles.junit + testImplementation libs.mockito +} + +test { + useJUnitPlatform() +} + +task fastTests(type: Test) { + useJUnitPlatform { + excludeTags "slow" + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/forward/ForwardIndexConverter.java b/index/index-forward/src/main/java/nu/marginalia/index/forward/ForwardIndexConverter.java similarity index 70% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/forward/ForwardIndexConverter.java rename to index/index-forward/src/main/java/nu/marginalia/index/forward/ForwardIndexConverter.java index 8d821c88..c3f2b0b4 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/forward/ForwardIndexConverter.java +++ 
b/index/index-forward/src/main/java/nu/marginalia/index/forward/ForwardIndexConverter.java @@ -1,12 +1,12 @@ -package nu.marginalia.wmsa.edge.index.postings.forward; +package nu.marginalia.index.forward; import com.upserve.uppend.blobs.NativeIO; import it.unimi.dsi.fastutil.longs.Long2IntOpenHashMap; -import nu.marginalia.util.array.LongArray; -import nu.marginalia.wmsa.edge.index.model.EdgePageDocumentsMetadata; -import nu.marginalia.wmsa.edge.index.postings.DomainRankings; -import nu.marginalia.wmsa.edge.index.postings.journal.reader.SearchIndexJournalReader; -import nu.marginalia.wmsa.edge.index.postings.journal.reader.SearchIndexJournalReaderSingleFile; +import nu.marginalia.index.journal.reader.IndexJournalReader; +import nu.marginalia.array.LongArray; +import nu.marginalia.index.journal.reader.IndexJournalReaderSingleCompressedFile; +import nu.marginalia.model.idx.EdgePageDocumentsMetadata; +import nu.marginalia.ranking.DomainRankings; import org.roaringbitmap.IntConsumer; import org.roaringbitmap.RoaringBitmap; import org.slf4j.Logger; @@ -17,8 +17,6 @@ import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; -import static nu.marginalia.wmsa.edge.index.postings.forward.ForwardIndexParameters.*; - public class ForwardIndexConverter { private final File inputFile; @@ -45,12 +43,13 @@ public class ForwardIndexConverter { public void convert() throws IOException { deleteOldFiles(); - SearchIndexJournalReaderSingleFile journalReader = new SearchIndexJournalReaderSingleFile(LongArray.mmapRead(inputFile.toPath())); - if (journalReader.fileHeader.fileSize() <= SearchIndexJournalReader.FILE_HEADER_SIZE_BYTES) { + IndexJournalReaderSingleCompressedFile journalReader = new IndexJournalReaderSingleCompressedFile(inputFile.toPath()); + if (journalReader.fileHeader().fileSize() <= IndexJournalReader.FILE_HEADER_SIZE_BYTES) { + logger.warn("Bailing: Journal is empty!"); return; } - logger.info("Converting {} {}",inputFile, 
journalReader.fileHeader); + logger.info("Converting {} {}", inputFile, journalReader.fileHeader); logger.info("Domain Rankings size = {}", domainRankings.size()); @@ -67,16 +66,16 @@ public class ForwardIndexConverter { logger.info("Creating Supplementary Indexes"); - LongArray docFileData = LongArray.mmapForWriting(outputFileDocsData, ENTRY_SIZE * docsFileId.size()); + LongArray docFileData = LongArray.mmapForWriting(outputFileDocsData, ForwardIndexParameters.ENTRY_SIZE * docsFileId.size()); journalReader.forEach(entry -> { - long entryOffset = (long) ENTRY_SIZE * docIdToIdx.get(entry.urlId()); + long entryOffset = (long) ForwardIndexParameters.ENTRY_SIZE * docIdToIdx.get(entry.urlId()); int ranking = domainRankings.getRanking(entry.domainId()); long meta = EdgePageDocumentsMetadata.encodeRank(entry.docMeta(), ranking); - docFileData.set(entryOffset + METADATA_OFFSET, meta); - docFileData.set(entryOffset + DOMAIN_OFFSET, entry.domainId()); + docFileData.set(entryOffset + ForwardIndexParameters.METADATA_OFFSET, meta); + docFileData.set(entryOffset + ForwardIndexParameters.DOMAIN_OFFSET, entry.domainId()); }); docFileData.force(); @@ -90,7 +89,7 @@ public class ForwardIndexConverter { } } - private LongArray getDocIds(Path outputFileDocs, SearchIndexJournalReader journalReader) throws IOException { + private LongArray getDocIds(Path outputFileDocs, IndexJournalReader journalReader) throws IOException { RoaringBitmap rbm = new RoaringBitmap(); journalReader.forEachUrlId(rbm::add); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/forward/ForwardIndexParameters.java b/index/index-forward/src/main/java/nu/marginalia/index/forward/ForwardIndexParameters.java similarity index 75% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/forward/ForwardIndexParameters.java rename to index/index-forward/src/main/java/nu/marginalia/index/forward/ForwardIndexParameters.java index f019a40b..ca09c440 100644 --- 
a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/forward/ForwardIndexParameters.java +++ b/index/index-forward/src/main/java/nu/marginalia/index/forward/ForwardIndexParameters.java @@ -1,8 +1,7 @@ -package nu.marginalia.wmsa.edge.index.postings.forward; +package nu.marginalia.index.forward; class ForwardIndexParameters { public static final int ENTRY_SIZE = 2; - public static final int DOMAIN_OFFSET = 0; public static final int METADATA_OFFSET = 1; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/forward/ForwardIndexReader.java b/index/index-forward/src/main/java/nu/marginalia/index/forward/ForwardIndexReader.java similarity index 70% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/forward/ForwardIndexReader.java rename to index/index-forward/src/main/java/nu/marginalia/index/forward/ForwardIndexReader.java index c4080574..e7f7f045 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/forward/ForwardIndexReader.java +++ b/index/index-forward/src/main/java/nu/marginalia/index/forward/ForwardIndexReader.java @@ -1,8 +1,8 @@ -package nu.marginalia.wmsa.edge.index.postings.forward; +package nu.marginalia.index.forward; import com.upserve.uppend.blobs.NativeIO; import gnu.trove.map.hash.TLongIntHashMap; -import nu.marginalia.util.array.LongArray; +import nu.marginalia.array.LongArray; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -10,8 +10,6 @@ import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; -import static nu.marginalia.wmsa.edge.index.postings.forward.ForwardIndexParameters.*; - public class ForwardIndexReader { private final TLongIntHashMap ids; private final LongArray data; @@ -19,9 +17,14 @@ public class ForwardIndexReader { private final Logger logger = LoggerFactory.getLogger(getClass()); public ForwardIndexReader(Path idsFile, Path dataFile) throws IOException { - if (!Files.exists(dataFile) || - 
!Files.exists(idsFile) - ) { + if (!Files.exists(dataFile)) { + logger.warn("Failed to create ForwardIndexReader, {} is absent", dataFile); + ids = null; + data = null; + return; + } + else if (!Files.exists(idsFile)) { + logger.warn("Failed to create ForwardIndexReader, {} is absent", idsFile); ids = null; data = null; return; @@ -63,14 +66,14 @@ public class ForwardIndexReader { long offset = idxForDoc(docId); if (offset < 0) return 0; - return data.get(ENTRY_SIZE * offset + METADATA_OFFSET); + return data.get(ForwardIndexParameters.ENTRY_SIZE * offset + ForwardIndexParameters.METADATA_OFFSET); } public int getDomainId(long docId) { long offset = idxForDoc(docId); if (offset < 0) return 0; - return Math.max(0, (int) data.get(ENTRY_SIZE * offset + DOMAIN_OFFSET)); + return Math.max(0, (int) data.get(ForwardIndexParameters.ENTRY_SIZE * offset + ForwardIndexParameters.DOMAIN_OFFSET)); } public DocPost docPost(long docId) { @@ -90,14 +93,14 @@ public class ForwardIndexReader { if (idx < 0) return 0; - return data.get(ENTRY_SIZE * idx + METADATA_OFFSET); + return data.get(ForwardIndexParameters.ENTRY_SIZE * idx + ForwardIndexParameters.METADATA_OFFSET); } public int domainId() { if (idx < 0) return 0; - return Math.max(0, (int) data.get(ENTRY_SIZE * idx + DOMAIN_OFFSET)); + return Math.max(0, (int) data.get(ForwardIndexParameters.ENTRY_SIZE * idx + ForwardIndexParameters.DOMAIN_OFFSET)); } } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/forward/ParamMatchingQueryFilter.java b/index/index-forward/src/main/java/nu/marginalia/index/forward/ParamMatchingQueryFilter.java similarity index 87% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/forward/ParamMatchingQueryFilter.java rename to index/index-forward/src/main/java/nu/marginalia/index/forward/ParamMatchingQueryFilter.java index 81f671c5..e1cf7eef 100644 --- 
a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/forward/ParamMatchingQueryFilter.java +++ b/index/index-forward/src/main/java/nu/marginalia/index/forward/ParamMatchingQueryFilter.java @@ -1,9 +1,9 @@ -package nu.marginalia.wmsa.edge.index.postings.forward; +package nu.marginalia.index.forward; -import nu.marginalia.wmsa.edge.index.model.EdgePageDocumentsMetadata; -import nu.marginalia.wmsa.edge.index.query.IndexQueryParams; -import nu.marginalia.wmsa.edge.index.query.filter.QueryFilterStepIf; -import nu.marginalia.wmsa.edge.model.search.domain.SpecificationLimitType; +import nu.marginalia.model.idx.EdgePageDocumentsMetadata; +import nu.marginalia.index.query.limit.SpecificationLimitType; +import nu.marginalia.index.query.IndexQueryParams; +import nu.marginalia.index.query.filter.QueryFilterStepIf; public class ParamMatchingQueryFilter implements QueryFilterStepIf { private final IndexQueryParams params; diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/postings/forward/ForwardIndexConverterTest.java b/index/index-forward/src/test/java/nu/marginalia/index/forward/ForwardIndexConverterTest.java similarity index 60% rename from marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/postings/forward/ForwardIndexConverterTest.java rename to index/index-forward/src/test/java/nu/marginalia/index/forward/ForwardIndexConverterTest.java index e5652faa..8e8bc252 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/postings/forward/ForwardIndexConverterTest.java +++ b/index/index-forward/src/test/java/nu/marginalia/index/forward/ForwardIndexConverterTest.java @@ -1,16 +1,14 @@ -package nu.marginalia.wmsa.edge.index.postings.forward; +package nu.marginalia.index.forward; import lombok.SneakyThrows; -import nu.marginalia.util.array.LongArray; -import nu.marginalia.util.dict.OffHeapDictionaryHashMap; -import nu.marginalia.util.test.TestUtil; -import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexicon; -import 
nu.marginalia.wmsa.edge.index.lexicon.journal.KeywordLexiconJournal; -import nu.marginalia.wmsa.edge.index.postings.DomainRankings; -import nu.marginalia.wmsa.edge.index.postings.journal.model.SearchIndexJournalEntry; -import nu.marginalia.wmsa.edge.index.postings.journal.model.SearchIndexJournalEntryHeader; -import nu.marginalia.wmsa.edge.index.postings.journal.reader.SearchIndexJournalReaderSingleFile; -import nu.marginalia.wmsa.edge.index.postings.journal.writer.SearchIndexJournalWriterImpl; +import nu.marginalia.dict.OffHeapDictionaryHashMap; +import nu.marginalia.index.journal.model.IndexJournalEntry; +import nu.marginalia.index.journal.writer.IndexJournalWriterImpl; +import nu.marginalia.index.journal.writer.IndexJournalWriter; +import nu.marginalia.ranking.DomainRankings; +import nu.marginalia.lexicon.KeywordLexicon; +import nu.marginalia.lexicon.journal.KeywordLexiconJournal; +import nu.marginalia.test.TestUtil; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; @@ -27,7 +25,7 @@ import static org.junit.jupiter.api.Assertions.assertEquals; class ForwardIndexConverterTest { KeywordLexicon keywordLexicon; - SearchIndexJournalWriterImpl writer; + IndexJournalWriter writer; Path indexFile; Path wordsFile1; @@ -47,12 +45,12 @@ class ForwardIndexConverterTest { dictionaryFile = Files.createTempFile("tmp", ".dict"); dictionaryFile.toFile().deleteOnExit(); - keywordLexicon = new KeywordLexicon(new KeywordLexiconJournal(dictionaryFile.toFile()), new OffHeapDictionaryHashMap(1L<<18)); + keywordLexicon = new KeywordLexicon(new KeywordLexiconJournal(dictionaryFile.toFile())); keywordLexicon.getOrInsert("0"); indexFile = Files.createTempFile("tmp", ".idx"); indexFile.toFile().deleteOnExit(); - writer = new SearchIndexJournalWriterImpl(keywordLexicon, indexFile.toFile()); + writer = new IndexJournalWriterImpl(keywordLexicon, indexFile); wordsFile1 = Files.createTempFile("words1", ".idx"); urlsFile1 = 
Files.createTempFile("urls1", ".idx"); @@ -65,12 +63,10 @@ class ForwardIndexConverterTest { keywordLexicon.commitToDisk(); - Thread.sleep(1000); writer.forceWrite(); + writer.close(); - var reader = new SearchIndexJournalReaderSingleFile(LongArray.mmapRead(indexFile)); - docsFileId = dataDir.resolve("docs-i.dat"); docsFileData = dataDir.resolve("docs-d.dat"); } @@ -87,17 +83,16 @@ class ForwardIndexConverterTest { long createId(long url, long domain) { return (domain << 32) | url; } - public void createEntry(SearchIndexJournalWriterImpl writer, KeywordLexicon keywordLexicon, int id) { + public void createEntry(IndexJournalWriter writer, KeywordLexicon keywordLexicon, int id) { int[] factors = getFactorsI(id); - var header = new SearchIndexJournalEntryHeader(factors.length, createId(id, id/20), id % 5); - long[] data = new long[factors.length*2]; - for (int i = 0; i < factors.length; i++) { - data[2*i] = keywordLexicon.getOrInsert(Integer.toString(factors[i])); - data[2*i + 1] = -factors[i]; + var entryBuilder = IndexJournalEntry.builder(createId(id, id/20), id%5); + + for (int i = 0; i+1 < factors.length; i+=2) { + entryBuilder.add(keywordLexicon.getOrInsert(Integer.toString(factors[i])), -factors[i+1]); } - writer.put(header, new SearchIndexJournalEntry(data)); + writer.put(entryBuilder.build()); } @Test diff --git a/index/index-forward/src/test/java/nu/marginalia/test/TestUtil.java b/index/index-forward/src/test/java/nu/marginalia/test/TestUtil.java new file mode 100644 index 00000000..44a489bb --- /dev/null +++ b/index/index-forward/src/test/java/nu/marginalia/test/TestUtil.java @@ -0,0 +1,50 @@ +package nu.marginalia.test; + + +import java.io.File; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.Arrays; + +public class TestUtil { + private static boolean isTempDir(Path dir) { + return dir.startsWith("/tmp") || dir.toString().contains("tmp"); + } + + public static void clearTempDir(Path dir) { + if 
(!isTempDir(dir)) { + throw new IllegalArgumentException("Refusing to recursively delete directory with that name"); + } + if (Files.isDirectory(dir)) { + for (File f : dir.toFile().listFiles()) { + File[] files = f.listFiles(); + if (files != null) { + Arrays.stream(files).map(File::toPath).forEach(TestUtil::clearTempDir); + } + System.out.println("Deleting " + f + " (" + fileSize(f.toPath()) + ")"); + f.delete(); + } + } + System.out.println("Deleting " + dir); + dir.toFile().delete(); + } + + private static String fileSize(Path path) { + try { + long sizeBytes = Files.size(path); + + if (sizeBytes > 1024 * 1024 * 1024) return round(sizeBytes / 1073741824.) + "Gb"; + if (sizeBytes > 1024 * 1024) return round(sizeBytes / 1048576.) + "Mb"; + if (sizeBytes > 1024) return round(sizeBytes / 1024.) + "Kb"; + return sizeBytes + "b"; + } + catch (IOException ex) { + throw new RuntimeException(ex); + } + } + + private static String round(double d) { + return String.format("%.2f", d); + } +} diff --git a/index/index-journal/build.gradle b/index/index-journal/build.gradle new file mode 100644 index 00000000..6ec38550 --- /dev/null +++ b/index/index-journal/build.gradle @@ -0,0 +1,44 @@ +plugins { + id 'java' + id "io.freefair.lombok" version "5.3.3.3" + id 'jvm-test-suite' +} + +java { + toolchain { + languageVersion.set(JavaLanguageVersion.of(17)) + } +} + +dependencies { + implementation project(':libraries:array') + implementation project(':common:model') + implementation project(':third-party') + implementation project(':index:lexicon') + + implementation libs.lombok + annotationProcessor libs.lombok + implementation libs.bundles.slf4j + + implementation libs.prometheus + implementation libs.notnull + implementation libs.rxjava + implementation libs.trove + implementation libs.zstd + implementation libs.commons.lang3 + implementation libs.roaringbitmap + + testImplementation libs.bundles.slf4j.test + testImplementation libs.bundles.junit + testImplementation 
libs.mockito +} + +test { + useJUnitPlatform() +} + +task fastTests(type: Test) { + useJUnitPlatform { + excludeTags "slow" + } +} diff --git a/index/index-journal/src/main/java/nu.marginalia.index/journal/model/IndexJournalEntry.java b/index/index-journal/src/main/java/nu.marginalia.index/journal/model/IndexJournalEntry.java new file mode 100644 index 00000000..dd0b8e1b --- /dev/null +++ b/index/index-journal/src/main/java/nu.marginalia.index/journal/model/IndexJournalEntry.java @@ -0,0 +1,28 @@ +package nu.marginalia.index.journal.model; + +import nu.marginalia.model.EdgeDomain; +import nu.marginalia.model.EdgeUrl; +import nu.marginalia.model.id.EdgeId; + +public record IndexJournalEntry(IndexJournalEntryHeader header, IndexJournalEntryData data) { + + public static IndexJournalEntryBuilder builder(long documentId, long documentMeta) { + return new IndexJournalEntryBuilder(documentId, documentMeta); + } + + public static IndexJournalEntryBuilder builder(int domainId, + int urlId, + long documentMeta) { + + + return builder(new EdgeId<>(domainId), new EdgeId<>(urlId), documentMeta); + } + + public static IndexJournalEntryBuilder builder(EdgeId domainId, + EdgeId urlId, + long documentMeta) { + + + return new IndexJournalEntryBuilder(IndexJournalEntryHeader.combineIds(domainId, urlId), documentMeta); + } +} diff --git a/index/index-journal/src/main/java/nu.marginalia.index/journal/model/IndexJournalEntryBuilder.java b/index/index-journal/src/main/java/nu.marginalia.index/journal/model/IndexJournalEntryBuilder.java new file mode 100644 index 00000000..f5f1fc22 --- /dev/null +++ b/index/index-journal/src/main/java/nu.marginalia.index/journal/model/IndexJournalEntryBuilder.java @@ -0,0 +1,34 @@ +package nu.marginalia.index.journal.model; + +import gnu.trove.list.array.TLongArrayList; + +public class IndexJournalEntryBuilder { + private final long documentId; + private final long documentMeta; + private final TLongArrayList items = new TLongArrayList(); + + public 
IndexJournalEntryBuilder(long documentId, long documentMeta) { + this.documentId = documentId; + this.documentMeta = documentMeta; + } + + public IndexJournalEntryBuilder capacity(int size) { + items.ensureCapacity(size); + return this; + } + + public IndexJournalEntryBuilder add(long wordId, long metadata) { + + items.add(wordId); + items.add(metadata); + + return this; + } + + public IndexJournalEntry build() { + return new IndexJournalEntry( + new IndexJournalEntryHeader(items.size(), documentId, documentMeta), + new IndexJournalEntryData(items.toArray()) + ); + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/journal/model/SearchIndexJournalEntry.java b/index/index-journal/src/main/java/nu.marginalia.index/journal/model/IndexJournalEntryData.java similarity index 77% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/journal/model/SearchIndexJournalEntry.java rename to index/index-journal/src/main/java/nu.marginalia.index/journal/model/IndexJournalEntryData.java index 6fe28af3..423626ce 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/journal/model/SearchIndexJournalEntry.java +++ b/index/index-journal/src/main/java/nu.marginalia.index/journal/model/IndexJournalEntryData.java @@ -1,32 +1,33 @@ -package nu.marginalia.wmsa.edge.index.postings.journal.model; +package nu.marginalia.index.journal.model; +import java.io.DataOutputStream; +import java.io.IOException; import java.nio.ByteBuffer; import java.util.Arrays; import java.util.Iterator; -public class SearchIndexJournalEntry implements Iterable { +public class IndexJournalEntryData implements Iterable { private final int size; private final long[] underlyingArray; public static final int MAX_LENGTH = 1000; public static final int ENTRY_SIZE = 2; - public SearchIndexJournalEntry(long[] underlyingArray) { + public IndexJournalEntryData(long[] underlyingArray) { this.size = underlyingArray.length; this.underlyingArray = 
underlyingArray; } - public SearchIndexJournalEntry(int size, long[] underlyingArray) { + public IndexJournalEntryData(int size, long[] underlyingArray) { this.size = size; this.underlyingArray = underlyingArray; } - public void write(ByteBuffer buffer) { + public void write(DataOutputStream dos) throws IOException { for (int i = 0; i < size; i++) { - buffer.putLong(underlyingArray[i]); + dos.writeLong(underlyingArray[i]); } } - public long get(int idx) { if (idx >= size) throw new ArrayIndexOutOfBoundsException(); diff --git a/index/index-journal/src/main/java/nu.marginalia.index/journal/model/IndexJournalEntryHeader.java b/index/index-journal/src/main/java/nu.marginalia.index/journal/model/IndexJournalEntryHeader.java new file mode 100644 index 00000000..bbc81a17 --- /dev/null +++ b/index/index-journal/src/main/java/nu.marginalia.index/journal/model/IndexJournalEntryHeader.java @@ -0,0 +1,20 @@ +package nu.marginalia.index.journal.model; + +import nu.marginalia.model.EdgeDomain; +import nu.marginalia.model.EdgeUrl; +import nu.marginalia.model.id.EdgeId; + +public record IndexJournalEntryHeader(int entrySize, long combinedId, long documentMeta) { + + public IndexJournalEntryHeader(EdgeId domainId, EdgeId urlId, long documentMeta) { + this(-1, combineIds(domainId, urlId), documentMeta); + } + + static long combineIds(EdgeId domainId, EdgeId urlId) { + long did = domainId.id(); + long uid = urlId.id(); + + return (did << 32L) | uid; + } + +} diff --git a/index/index-journal/src/main/java/nu.marginalia.index/journal/model/IndexJournalFileHeader.java b/index/index-journal/src/main/java/nu.marginalia.index/journal/model/IndexJournalFileHeader.java new file mode 100644 index 00000000..42ae60b4 --- /dev/null +++ b/index/index-journal/src/main/java/nu.marginalia.index/journal/model/IndexJournalFileHeader.java @@ -0,0 +1,4 @@ +package nu.marginalia.index.journal.model; + +public record IndexJournalFileHeader(long fileSize, long wordCount) { +} diff --git 
a/index/index-journal/src/main/java/nu.marginalia.index/journal/model/IndexJournalStatistics.java b/index/index-journal/src/main/java/nu.marginalia.index/journal/model/IndexJournalStatistics.java new file mode 100644 index 00000000..9eb28473 --- /dev/null +++ b/index/index-journal/src/main/java/nu.marginalia.index/journal/model/IndexJournalStatistics.java @@ -0,0 +1,3 @@ +package nu.marginalia.index.journal.model; + +public record IndexJournalStatistics(int highestWord, int documentCardinality) { } diff --git a/index/index-journal/src/main/java/nu.marginalia.index/journal/reader/IndexJournalReadEntry.java b/index/index-journal/src/main/java/nu.marginalia.index/journal/reader/IndexJournalReadEntry.java new file mode 100644 index 00000000..9610dea4 --- /dev/null +++ b/index/index-journal/src/main/java/nu.marginalia.index/journal/reader/IndexJournalReadEntry.java @@ -0,0 +1,60 @@ +package nu.marginalia.index.journal.reader; + +import nu.marginalia.index.journal.model.IndexJournalEntryData; +import nu.marginalia.index.journal.model.IndexJournalEntryHeader; + +import java.io.DataInputStream; +import java.io.IOException; + +public class IndexJournalReadEntry { + public final IndexJournalEntryHeader header; + + private final long[] buffer; + + public IndexJournalReadEntry(IndexJournalEntryHeader header, long[] buffer) { + this.header = header; + this.buffer = buffer; + } + + + public static IndexJournalReadEntry read(DataInputStream inputStream) throws IOException { + + final long sizeBlock = inputStream.readLong(); + final long docId = inputStream.readLong(); + final long meta = inputStream.readLong(); + + var header = new IndexJournalEntryHeader( + (int) (sizeBlock >>> 32L), + docId, + meta); + + long[] buffer = new long[header.entrySize()]; + + for (int i = 0; i < header.entrySize(); i++) { + buffer[i] = inputStream.readLong(); + } + + return new IndexJournalReadEntry(header, buffer); + } + + public long docId() { + return header.combinedId(); + } + + public long 
docMeta() { + return header.documentMeta(); + } + + public int domainId() { + return (int) (docId() >>> 32L); + } + + public int urlId() { + return (int) (docId() & 0xFFFF_FFFFL); + } + + public IndexJournalEntryData readEntry() { + return new IndexJournalEntryData(header.entrySize(), buffer); + } + +} diff --git a/index/index-journal/src/main/java/nu.marginalia.index/journal/reader/IndexJournalReader.java b/index/index-journal/src/main/java/nu.marginalia.index/journal/reader/IndexJournalReader.java new file mode 100644 index 00000000..1467c500 --- /dev/null +++ b/index/index-journal/src/main/java/nu.marginalia.index/journal/reader/IndexJournalReader.java @@ -0,0 +1,48 @@ +package nu.marginalia.index.journal.reader; + +import nu.marginalia.index.journal.model.IndexJournalEntryData; +import nu.marginalia.index.journal.model.IndexJournalFileHeader; +import nu.marginalia.index.journal.model.IndexJournalStatistics; +import org.jetbrains.annotations.NotNull; + +import java.io.IOException; +import java.util.Iterator; +import java.util.function.IntConsumer; + +public interface IndexJournalReader extends Iterable { + int FILE_HEADER_SIZE_LONGS = 2; + int FILE_HEADER_SIZE_BYTES = 8 * FILE_HEADER_SIZE_LONGS; + + IndexJournalFileHeader fileHeader(); + + IndexJournalStatistics getStatistics(); + + void forEachWordId(IntConsumer consumer); + + void forEachUrlIdWordId(BiIntConsumer consumer); + + void forEachDocIdWordId(LongIntConsumer consumer); + + void forEachDocIdRecord(LongObjectConsumer consumer); + + void forEachUrlId(IntConsumer consumer); + + @NotNull + @Override + Iterator iterator(); + + void close() throws IOException; + + interface BiIntConsumer { + void accept(int left, int right); + } + + interface LongIntConsumer { + void accept(long left, int right); + } + + interface LongObjectConsumer { + void accept(long left, T right); + } + +} diff --git a/index/index-journal/src/main/java/nu.marginalia.index/journal/reader/IndexJournalReaderSingleCompressedFile.java 
b/index/index-journal/src/main/java/nu.marginalia.index/journal/reader/IndexJournalReaderSingleCompressedFile.java new file mode 100644 index 00000000..6c6bf58b --- /dev/null +++ b/index/index-journal/src/main/java/nu.marginalia.index/journal/reader/IndexJournalReaderSingleCompressedFile.java @@ -0,0 +1,197 @@ +package nu.marginalia.index.journal.reader; + +import com.github.luben.zstd.ZstdInputStream; +import lombok.SneakyThrows; +import nu.marginalia.index.journal.model.IndexJournalEntryData; +import nu.marginalia.index.journal.model.IndexJournalFileHeader; +import nu.marginalia.index.journal.model.IndexJournalStatistics; +import org.jetbrains.annotations.NotNull; +import org.roaringbitmap.longlong.Roaring64Bitmap; + +import java.io.*; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.StandardOpenOption; +import java.util.Iterator; +import java.util.function.IntConsumer; +import java.util.function.Predicate; + +public class IndexJournalReaderSingleCompressedFile implements IndexJournalReader { + + private static Path journalFile; + public final IndexJournalFileHeader fileHeader; + + private DataInputStream dataInputStream = null; + + final Predicate entryPredicate; + final Predicate recordPredicate; + + public IndexJournalReaderSingleCompressedFile(Path file) throws IOException { + fileHeader = readHeader(file); + + this.recordPredicate = null; + this.entryPredicate = null; + } + + public IndexJournalReaderSingleCompressedFile(Path file, Predicate entryPredicate, Predicate recordPredicate) throws IOException { + journalFile = file; + fileHeader = readHeader(file); + + var fileInputStream = Files.newInputStream(file, StandardOpenOption.READ); + fileInputStream.skipNBytes(FILE_HEADER_SIZE_BYTES); + + this.recordPredicate = recordPredicate; + this.entryPredicate = entryPredicate; + } + + private static IndexJournalFileHeader readHeader(Path file) throws IOException { + journalFile = file; + + try (var raf = new 
RandomAccessFile(file.toFile(), "r")) { + long unused = raf.readLong(); + long wordCount = raf.readLong(); + + return new IndexJournalFileHeader(unused, wordCount); + } + } + + private static DataInputStream createInputStream(Path file) throws IOException { + var fileInputStream = Files.newInputStream(file, StandardOpenOption.READ); + + // skip the header + fileInputStream.skipNBytes(16); + + return new DataInputStream(new ZstdInputStream(fileInputStream)); + } + + public IndexJournalFileHeader fileHeader() { + return fileHeader; + } + + public boolean filter(IndexJournalReadEntry entry) { + return entryPredicate == null || entryPredicate.test(entry); + } + + public boolean filter(IndexJournalReadEntry entry, IndexJournalEntryData.Record record) { + return (entryPredicate == null || entryPredicate.test(entry)) + && (recordPredicate == null || recordPredicate.test(record)); + } + + public void close() throws IOException { + dataInputStream.close(); + } + + + @Override + public IndexJournalStatistics getStatistics() { + int highestWord = 0; + + // Docs cardinality is a candidate for a HyperLogLog + Roaring64Bitmap docsBitmap = new Roaring64Bitmap(); + + for (var entry : this) { + var entryData = entry.readEntry(); + + if (filter(entry)) { + docsBitmap.addLong(entry.docId() & 0x0000_0000_FFFF_FFFFL); + + for (var item : entryData) { + if (filter(entry, item)) { + highestWord = Integer.max(item.wordId(), highestWord); + } + } + } + } + + return new IndexJournalStatistics(highestWord, docsBitmap.getIntCardinality()); + } + + @Override + public void forEachWordId(IntConsumer consumer) { + for (var entry : this) { + var data = entry.readEntry(); + for (var post : data) { + if (filter(entry, post)) { + consumer.accept(post.wordId()); + } + } + } + } + + @Override + public void forEachUrlIdWordId(BiIntConsumer consumer) { + for (var entry : this) { + var data = entry.readEntry(); + + for (var post : data) { + if (filter(entry, post)) { + consumer.accept(entry.urlId(), 
post.wordId()); + } + } + } + } + + @Override + public void forEachDocIdWordId(LongIntConsumer consumer) { + for (var entry : this) { + var data = entry.readEntry(); + + for (var post : data) { + if (filter(entry, post)) { + consumer.accept(entry.docId(), post.wordId()); + } + } + } + } + + @Override + public void forEachDocIdRecord(LongObjectConsumer consumer) { + for (var entry : this) { + var data = entry.readEntry(); + + for (var post : data) { + if (filter(entry, post)) { + consumer.accept(entry.docId(), post); + } + } + } + } + @Override + public void forEachUrlId(IntConsumer consumer) { + for (var entry : this) { + if (filter(entry)) { + consumer.accept(entry.urlId()); + } + } + } + + @SneakyThrows + @NotNull + @Override + public Iterator iterator() { + if (dataInputStream != null) { + dataInputStream.close(); + } + dataInputStream = createInputStream(journalFile); + + return new JournalEntryIterator(); + } + + private class JournalEntryIterator implements Iterator { + private int i = 0; + + @Override + @SneakyThrows + public boolean hasNext() { + return i < fileHeader.fileSize(); + } + + @SneakyThrows + @Override + public IndexJournalReadEntry next() { + i++; + return IndexJournalReadEntry.read(dataInputStream); + } + } + +} diff --git a/index/index-journal/src/main/java/nu.marginalia.index/journal/writer/IndexJournalWriter.java b/index/index-journal/src/main/java/nu.marginalia.index/journal/writer/IndexJournalWriter.java new file mode 100644 index 00000000..03b98d52 --- /dev/null +++ b/index/index-journal/src/main/java/nu.marginalia.index/journal/writer/IndexJournalWriter.java @@ -0,0 +1,20 @@ +package nu.marginalia.index.journal.writer; + +import nu.marginalia.index.journal.model.IndexJournalEntry; +import nu.marginalia.index.journal.model.IndexJournalEntryData; +import nu.marginalia.index.journal.model.IndexJournalEntryHeader; + +import java.io.IOException; + +public interface IndexJournalWriter { + void put(IndexJournalEntryHeader header, 
IndexJournalEntryData entry); + default void put(IndexJournalEntry entry) { + put(entry.header(), entry.data()); + } + + void forceWrite() throws IOException; + + void flushWords(); + void close() throws IOException; + +} diff --git a/index/index-journal/src/main/java/nu.marginalia.index/journal/writer/IndexJournalWriterImpl.java b/index/index-journal/src/main/java/nu.marginalia.index/journal/writer/IndexJournalWriterImpl.java new file mode 100644 index 00000000..c9bf44cd --- /dev/null +++ b/index/index-journal/src/main/java/nu.marginalia.index/journal/writer/IndexJournalWriterImpl.java @@ -0,0 +1,73 @@ +package nu.marginalia.index.journal.writer; + +import com.github.luben.zstd.ZstdOutputStream; +import lombok.SneakyThrows; +import nu.marginalia.index.journal.model.IndexJournalEntryData; +import nu.marginalia.index.journal.model.IndexJournalEntryHeader; +import nu.marginalia.index.journal.reader.IndexJournalReader; +import nu.marginalia.lexicon.KeywordLexicon; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.*; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.StandardOpenOption; + +public class IndexJournalWriterImpl implements IndexJournalWriter{ + private final KeywordLexicon lexicon; + private final Path outputFile; + private final DataOutputStream outputStream; + + private final Logger logger = LoggerFactory.getLogger(getClass()); + + private int numEntries = 0; + + public IndexJournalWriterImpl(KeywordLexicon lexicon, Path outputFile) throws IOException { + this.lexicon = lexicon; + this.outputFile = outputFile; + + var fileStream = Files.newOutputStream(outputFile, StandardOpenOption.CREATE); + + writeHeaderPlaceholder(fileStream); + + outputStream = new DataOutputStream(new ZstdOutputStream(fileStream)); + } + + private static void writeHeaderPlaceholder(OutputStream fileStream) throws IOException { + fileStream.write(new byte[IndexJournalReader.FILE_HEADER_SIZE_BYTES]); + } + + @Override + 
@SneakyThrows + public void put(IndexJournalEntryHeader header, IndexJournalEntryData entry) { + outputStream.writeInt(entry.size()); + outputStream.writeInt(0); + outputStream.writeLong(header.combinedId()); + outputStream.writeLong(header.documentMeta()); + entry.write(outputStream); + + numEntries++; + } + + @Override + public void forceWrite() throws IOException { + outputStream.flush(); + + try (var raf = new RandomAccessFile(outputFile.toFile(), "rws")) { + raf.writeLong(numEntries); + raf.writeLong(lexicon.size()); + } + } + + @Override + public void flushWords() { + lexicon.commitToDisk(); + } + + public void close() throws IOException { + forceWrite(); + + outputStream.close(); + } +} diff --git a/index/index-journal/src/test/java/nu/marginalia/index/journal/IndexJournalTest.java b/index/index-journal/src/test/java/nu/marginalia/index/journal/IndexJournalTest.java new file mode 100644 index 00000000..67b23dee --- /dev/null +++ b/index/index-journal/src/test/java/nu/marginalia/index/journal/IndexJournalTest.java @@ -0,0 +1,133 @@ +package nu.marginalia.index.journal; + +import nu.marginalia.index.journal.model.IndexJournalEntry; +import nu.marginalia.index.journal.model.IndexJournalEntryData; +import nu.marginalia.index.journal.reader.IndexJournalReader; +import nu.marginalia.index.journal.reader.IndexJournalReaderSingleCompressedFile; +import nu.marginalia.index.journal.writer.IndexJournalWriterImpl; +import nu.marginalia.lexicon.KeywordLexicon; +import org.apache.commons.lang3.tuple.Pair; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.mockito.Mockito; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.List; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +public class IndexJournalTest { + Path tempFile; + KeywordLexicon lexicon; + IndexJournalReader reader; + + @BeforeEach 
+ public void setUp() throws IOException { + tempFile = Files.createTempFile(getClass().getSimpleName(), ".dat"); + lexicon = Mockito.mock(KeywordLexicon.class); + + var journalWriter = new IndexJournalWriterImpl(lexicon, tempFile); + journalWriter.put(IndexJournalEntry.builder(44, 10, 55) + .add(1, 2) + .add(2, 3) + .add(3, 4) + .add(5, 6).build()); + + journalWriter.put(IndexJournalEntry.builder(43, 15, 10) + .add(5, 5) + .add(6, 6) + .build()); + journalWriter.forceWrite(); + journalWriter.close(); + + reader = new IndexJournalReaderSingleCompressedFile(tempFile); + } + @AfterEach + public void tearDown() throws IOException { + reader.close(); + Files.delete(tempFile); + } + + @Test + public void reiterable() { + // Verifies that the reader can be run twice to the same effect + + int cnt = 0; + int cnt2 = 0; + + for (var item : reader) cnt++; + for (var item : reader) cnt2++; + + assertEquals(cnt2, cnt); + } + + @Test + public void forEachUrlId() { + List expected = List.of(10, 15); + List actual = new ArrayList<>(); + + reader.forEachUrlId(actual::add); + assertEquals(expected, actual); + } + + @Test + public void forEachWordId() { + List expected = List.of(1, 2, 3, 5, 5 ,6); + List actual = new ArrayList<>(); + + reader.forEachWordId(actual::add); + assertEquals(expected, actual); + } + + + @Test + public void forEachUrlIdWordId() { + List> expected = List.of( + Pair.of(10, 1), + Pair.of(10, 2), + Pair.of(10, 3), + Pair.of(10, 5), + Pair.of(15, 5), + Pair.of(15, 6)); + List> actual = new ArrayList<>(); + + reader.forEachUrlIdWordId((url, word) -> actual.add(Pair.of(url, word))); + assertEquals(expected, actual); + } + + @Test + public void forEachDocIdWordId() { + List> expected = List.of( + Pair.of(10L | (44L << 32), 1), + Pair.of(10L | (44L << 32), 2), + Pair.of(10L | (44L << 32), 3), + Pair.of(10L | (44L << 32), 5), + Pair.of(15L | (43L << 32), 5), + Pair.of(15L | (43L << 32), 6)); + List> actual = new ArrayList<>(); + + reader.forEachDocIdWordId((url, 
word) -> actual.add(Pair.of(url, word))); + assertEquals(expected, actual); + } + + @Test + public void forEachDocIdRecord() { + List> expected = List.of( + Pair.of(10L | (44L << 32), new IndexJournalEntryData.Record(1, 2)), + Pair.of(10L | (44L << 32), new IndexJournalEntryData.Record(2, 3)), + Pair.of(10L | (44L << 32), new IndexJournalEntryData.Record(3, 4)), + Pair.of(10L | (44L << 32), new IndexJournalEntryData.Record(5, 6)), + Pair.of(15L | (43L << 32), new IndexJournalEntryData.Record(5, 5)), + Pair.of(15L | (43L << 32), new IndexJournalEntryData.Record(6, 6)) + ); + List> actual = new ArrayList<>(); + + reader.forEachDocIdRecord((url, word) -> actual.add(Pair.of(url, word))); + assertEquals(expected, actual); + } + +} diff --git a/index/index-query/build.gradle b/index/index-query/build.gradle new file mode 100644 index 00000000..0558edbe --- /dev/null +++ b/index/index-query/build.gradle @@ -0,0 +1,36 @@ +plugins { + id 'java' + id "io.freefair.lombok" version "5.3.3.3" + + id 'jvm-test-suite' +} + +java { + toolchain { + languageVersion.set(JavaLanguageVersion.of(17)) + } +} + +dependencies { + implementation project(':libraries:array') + + implementation libs.lombok + annotationProcessor libs.lombok + implementation libs.bundles.slf4j + + implementation libs.prometheus + + testImplementation libs.bundles.slf4j.test + testImplementation libs.bundles.junit + testImplementation libs.mockito +} + +test { + useJUnitPlatform() +} + +task fastTests(type: Test) { + useJUnitPlatform { + excludeTags "slow" + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/query/EmptyEntrySource.java b/index/index-query/src/main/java/nu/marginalia/index/query/EmptyEntrySource.java similarity index 73% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/query/EmptyEntrySource.java rename to index/index-query/src/main/java/nu/marginalia/index/query/EmptyEntrySource.java index f38b4c0d..a1e39069 100644 --- 
a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/query/EmptyEntrySource.java +++ b/index/index-query/src/main/java/nu/marginalia/index/query/EmptyEntrySource.java @@ -1,6 +1,6 @@ -package nu.marginalia.wmsa.edge.index.query; +package nu.marginalia.index.query; -import nu.marginalia.util.array.buffer.LongQueryBuffer; +import nu.marginalia.array.buffer.LongQueryBuffer; public class EmptyEntrySource implements EntrySource { @Override diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/query/EntrySource.java b/index/index-query/src/main/java/nu/marginalia/index/query/EntrySource.java similarity index 54% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/query/EntrySource.java rename to index/index-query/src/main/java/nu/marginalia/index/query/EntrySource.java index 5ec62c05..68c09187 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/query/EntrySource.java +++ b/index/index-query/src/main/java/nu/marginalia/index/query/EntrySource.java @@ -1,6 +1,6 @@ -package nu.marginalia.wmsa.edge.index.query; +package nu.marginalia.index.query; -import nu.marginalia.util.array.buffer.LongQueryBuffer; +import nu.marginalia.array.buffer.LongQueryBuffer; public interface EntrySource { void skip(int n); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/query/EntrySourceFromArrayRange.java b/index/index-query/src/main/java/nu/marginalia/index/query/EntrySourceFromArrayRange.java similarity index 90% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/query/EntrySourceFromArrayRange.java rename to index/index-query/src/main/java/nu/marginalia/index/query/EntrySourceFromArrayRange.java index a0d9ee32..270df865 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/query/EntrySourceFromArrayRange.java +++ b/index/index-query/src/main/java/nu/marginalia/index/query/EntrySourceFromArrayRange.java @@ -1,7 +1,7 @@ -package nu.marginalia.wmsa.edge.index.query; +package 
nu.marginalia.index.query; -import nu.marginalia.util.array.LongArray; -import nu.marginalia.util.array.buffer.LongQueryBuffer; +import nu.marginalia.array.LongArray; +import nu.marginalia.array.buffer.LongQueryBuffer; import static java.lang.Math.min; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/query/IndexQuery.java b/index/index-query/src/main/java/nu/marginalia/index/query/IndexQuery.java similarity index 88% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/query/IndexQuery.java rename to index/index-query/src/main/java/nu/marginalia/index/query/IndexQuery.java index b2994d36..b8df5fc7 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/query/IndexQuery.java +++ b/index/index-query/src/main/java/nu/marginalia/index/query/IndexQuery.java @@ -1,8 +1,7 @@ -package nu.marginalia.wmsa.edge.index.query; +package nu.marginalia.index.query; -import nu.marginalia.util.array.buffer.LongQueryBuffer; -import nu.marginalia.wmsa.edge.index.query.EntrySource; -import nu.marginalia.wmsa.edge.index.query.filter.QueryFilterStepIf; +import nu.marginalia.index.query.filter.QueryFilterStepIf; +import nu.marginalia.array.buffer.LongQueryBuffer; import java.util.ArrayList; import java.util.List; @@ -64,6 +63,7 @@ public class IndexQuery { public long dataCost() { return dataCost; } + public String toString() { StringBuilder sb = new StringBuilder(); sb.append("Sources:\n"); diff --git a/index/index-query/src/main/java/nu/marginalia/index/query/IndexQueryBuilder.java b/index/index-query/src/main/java/nu/marginalia/index/query/IndexQueryBuilder.java new file mode 100644 index 00000000..dddc1e3b --- /dev/null +++ b/index/index-query/src/main/java/nu/marginalia/index/query/IndexQueryBuilder.java @@ -0,0 +1,12 @@ +package nu.marginalia.index.query; + +import nu.marginalia.index.query.filter.QueryFilterStepIf; + +public interface IndexQueryBuilder { + IndexQueryBuilder also(int termId); + + IndexQueryBuilder not(int termId); 
+ IndexQueryBuilder addInclusionFilter(QueryFilterStepIf filterStep); + + IndexQuery build(); +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/query/IndexQueryParams.java b/index/index-query/src/main/java/nu/marginalia/index/query/IndexQueryParams.java similarity index 61% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/query/IndexQueryParams.java rename to index/index-query/src/main/java/nu/marginalia/index/query/IndexQueryParams.java index 031410fc..1b840815 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/query/IndexQueryParams.java +++ b/index/index-query/src/main/java/nu/marginalia/index/query/IndexQueryParams.java @@ -1,8 +1,8 @@ -package nu.marginalia.wmsa.edge.index.query; +package nu.marginalia.index.query; -import nu.marginalia.wmsa.edge.index.model.QueryStrategy; -import nu.marginalia.wmsa.edge.index.svc.searchset.SearchSet; -import nu.marginalia.wmsa.edge.model.search.domain.SpecificationLimit; +import nu.marginalia.index.query.limit.QueryStrategy; +import nu.marginalia.index.searchset.SearchSet; +import nu.marginalia.index.query.limit.SpecificationLimit; public record IndexQueryParams(SpecificationLimit qualityLimit, SpecificationLimit year, diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/query/IndexSearchBudget.java b/index/index-query/src/main/java/nu/marginalia/index/query/IndexSearchBudget.java similarity index 85% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/query/IndexSearchBudget.java rename to index/index-query/src/main/java/nu/marginalia/index/query/IndexSearchBudget.java index dfcbf06f..5551839e 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/query/IndexSearchBudget.java +++ b/index/index-query/src/main/java/nu/marginalia/index/query/IndexSearchBudget.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.index.query; +package nu.marginalia.index.query; public class IndexSearchBudget { diff --git 
a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/query/filter/QueryFilterAnyOf.java b/index/index-query/src/main/java/nu/marginalia/index/query/filter/QueryFilterAnyOf.java similarity index 94% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/query/filter/QueryFilterAnyOf.java rename to index/index-query/src/main/java/nu/marginalia/index/query/filter/QueryFilterAnyOf.java index 293fe7d0..2569ec42 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/query/filter/QueryFilterAnyOf.java +++ b/index/index-query/src/main/java/nu/marginalia/index/query/filter/QueryFilterAnyOf.java @@ -1,6 +1,6 @@ -package nu.marginalia.wmsa.edge.index.query.filter; +package nu.marginalia.index.query.filter; -import nu.marginalia.util.array.buffer.LongQueryBuffer; +import nu.marginalia.array.buffer.LongQueryBuffer; import java.util.Arrays; import java.util.List; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/query/filter/QueryFilterLetThrough.java b/index/index-query/src/main/java/nu/marginalia/index/query/filter/QueryFilterLetThrough.java similarity index 90% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/query/filter/QueryFilterLetThrough.java rename to index/index-query/src/main/java/nu/marginalia/index/query/filter/QueryFilterLetThrough.java index 3f471cd8..688ef938 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/query/filter/QueryFilterLetThrough.java +++ b/index/index-query/src/main/java/nu/marginalia/index/query/filter/QueryFilterLetThrough.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.index.query.filter; +package nu.marginalia.index.query.filter; public class QueryFilterLetThrough implements QueryFilterStepIf { static final QueryFilterStepIf instance = new QueryFilterLetThrough(); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/query/filter/QueryFilterNoPass.java 
b/index/index-query/src/main/java/nu/marginalia/index/query/filter/QueryFilterNoPass.java similarity index 79% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/query/filter/QueryFilterNoPass.java rename to index/index-query/src/main/java/nu/marginalia/index/query/filter/QueryFilterNoPass.java index 4ad69531..1bcd04ae 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/query/filter/QueryFilterNoPass.java +++ b/index/index-query/src/main/java/nu/marginalia/index/query/filter/QueryFilterNoPass.java @@ -1,6 +1,6 @@ -package nu.marginalia.wmsa.edge.index.query.filter; +package nu.marginalia.index.query.filter; -import nu.marginalia.util.array.buffer.LongQueryBuffer; +import nu.marginalia.array.buffer.LongQueryBuffer; public class QueryFilterNoPass implements QueryFilterStepIf { static final QueryFilterStepIf instance = new QueryFilterNoPass(); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/query/filter/QueryFilterStepExcludeFromPredicate.java b/index/index-query/src/main/java/nu/marginalia/index/query/filter/QueryFilterStepExcludeFromPredicate.java similarity index 90% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/query/filter/QueryFilterStepExcludeFromPredicate.java rename to index/index-query/src/main/java/nu/marginalia/index/query/filter/QueryFilterStepExcludeFromPredicate.java index 8cb4561f..92c8c972 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/query/filter/QueryFilterStepExcludeFromPredicate.java +++ b/index/index-query/src/main/java/nu/marginalia/index/query/filter/QueryFilterStepExcludeFromPredicate.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.index.query.filter; +package nu.marginalia.index.query.filter; import java.util.function.LongPredicate; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/query/filter/QueryFilterStepFromPredicate.java 
b/index/index-query/src/main/java/nu/marginalia/index/query/filter/QueryFilterStepFromPredicate.java similarity index 90% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/query/filter/QueryFilterStepFromPredicate.java rename to index/index-query/src/main/java/nu/marginalia/index/query/filter/QueryFilterStepFromPredicate.java index 26207152..56f08b71 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/query/filter/QueryFilterStepFromPredicate.java +++ b/index/index-query/src/main/java/nu/marginalia/index/query/filter/QueryFilterStepFromPredicate.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.index.query.filter; +package nu.marginalia.index.query.filter; import java.util.function.LongPredicate; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/query/filter/QueryFilterStepIf.java b/index/index-query/src/main/java/nu/marginalia/index/query/filter/QueryFilterStepIf.java similarity index 91% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/query/filter/QueryFilterStepIf.java rename to index/index-query/src/main/java/nu/marginalia/index/query/filter/QueryFilterStepIf.java index 9af75a7f..e3692538 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/query/filter/QueryFilterStepIf.java +++ b/index/index-query/src/main/java/nu/marginalia/index/query/filter/QueryFilterStepIf.java @@ -1,6 +1,6 @@ -package nu.marginalia.wmsa.edge.index.query.filter; +package nu.marginalia.index.query.filter; -import nu.marginalia.util.array.buffer.LongQueryBuffer; +import nu.marginalia.array.buffer.LongQueryBuffer; import java.util.List; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/QueryLimits.java b/index/index-query/src/main/java/nu/marginalia/index/query/limit/QueryLimits.java similarity index 68% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/QueryLimits.java rename to 
index/index-query/src/main/java/nu/marginalia/index/query/limit/QueryLimits.java index dce78343..b403e5ea 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/QueryLimits.java +++ b/index/index-query/src/main/java/nu/marginalia/index/query/limit/QueryLimits.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.index.model; +package nu.marginalia.index.query.limit; public record QueryLimits(int resultsByDomain, int resultsTotal, int timeoutMs, int fetchSize) { } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/QueryStrategy.java b/index/index-query/src/main/java/nu/marginalia/index/query/limit/QueryStrategy.java similarity index 76% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/QueryStrategy.java rename to index/index-query/src/main/java/nu/marginalia/index/query/limit/QueryStrategy.java index d8682a61..e7f69252 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/QueryStrategy.java +++ b/index/index-query/src/main/java/nu/marginalia/index/query/limit/QueryStrategy.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.index.model; +package nu.marginalia.index.query.limit; public enum QueryStrategy { SENTENCE, diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/domain/SpecificationLimit.java b/index/index-query/src/main/java/nu/marginalia/index/query/limit/SpecificationLimit.java similarity index 95% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/domain/SpecificationLimit.java rename to index/index-query/src/main/java/nu/marginalia/index/query/limit/SpecificationLimit.java index 5a9a587b..1af0b10a 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/domain/SpecificationLimit.java +++ b/index/index-query/src/main/java/nu/marginalia/index/query/limit/SpecificationLimit.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.model.search.domain; +package nu.marginalia.index.query.limit; 
public record SpecificationLimit(SpecificationLimitType type, int value) { public static SpecificationLimit none() { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/domain/SpecificationLimitType.java b/index/index-query/src/main/java/nu/marginalia/index/query/limit/SpecificationLimitType.java similarity index 63% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/domain/SpecificationLimitType.java rename to index/index-query/src/main/java/nu/marginalia/index/query/limit/SpecificationLimitType.java index 24c2fd12..a47675c7 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/domain/SpecificationLimitType.java +++ b/index/index-query/src/main/java/nu/marginalia/index/query/limit/SpecificationLimitType.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.model.search.domain; +package nu.marginalia.index.query.limit; public enum SpecificationLimitType { NONE, diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/searchset/SearchSet.java b/index/index-query/src/main/java/nu/marginalia/index/searchset/SearchSet.java similarity index 55% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/searchset/SearchSet.java rename to index/index-query/src/main/java/nu/marginalia/index/searchset/SearchSet.java index 8f412374..0cc40e4d 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/searchset/SearchSet.java +++ b/index/index-query/src/main/java/nu/marginalia/index/searchset/SearchSet.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.index.svc.searchset; +package nu.marginalia.index.searchset; public interface SearchSet { boolean contains(int urlId); diff --git a/index/index-reverse/build.gradle b/index/index-reverse/build.gradle new file mode 100644 index 00000000..a719b5f4 --- /dev/null +++ b/index/index-reverse/build.gradle @@ -0,0 +1,43 @@ +plugins { + id 'java' + id "io.freefair.lombok" version "5.3.3.3" + + id 'jvm-test-suite' 
+} + +java { + toolchain { + languageVersion.set(JavaLanguageVersion.of(17)) + } +} + +dependencies { + implementation project(':libraries:array') + implementation project(':libraries:btree') + implementation project(':libraries:misc') + implementation project(':features:domain-ranking') + implementation project(':index:index-query') + implementation project(':index:index-journal') + implementation project(':index:lexicon') + implementation project(':common:model') + + implementation libs.lombok + annotationProcessor libs.lombok + implementation libs.bundles.slf4j + + implementation libs.prometheus + + testImplementation libs.bundles.slf4j.test + testImplementation libs.bundles.junit + testImplementation libs.mockito +} + +test { + useJUnitPlatform() +} + +task fastTests(type: Test) { + useJUnitPlatform { + excludeTags "slow" + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/reverse/ReverseIndexConverter.java b/index/index-reverse/src/main/java/nu/marginalia/index/reverse/ReverseIndexConverter.java similarity index 81% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/reverse/ReverseIndexConverter.java rename to index/index-reverse/src/main/java/nu/marginalia/index/reverse/ReverseIndexConverter.java index 2cf79112..0a9005fe 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/reverse/ReverseIndexConverter.java +++ b/index/index-reverse/src/main/java/nu/marginalia/index/reverse/ReverseIndexConverter.java @@ -1,18 +1,18 @@ -package nu.marginalia.wmsa.edge.index.postings.reverse; +package nu.marginalia.index.reverse; +import lombok.SneakyThrows; +import nu.marginalia.index.journal.model.IndexJournalEntryData; +import nu.marginalia.index.journal.model.IndexJournalStatistics; +import nu.marginalia.index.journal.reader.IndexJournalReader; +import nu.marginalia.ranking.DomainRankings; import nu.marginalia.util.RandomWriteFunnel; -import nu.marginalia.util.array.IntArray; -import 
nu.marginalia.util.array.LongArray; -import nu.marginalia.util.array.algo.SortingContext; -import nu.marginalia.util.array.functional.LongBinaryIOOperation; -import nu.marginalia.util.array.functional.LongIOTransformer; -import nu.marginalia.util.array.functional.LongTransformer; -import nu.marginalia.util.btree.BTreeWriter; -import nu.marginalia.wmsa.edge.index.postings.DomainRankings; -import nu.marginalia.wmsa.edge.index.postings.journal.model.SearchIndexJournalEntry; -import nu.marginalia.wmsa.edge.index.postings.journal.model.SearchIndexJournalStatistics; -import nu.marginalia.wmsa.edge.index.postings.journal.reader.SearchIndexJournalReader; -import nu.marginalia.wmsa.edge.index.postings.journal.reader.SearchIndexJournalReaderSingleFile; +import nu.marginalia.array.IntArray; +import nu.marginalia.array.LongArray; +import nu.marginalia.array.algo.SortingContext; +import nu.marginalia.array.functional.LongBinaryIOOperation; +import nu.marginalia.array.functional.LongIOTransformer; +import nu.marginalia.array.functional.LongTransformer; +import nu.marginalia.btree.BTreeWriter; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -22,9 +22,6 @@ import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.StandardOpenOption; -import static nu.marginalia.wmsa.edge.index.postings.reverse.ReverseIndexParameters.ENTRY_SIZE; -import static nu.marginalia.wmsa.edge.index.postings.reverse.ReverseIndexParameters.bTreeContext; - public class ReverseIndexConverter { private static final int RWF_BIN_SIZE = 10_000_000; @@ -32,14 +29,14 @@ public class ReverseIndexConverter { private final Logger logger = LoggerFactory.getLogger(getClass()); - private final SearchIndexJournalReaderSingleFile journalReader; + private final IndexJournalReader journalReader; private final DomainRankings domainRankings; private final Path outputFileWords; private final Path outputFileDocs; private final SortingContext sortingContext; public ReverseIndexConverter(Path 
tmpFileDir, - SearchIndexJournalReaderSingleFile journalReader, + IndexJournalReader journalReader, DomainRankings domainRankings, Path outputFileWords, Path outputFileDocs) { @@ -54,11 +51,12 @@ public class ReverseIndexConverter { public void convert() throws IOException { deleteOldFiles(); - if (journalReader.fileHeader.fileSize() <= SearchIndexJournalReader.FILE_HEADER_SIZE_BYTES) { + if (journalReader.fileHeader().fileSize() <= IndexJournalReader.FILE_HEADER_SIZE_BYTES) { + logger.warn("Bailing: Journal is empty!"); return; } - final SearchIndexJournalStatistics statistics = journalReader.getStatistics(); + final IndexJournalStatistics statistics = journalReader.getStatistics(); final Path intermediateUrlsFile = Files.createTempFile(tmpFileDir, "urls-sorted", ".dat"); @@ -97,7 +95,7 @@ public class ReverseIndexConverter { { LongArray intermediateDocs = LongArray.mmapForWriting(intermediateUrlsFile); wordsOffsets.foldIO(0, 0, wordsFileSize, (s, e) -> { - intermediateDocs.sortLargeSpanN(sortingContext, ENTRY_SIZE, s, e); + intermediateDocs.sortLargeSpanN(sortingContext, ReverseIndexParameters.ENTRY_SIZE, s, e); return e; }); intermediateDocs.force(); @@ -138,7 +136,7 @@ public class ReverseIndexConverter { public long apply(long start, long end) throws IOException { if (end == start) return end; - size += bTreeContext.calculateSize((int) (end - start) / ENTRY_SIZE); + size += ReverseIndexParameters.bTreeContext.calculateSize((int) (end - start) / ReverseIndexParameters.ENTRY_SIZE); return end; } @@ -154,7 +152,7 @@ public class ReverseIndexConverter { @Override public long transform(long pos, long count) { - return (offset += ENTRY_SIZE * count); + return (offset += ReverseIndexParameters.ENTRY_SIZE * count); } } @@ -192,7 +190,7 @@ public class ReverseIndexConverter { } } - private class IntermediateIndexConstructor implements SearchIndexJournalReaderSingleFile.LongObjectConsumer, AutoCloseable { + private class IntermediateIndexConstructor implements 
IndexJournalReader.LongObjectConsumer, AutoCloseable { private final LongArray wordRangeEnds; private final IntArray wordRangeOffset; @@ -208,8 +206,9 @@ public class ReverseIndexConverter { this.documentsFile = documentsFile; } + @SneakyThrows @Override - public void accept(long docId, SearchIndexJournalEntry.Record record) { + public void accept(long docId, IndexJournalEntryData.Record record) { /* Encode the ID as * diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/reverse/ReverseIndexParameters.java b/index/index-reverse/src/main/java/nu/marginalia/index/reverse/ReverseIndexParameters.java similarity index 60% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/reverse/ReverseIndexParameters.java rename to index/index-reverse/src/main/java/nu/marginalia/index/reverse/ReverseIndexParameters.java index e38fa3b5..1ee6578c 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/reverse/ReverseIndexParameters.java +++ b/index/index-reverse/src/main/java/nu/marginalia/index/reverse/ReverseIndexParameters.java @@ -1,6 +1,6 @@ -package nu.marginalia.wmsa.edge.index.postings.reverse; +package nu.marginalia.index.reverse; -import nu.marginalia.util.btree.model.BTreeContext; +import nu.marginalia.btree.model.BTreeContext; class ReverseIndexParameters { public static final int ENTRY_SIZE = 2; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/reverse/ReverseIndexPrefixEntrySource.java b/index/index-reverse/src/main/java/nu/marginalia/index/reverse/ReverseIndexPrefixEntrySource.java similarity index 81% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/reverse/ReverseIndexPrefixEntrySource.java rename to index/index-reverse/src/main/java/nu/marginalia/index/reverse/ReverseIndexPrefixEntrySource.java index 0cae45ab..359e91cc 100644 --- 
a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/reverse/ReverseIndexPrefixEntrySource.java +++ b/index/index-reverse/src/main/java/nu/marginalia/index/reverse/ReverseIndexPrefixEntrySource.java @@ -1,8 +1,8 @@ -package nu.marginalia.wmsa.edge.index.postings.reverse; +package nu.marginalia.index.reverse; -import nu.marginalia.util.array.buffer.LongQueryBuffer; -import nu.marginalia.util.btree.BTreeReader; -import nu.marginalia.wmsa.edge.index.query.EntrySource; +import nu.marginalia.array.buffer.LongQueryBuffer; +import nu.marginalia.btree.BTreeReader; +import nu.marginalia.index.query.EntrySource; import static java.lang.Math.min; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/reverse/ReverseIndexPrioReader.java b/index/index-reverse/src/main/java/nu/marginalia/index/reverse/ReverseIndexPrioReader.java similarity index 75% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/reverse/ReverseIndexPrioReader.java rename to index/index-reverse/src/main/java/nu/marginalia/index/reverse/ReverseIndexPrioReader.java index b2ce74dc..2a38a9dd 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/reverse/ReverseIndexPrioReader.java +++ b/index/index-reverse/src/main/java/nu/marginalia/index/reverse/ReverseIndexPrioReader.java @@ -1,11 +1,11 @@ -package nu.marginalia.wmsa.edge.index.postings.reverse; +package nu.marginalia.index.reverse; -import nu.marginalia.util.array.LongArray; -import nu.marginalia.util.btree.BTreeReader; -import nu.marginalia.wmsa.edge.index.postings.reverse.query.ReverseIndexEntrySource; -import nu.marginalia.wmsa.edge.index.postings.reverse.query.ReverseIndexEntrySourceBehavior; -import nu.marginalia.wmsa.edge.index.query.EmptyEntrySource; -import nu.marginalia.wmsa.edge.index.query.EntrySource; +import nu.marginalia.index.reverse.query.ReverseIndexEntrySourceBehavior; +import nu.marginalia.index.reverse.query.ReverseIndexEntrySource; +import 
nu.marginalia.index.query.EntrySource; +import nu.marginalia.array.LongArray; +import nu.marginalia.btree.BTreeReader; +import nu.marginalia.index.query.EmptyEntrySource; import org.slf4j.Logger; import org.slf4j.LoggerFactory; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/reverse/ReverseIndexPriorityParameters.java b/index/index-reverse/src/main/java/nu/marginalia/index/reverse/ReverseIndexPriorityParameters.java similarity index 62% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/reverse/ReverseIndexPriorityParameters.java rename to index/index-reverse/src/main/java/nu/marginalia/index/reverse/ReverseIndexPriorityParameters.java index b6d6fb38..60b5cf80 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/reverse/ReverseIndexPriorityParameters.java +++ b/index/index-reverse/src/main/java/nu/marginalia/index/reverse/ReverseIndexPriorityParameters.java @@ -1,7 +1,7 @@ -package nu.marginalia.wmsa.edge.index.postings.reverse; +package nu.marginalia.index.reverse; -import nu.marginalia.wmsa.edge.index.model.EdgePageWordFlags; -import nu.marginalia.wmsa.edge.index.postings.journal.model.SearchIndexJournalEntry; +import nu.marginalia.index.journal.model.IndexJournalEntryData; +import nu.marginalia.model.crawl.EdgePageWordFlags; public class ReverseIndexPriorityParameters { private static final long highPriorityFlags = EdgePageWordFlags.Title.asBit() @@ -11,7 +11,7 @@ public class ReverseIndexPriorityParameters { | EdgePageWordFlags.Site.asBit() | EdgePageWordFlags.SiteAdjacent.asBit(); - public static boolean filterPriorityRecord(SearchIndexJournalEntry.Record record) { + public static boolean filterPriorityRecord(IndexJournalEntryData.Record record) { long meta = record.metadata(); return (meta & highPriorityFlags) != 0; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/reverse/ReverseIndexReader.java 
b/index/index-reverse/src/main/java/nu/marginalia/index/reverse/ReverseIndexReader.java similarity index 77% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/reverse/ReverseIndexReader.java rename to index/index-reverse/src/main/java/nu/marginalia/index/reverse/ReverseIndexReader.java index 6f4475e7..7c078e9c 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/reverse/ReverseIndexReader.java +++ b/index/index-reverse/src/main/java/nu/marginalia/index/reverse/ReverseIndexReader.java @@ -1,16 +1,16 @@ -package nu.marginalia.wmsa.edge.index.postings.reverse; +package nu.marginalia.index.reverse; -import nu.marginalia.util.array.LongArray; -import nu.marginalia.util.btree.BTreeReader; -import nu.marginalia.wmsa.edge.index.postings.reverse.query.ReverseIndexEntrySource; -import nu.marginalia.wmsa.edge.index.postings.reverse.query.ReverseIndexEntrySourceBehavior; -import nu.marginalia.wmsa.edge.index.postings.reverse.query.ReverseIndexRejectFilter; -import nu.marginalia.wmsa.edge.index.postings.reverse.query.ReverseIndexRetainFilter; -import nu.marginalia.wmsa.edge.index.query.EmptyEntrySource; -import nu.marginalia.wmsa.edge.index.query.EntrySource; -import nu.marginalia.wmsa.edge.index.query.filter.QueryFilterLetThrough; -import nu.marginalia.wmsa.edge.index.query.filter.QueryFilterNoPass; -import nu.marginalia.wmsa.edge.index.query.filter.QueryFilterStepIf; +import nu.marginalia.index.reverse.query.ReverseIndexEntrySourceBehavior; +import nu.marginalia.index.reverse.query.ReverseIndexEntrySource; +import nu.marginalia.index.reverse.query.ReverseIndexRejectFilter; +import nu.marginalia.index.reverse.query.ReverseIndexRetainFilter; +import nu.marginalia.array.LongArray; +import nu.marginalia.btree.BTreeReader; +import nu.marginalia.index.query.EmptyEntrySource; +import nu.marginalia.index.query.EntrySource; +import nu.marginalia.index.query.filter.QueryFilterLetThrough; +import 
nu.marginalia.index.query.filter.QueryFilterNoPass; +import nu.marginalia.index.query.filter.QueryFilterStepIf; import org.slf4j.Logger; import org.slf4j.LoggerFactory; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/reverse/query/ReverseIndexEntrySource.java b/index/index-reverse/src/main/java/nu/marginalia/index/reverse/query/ReverseIndexEntrySource.java similarity index 87% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/reverse/query/ReverseIndexEntrySource.java rename to index/index-reverse/src/main/java/nu/marginalia/index/reverse/query/ReverseIndexEntrySource.java index a47b134a..13cdc3af 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/reverse/query/ReverseIndexEntrySource.java +++ b/index/index-reverse/src/main/java/nu/marginalia/index/reverse/query/ReverseIndexEntrySource.java @@ -1,8 +1,8 @@ -package nu.marginalia.wmsa.edge.index.postings.reverse.query; +package nu.marginalia.index.reverse.query; -import nu.marginalia.util.array.buffer.LongQueryBuffer; -import nu.marginalia.util.btree.BTreeReader; -import nu.marginalia.wmsa.edge.index.query.EntrySource; +import nu.marginalia.array.buffer.LongQueryBuffer; +import nu.marginalia.btree.BTreeReader; +import nu.marginalia.index.query.EntrySource; import static java.lang.Math.min; diff --git a/index/index-reverse/src/main/java/nu/marginalia/index/reverse/query/ReverseIndexEntrySourceBehavior.java b/index/index-reverse/src/main/java/nu/marginalia/index/reverse/query/ReverseIndexEntrySourceBehavior.java new file mode 100644 index 00000000..67058fed --- /dev/null +++ b/index/index-reverse/src/main/java/nu/marginalia/index/reverse/query/ReverseIndexEntrySourceBehavior.java @@ -0,0 +1,11 @@ +package nu.marginalia.index.reverse.query; + +public enum ReverseIndexEntrySourceBehavior { + /** Eagerly read from this entry source */ + DO_PREFER, + + /** Do not use this entry source if entries have been fetched + * from another 
entry source + */ + DO_NOT_PREFER +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/reverse/query/ReverseIndexRejectFilter.java b/index/index-reverse/src/main/java/nu/marginalia/index/reverse/query/ReverseIndexRejectFilter.java similarity index 68% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/reverse/query/ReverseIndexRejectFilter.java rename to index/index-reverse/src/main/java/nu/marginalia/index/reverse/query/ReverseIndexRejectFilter.java index ca317349..0ad4112f 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/reverse/query/ReverseIndexRejectFilter.java +++ b/index/index-reverse/src/main/java/nu/marginalia/index/reverse/query/ReverseIndexRejectFilter.java @@ -1,8 +1,8 @@ -package nu.marginalia.wmsa.edge.index.postings.reverse.query; +package nu.marginalia.index.reverse.query; -import nu.marginalia.util.array.buffer.LongQueryBuffer; -import nu.marginalia.util.btree.BTreeReader; -import nu.marginalia.wmsa.edge.index.query.filter.QueryFilterStepIf; +import nu.marginalia.array.buffer.LongQueryBuffer; +import nu.marginalia.btree.BTreeReader; +import nu.marginalia.index.query.filter.QueryFilterStepIf; public record ReverseIndexRejectFilter(BTreeReader range) implements QueryFilterStepIf { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/reverse/query/ReverseIndexRetainFilter.java b/index/index-reverse/src/main/java/nu/marginalia/index/reverse/query/ReverseIndexRetainFilter.java similarity index 68% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/reverse/query/ReverseIndexRetainFilter.java rename to index/index-reverse/src/main/java/nu/marginalia/index/reverse/query/ReverseIndexRetainFilter.java index 9c408a34..a9a14dad 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/reverse/query/ReverseIndexRetainFilter.java +++ 
b/index/index-reverse/src/main/java/nu/marginalia/index/reverse/query/ReverseIndexRetainFilter.java @@ -1,8 +1,8 @@ -package nu.marginalia.wmsa.edge.index.postings.reverse.query; +package nu.marginalia.index.reverse.query; -import nu.marginalia.util.array.buffer.LongQueryBuffer; -import nu.marginalia.util.btree.BTreeReader; -import nu.marginalia.wmsa.edge.index.query.filter.QueryFilterStepIf; +import nu.marginalia.array.buffer.LongQueryBuffer; +import nu.marginalia.btree.BTreeReader; +import nu.marginalia.index.query.filter.QueryFilterStepIf; public record ReverseIndexRetainFilter(BTreeReader range) implements QueryFilterStepIf { diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/postings/reverse/ReverseIndexConverterTest.java b/index/index-reverse/src/test/java/nu/marginalia/index/reverse/ReverseIndexConverterTest.java similarity index 67% rename from marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/postings/reverse/ReverseIndexConverterTest.java rename to index/index-reverse/src/test/java/nu/marginalia/index/reverse/ReverseIndexConverterTest.java index 32fcb58b..a50bae9c 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/postings/reverse/ReverseIndexConverterTest.java +++ b/index/index-reverse/src/test/java/nu/marginalia/index/reverse/ReverseIndexConverterTest.java @@ -1,19 +1,18 @@ -package nu.marginalia.wmsa.edge.index.postings.reverse; +package nu.marginalia.index.reverse; import lombok.SneakyThrows; -import nu.marginalia.util.array.LongArray; -import nu.marginalia.util.array.buffer.LongQueryBuffer; -import nu.marginalia.util.dict.OffHeapDictionaryHashMap; -import nu.marginalia.util.test.TestUtil; -import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexicon; -import nu.marginalia.wmsa.edge.index.lexicon.journal.KeywordLexiconJournal; -import nu.marginalia.wmsa.edge.index.model.EdgePageDocumentsMetadata; -import nu.marginalia.wmsa.edge.index.postings.DomainRankings; -import 
nu.marginalia.wmsa.edge.index.postings.journal.model.SearchIndexJournalEntry; -import nu.marginalia.wmsa.edge.index.postings.journal.model.SearchIndexJournalEntryHeader; -import nu.marginalia.wmsa.edge.index.postings.journal.reader.SearchIndexJournalReaderSingleFile; -import nu.marginalia.wmsa.edge.index.postings.journal.writer.SearchIndexJournalWriterImpl; -import nu.marginalia.wmsa.edge.index.postings.reverse.query.ReverseIndexEntrySourceBehavior; +import nu.marginalia.array.buffer.LongQueryBuffer; +import nu.marginalia.dict.OffHeapDictionaryHashMap; +import nu.marginalia.index.journal.model.IndexJournalEntry; +import nu.marginalia.index.journal.reader.IndexJournalReaderSingleCompressedFile; +import nu.marginalia.index.journal.writer.IndexJournalWriterImpl; +import nu.marginalia.index.journal.writer.IndexJournalWriter; +import nu.marginalia.index.reverse.query.ReverseIndexEntrySourceBehavior; +import nu.marginalia.ranking.DomainRankings; +import nu.marginalia.lexicon.KeywordLexicon; +import nu.marginalia.lexicon.journal.KeywordLexiconJournal; +import nu.marginalia.model.idx.EdgePageDocumentsMetadata; +import nu.marginalia.test.TestUtil; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; import org.slf4j.Logger; @@ -29,7 +28,6 @@ import static org.junit.jupiter.api.Assertions.assertArrayEquals; class ReverseIndexConverterTest { KeywordLexicon keywordLexicon; - SearchIndexJournalWriterImpl writer; Path indexFile; Path wordsFile1; @@ -44,40 +42,38 @@ class ReverseIndexConverterTest { dictionaryFile = Files.createTempFile("tmp", ".dict"); dictionaryFile.toFile().deleteOnExit(); - keywordLexicon = new KeywordLexicon(new KeywordLexiconJournal(dictionaryFile.toFile()), new OffHeapDictionaryHashMap(1L<<16)); + keywordLexicon = new KeywordLexicon(new KeywordLexiconJournal(dictionaryFile.toFile())); keywordLexicon.getOrInsert("0"); indexFile = Files.createTempFile("tmp", ".idx"); indexFile.toFile().deleteOnExit(); - writer = new 
SearchIndexJournalWriterImpl(keywordLexicon, indexFile.toFile()); + wordsFile1 = Files.createTempFile("words1", ".idx"); urlsFile1 = Files.createTempFile("urls1", ".idx"); } - public void createEntry(SearchIndexJournalWriterImpl writer, KeywordLexicon keywordLexicon, int id) { + public void createEntry(IndexJournalWriter writer, KeywordLexicon keywordLexicon, int id) { int[] factors = IntStream.rangeClosed(1, id).filter(v -> (id % v) == 0).toArray(); - var header = new SearchIndexJournalEntryHeader(factors.length, id, EdgePageDocumentsMetadata.defaultValue()); - long[] data = new long[factors.length*2]; + var entryBuilder = IndexJournalEntry.builder(id, EdgePageDocumentsMetadata.defaultValue()); + for (int i = 0; i < factors.length; i++) { - data[2*i] = keywordLexicon.getOrInsert(Integer.toString(factors[i])); - data[2*i + 1] = factors[i]; + entryBuilder.add(keywordLexicon.getOrInsert(Integer.toString(factors[i])), -factors[i]); } - writer.put(header, new SearchIndexJournalEntry(data)); + writer.put(entryBuilder.build()); } @Test - void testReverseIndex() throws IOException, InterruptedException { + void testReverseIndex() throws IOException { + var writer = new IndexJournalWriterImpl(keywordLexicon, indexFile); + for (int i = 1; i < 512; i++) { createEntry(writer, keywordLexicon, i); } - - keywordLexicon.commitToDisk(); - Thread.sleep(1000); - writer.forceWrite(); + writer.close(); Path tmpDir = Path.of("/tmp"); @@ -85,7 +81,7 @@ class ReverseIndexConverterTest { var wordsFile = dataDir.resolve("urls.dat"); var docsFile = dataDir.resolve("docs.dat"); - var journalReader = new SearchIndexJournalReaderSingleFile(LongArray.mmapRead(indexFile)); + var journalReader = new IndexJournalReaderSingleCompressedFile(indexFile); new ReverseIndexConverter(tmpDir, journalReader, new DomainRankings(), wordsFile, docsFile) .convert(); diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/postings/reverse/ReverseIndexConverterTest2.java 
b/index/index-reverse/src/test/java/nu/marginalia/index/reverse/ReverseIndexConverterTest2.java similarity index 67% rename from marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/postings/reverse/ReverseIndexConverterTest2.java rename to index/index-reverse/src/test/java/nu/marginalia/index/reverse/ReverseIndexConverterTest2.java index 2525d39b..bdd08524 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/postings/reverse/ReverseIndexConverterTest2.java +++ b/index/index-reverse/src/test/java/nu/marginalia/index/reverse/ReverseIndexConverterTest2.java @@ -1,18 +1,18 @@ -package nu.marginalia.wmsa.edge.index.postings.reverse; +package nu.marginalia.index.reverse; import lombok.SneakyThrows; -import nu.marginalia.util.array.LongArray; -import nu.marginalia.util.array.buffer.LongQueryBuffer; -import nu.marginalia.util.dict.OffHeapDictionaryHashMap; -import nu.marginalia.util.test.TestUtil; -import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexicon; -import nu.marginalia.wmsa.edge.index.lexicon.journal.KeywordLexiconJournal; -import nu.marginalia.wmsa.edge.index.postings.DomainRankings; -import nu.marginalia.wmsa.edge.index.postings.journal.model.SearchIndexJournalEntry; -import nu.marginalia.wmsa.edge.index.postings.journal.model.SearchIndexJournalEntryHeader; -import nu.marginalia.wmsa.edge.index.postings.journal.reader.SearchIndexJournalReaderSingleFile; -import nu.marginalia.wmsa.edge.index.postings.journal.writer.SearchIndexJournalWriterImpl; -import nu.marginalia.wmsa.edge.index.postings.reverse.query.ReverseIndexEntrySourceBehavior; +import nu.marginalia.array.buffer.LongQueryBuffer; +import nu.marginalia.dict.OffHeapDictionaryHashMap; +import nu.marginalia.index.journal.model.IndexJournalEntryData; +import nu.marginalia.index.journal.model.IndexJournalEntryHeader; +import nu.marginalia.index.journal.reader.IndexJournalReaderSingleCompressedFile; +import nu.marginalia.index.journal.writer.IndexJournalWriterImpl; +import 
nu.marginalia.index.journal.writer.IndexJournalWriter; +import nu.marginalia.index.reverse.query.ReverseIndexEntrySourceBehavior; +import nu.marginalia.ranking.DomainRankings; +import nu.marginalia.lexicon.KeywordLexicon; +import nu.marginalia.lexicon.journal.KeywordLexiconJournal; +import nu.marginalia.test.TestUtil; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; @@ -29,7 +29,7 @@ import java.util.stream.LongStream; class ReverseIndexConverterTest2 { KeywordLexicon keywordLexicon; - SearchIndexJournalWriterImpl writer; + IndexJournalWriter writer; Path indexFile; Path wordsFile1; @@ -51,12 +51,12 @@ class ReverseIndexConverterTest2 { dictionaryFile = Files.createTempFile("tmp", ".dict"); dictionaryFile.toFile().deleteOnExit(); - keywordLexicon = new KeywordLexicon(new KeywordLexiconJournal(dictionaryFile.toFile()), new OffHeapDictionaryHashMap(1L<<18)); + keywordLexicon = new KeywordLexicon(new KeywordLexiconJournal(dictionaryFile.toFile())); keywordLexicon.getOrInsert("0"); indexFile = Files.createTempFile("tmp", ".idx"); indexFile.toFile().deleteOnExit(); - writer = new SearchIndexJournalWriterImpl(keywordLexicon, indexFile.toFile()); + writer = new IndexJournalWriterImpl(keywordLexicon, indexFile); wordsFile1 = Files.createTempFile("words1", ".idx"); urlsFile1 = Files.createTempFile("urls1", ".idx"); @@ -76,7 +76,7 @@ class ReverseIndexConverterTest2 { Thread.sleep(1000); writer.forceWrite(); - var reader = new SearchIndexJournalReaderSingleFile(LongArray.mmapRead(indexFile)); + var reader = new IndexJournalReaderSingleCompressedFile(indexFile); wordsFile = dataDir.resolve("words.dat"); docsFile = dataDir.resolve("docs.dat"); @@ -97,9 +97,9 @@ class ReverseIndexConverterTest2 { long createId(long url, long domain) { return (domain << 32) | url; } - public void createEntry(SearchIndexJournalWriterImpl writer, KeywordLexicon keywordLexicon, int id) { + public void createEntry(IndexJournalWriter 
writer, KeywordLexicon keywordLexicon, int id) { int[] factors = getFactorsI(id); - var header = new SearchIndexJournalEntryHeader(factors.length, createId(id, id/20), id % 5); + var header = new IndexJournalEntryHeader(factors.length, createId(id, id/20), id % 5); long[] data = new long[factors.length*2]; for (int i = 0; i < factors.length; i++) { @@ -107,7 +107,7 @@ class ReverseIndexConverterTest2 { data[2*i + 1] = (i % 21 != 0) ? 0 : -factors[i]; } - writer.put(header, new SearchIndexJournalEntry(data)); + writer.put(header, new IndexJournalEntryData(data)); } @Test @@ -115,7 +115,7 @@ class ReverseIndexConverterTest2 { Path tmpDir = Path.of("/tmp"); - new ReverseIndexConverter(tmpDir, new SearchIndexJournalReaderSingleFile(LongArray.mmapRead(indexFile)), new DomainRankings(), wordsFile, docsFile).convert(); + new ReverseIndexConverter(tmpDir, new IndexJournalReaderSingleCompressedFile(indexFile), new DomainRankings(), wordsFile, docsFile).convert(); var reverseReader = new ReverseIndexReader(wordsFile, docsFile); @@ -140,7 +140,7 @@ class ReverseIndexConverterTest2 { Path tmpDir = Path.of("/tmp"); - new ReverseIndexConverter(tmpDir, new SearchIndexJournalReaderSingleFile(LongArray.mmapRead(indexFile), null, ReverseIndexPriorityParameters::filterPriorityRecord), new DomainRankings(), wordsFile, docsFile).convert(); + new ReverseIndexConverter(tmpDir, new IndexJournalReaderSingleCompressedFile(indexFile, null, ReverseIndexPriorityParameters::filterPriorityRecord), new DomainRankings(), wordsFile, docsFile).convert(); var reverseReader = new ReverseIndexReader(wordsFile, docsFile); diff --git a/index/index-reverse/src/test/java/nu/marginalia/test/TestUtil.java b/index/index-reverse/src/test/java/nu/marginalia/test/TestUtil.java new file mode 100644 index 00000000..44a489bb --- /dev/null +++ b/index/index-reverse/src/test/java/nu/marginalia/test/TestUtil.java @@ -0,0 +1,50 @@ +package nu.marginalia.test; + + +import java.io.File; +import java.io.IOException; 
+import java.nio.file.Files; +import java.nio.file.Path; +import java.util.Arrays; + +public class TestUtil { + private static boolean isTempDir(Path dir) { + return dir.startsWith("/tmp") || dir.toString().contains("tmp"); + } + + public static void clearTempDir(Path dir) { + if (!isTempDir(dir)) { + throw new IllegalArgumentException("Refusing to recursively delete directory with that name"); + } + if (Files.isDirectory(dir)) { + for (File f : dir.toFile().listFiles()) { + File[] files = f.listFiles(); + if (files != null) { + Arrays.stream(files).map(File::toPath).forEach(TestUtil::clearTempDir); + } + System.out.println("Deleting " + f + " (" + fileSize(f.toPath()) + ")"); + f.delete(); + } + } + System.out.println("Deleting " + dir); + dir.toFile().delete(); + } + + private static String fileSize(Path path) { + try { + long sizeBytes = Files.size(path); + + if (sizeBytes > 1024 * 1024 * 1024) return round(sizeBytes / 1073741824.) + "Gb"; + if (sizeBytes > 1024 * 1024) return round(sizeBytes / 1048576.) + "Mb"; + if (sizeBytes > 1024) return round(sizeBytes / 1024.) 
+ "Kb"; + return sizeBytes + "b"; + } + catch (IOException ex) { + throw new RuntimeException(ex); + } + } + + private static String round(double d) { + return String.format("%.2f", d); + } +} diff --git a/index/lexicon/build.gradle b/index/lexicon/build.gradle new file mode 100644 index 00000000..68646f95 --- /dev/null +++ b/index/lexicon/build.gradle @@ -0,0 +1,38 @@ +plugins { + id 'java' + id "io.freefair.lombok" version "5.3.3.3" + + id 'jvm-test-suite' +} + + +java { + toolchain { + languageVersion.set(JavaLanguageVersion.of(17)) + } +} + +dependencies { + implementation project(':libraries:misc') + + implementation libs.lombok + annotationProcessor libs.lombok + implementation libs.bundles.slf4j + + implementation libs.prometheus + implementation libs.guava + + testImplementation libs.bundles.slf4j.test + testImplementation libs.bundles.junit + testImplementation libs.mockito +} + +test { + useJUnitPlatform() +} + +task fastTests(type: Test) { + useJUnitPlatform { + excludeTags "slow" + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/KeywordLexicon.java b/index/lexicon/src/main/java/nu/marginalia/lexicon/KeywordLexicon.java similarity index 86% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/KeywordLexicon.java rename to index/lexicon/src/main/java/nu/marginalia/lexicon/KeywordLexicon.java index 5f02cb98..40f9d73b 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/KeywordLexicon.java +++ b/index/lexicon/src/main/java/nu/marginalia/lexicon/KeywordLexicon.java @@ -1,11 +1,11 @@ -package nu.marginalia.wmsa.edge.index.lexicon; +package nu.marginalia.lexicon; import com.google.common.hash.HashFunction; import com.google.common.hash.Hashing; import io.prometheus.client.Gauge; import lombok.SneakyThrows; -import nu.marginalia.util.dict.DictionaryMap; -import nu.marginalia.wmsa.edge.index.lexicon.journal.KeywordLexiconJournal; +import nu.marginalia.dict.DictionaryMap; 
+import nu.marginalia.lexicon.journal.KeywordLexiconJournal; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -30,15 +30,15 @@ public class KeywordLexicon implements AutoCloseable { private final KeywordLexiconJournal journal; @SneakyThrows - public KeywordLexicon(KeywordLexiconJournal keywordLexiconJournal, DictionaryMap reverseIndexHashMap) { + public KeywordLexicon(KeywordLexiconJournal keywordLexiconJournal) { journal = keywordLexiconJournal; - reverseIndex = reverseIndexHashMap; + reverseIndex = DictionaryMap.create(); logger.info("Creating dictionary writer"); if (!instances.compareAndSet(0, 1)) { - logger.error("MULTIPLE WRITER INSTANCES!"); + logger.error("MULTIPLE LEXICON INSTANCES!"); } journal.loadFile(bytes -> reverseIndex.put(hashFunction.hashBytes(bytes).padToLong())); @@ -60,10 +60,17 @@ public class KeywordLexicon implements AutoCloseable { final long key = hashFunction.hashBytes(bytes).padToLong(); int idx = getReadOnly(key); - if (idx >= 0) - return idx; + if (idx < 0) { + idx = insertNew(key, bytes); + } + + return idx; + } + + private int insertNew(long key, byte[] bytes) throws InterruptedException { Lock lock = memoryLock.writeLock(); + int idx; try { lock.lock(); @@ -111,7 +118,7 @@ public class KeywordLexicon implements AutoCloseable { @Override public void close() throws Exception { - logger.warn("Closing DictionaryWriter"); + logger.warn("Closing Lexicon"); journal.close(); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/KeywordLexiconReadOnlyView.java b/index/lexicon/src/main/java/nu/marginalia/lexicon/KeywordLexiconReadOnlyView.java similarity index 92% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/KeywordLexiconReadOnlyView.java rename to index/lexicon/src/main/java/nu/marginalia/lexicon/KeywordLexiconReadOnlyView.java index 485bb423..9cdef151 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/KeywordLexiconReadOnlyView.java +++ 
b/index/lexicon/src/main/java/nu/marginalia/lexicon/KeywordLexiconReadOnlyView.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.index.lexicon; +package nu.marginalia.lexicon; import com.google.common.cache.Cache; import com.google.common.cache.CacheBuilder; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/journal/KeywordLexiconJournal.java b/index/lexicon/src/main/java/nu/marginalia/lexicon/journal/KeywordLexiconJournal.java similarity index 97% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/journal/KeywordLexiconJournal.java rename to index/lexicon/src/main/java/nu/marginalia/lexicon/journal/KeywordLexiconJournal.java index c226c1e6..84a23247 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/journal/KeywordLexiconJournal.java +++ b/index/lexicon/src/main/java/nu/marginalia/lexicon/journal/KeywordLexiconJournal.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.index.lexicon.journal; +package nu.marginalia.lexicon.journal; import org.slf4j.Logger; import org.slf4j.LoggerFactory; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/journal/KeywordLexiconJournalCommitQueue.java b/index/lexicon/src/main/java/nu/marginalia/lexicon/journal/KeywordLexiconJournalCommitQueue.java similarity index 72% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/journal/KeywordLexiconJournalCommitQueue.java rename to index/lexicon/src/main/java/nu/marginalia/lexicon/journal/KeywordLexiconJournalCommitQueue.java index 67d4043a..7c6a460f 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/journal/KeywordLexiconJournalCommitQueue.java +++ b/index/lexicon/src/main/java/nu/marginalia/lexicon/journal/KeywordLexiconJournalCommitQueue.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.index.lexicon.journal; +package nu.marginalia.lexicon.journal; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -7,7 +7,7 
@@ import java.util.ArrayList; import java.util.Collections; import java.util.List; -public class KeywordLexiconJournalCommitQueue { +class KeywordLexiconJournalCommitQueue { private final ArrayList commitQueue = new ArrayList<>(10_000); private final Logger logger = LoggerFactory.getLogger(getClass()); private static final long BACK_PRESSURE_LIMIT = 25_000; @@ -25,15 +25,19 @@ public class KeywordLexiconJournalCommitQueue { public synchronized List getQueuedEntries() { - if (commitQueue.isEmpty()) + List data; + if (commitQueue.isEmpty()) { return Collections.emptyList(); - var data = new ArrayList<>(commitQueue); - commitQueue.clear(); + } + else { + data = new ArrayList<>(commitQueue); + commitQueue.clear(); + } notifyAll(); if (data.size() > BACK_PRESSURE_LIMIT) { - logger.warn("Dictionary Journal Backpressure: {}", data.size()); + logger.warn("Lexicon Journal Backpressure: {}", data.size()); } return data; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/journal/KeywordLexiconJournalFile.java b/index/lexicon/src/main/java/nu/marginalia/lexicon/journal/KeywordLexiconJournalFile.java similarity index 90% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/journal/KeywordLexiconJournalFile.java rename to index/lexicon/src/main/java/nu/marginalia/lexicon/journal/KeywordLexiconJournalFile.java index 24ca03b1..7473e4df 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/journal/KeywordLexiconJournalFile.java +++ b/index/lexicon/src/main/java/nu/marginalia/lexicon/journal/KeywordLexiconJournalFile.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.index.lexicon.journal; +package nu.marginalia.lexicon.journal; import lombok.SneakyThrows; import org.slf4j.Logger; @@ -22,20 +22,6 @@ public class KeywordLexiconJournalFile implements AutoCloseable { private final ReadWriteLock diskLock = new ReentrantReadWriteLock(); - @SneakyThrows - public static void main(String... 
args) { - if (args.length != 1) { - System.err.println("Dumps lexicon content to stdout"); - System.err.println("Arguments: filename"); - return; - } - - KeywordLexiconJournalFile lf = new KeywordLexiconJournalFile(new File(args[0])); - lf.loadFile(bytes -> { - System.out.println(new String(bytes)); - }); - } - public KeywordLexiconJournalFile(File journalFile) throws IOException { this.journalFileRAF = new RandomAccessFile(journalFile, "rw"); this.journalFile = journalFile; diff --git a/index/lexicon/src/test/java/nu/marginalia/lexicon/KeywordLexiconTest.java b/index/lexicon/src/test/java/nu/marginalia/lexicon/KeywordLexiconTest.java new file mode 100644 index 00000000..ca044e5e --- /dev/null +++ b/index/lexicon/src/test/java/nu/marginalia/lexicon/KeywordLexiconTest.java @@ -0,0 +1,77 @@ +package nu.marginalia.lexicon; + +import nu.marginalia.dict.OnHeapDictionaryMap; +import nu.marginalia.lexicon.journal.KeywordLexiconJournal; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotEquals; + +public class KeywordLexiconTest { + + private Path journalFile; + private KeywordLexicon lexicon; + + @BeforeEach + public void setUp() throws IOException { + journalFile = Files.createTempFile(getClass().getSimpleName(), ".dat"); + + var lexiconJournal = new KeywordLexiconJournal(journalFile.toFile()); + lexicon = new KeywordLexicon(lexiconJournal); + } + + @AfterEach + public void tearDown() throws Exception { + lexicon.close(); + Files.delete(journalFile); + } + + @Test + public void testConsistentInserts() { + int a = lexicon.getOrInsert("aaa"); + int b = lexicon.getOrInsert("bbb"); + int a2 = lexicon.getOrInsert("aaa"); + int c = lexicon.getOrInsert("ccc"); + + 
assertEquals(a, a2); + assertNotEquals(a, b); + assertNotEquals(a, c); + assertNotEquals(b, c); + } + + @Test + public void testInsertReplay() { + int a = lexicon.getOrInsert("aaa"); + int b = lexicon.getOrInsert("bbb"); + int c = lexicon.getOrInsert("ccc"); + + assertEquals(a, lexicon.getReadOnly("aaa")); + assertEquals(b, lexicon.getReadOnly("bbb")); + assertEquals(c, lexicon.getReadOnly("ccc")); + } + + @Test + public void testReload() throws IOException { + int a = lexicon.getOrInsert("aaa"); + int b = lexicon.getOrInsert("bbb"); + int c = lexicon.getOrInsert("ccc"); + lexicon.commitToDisk(); + + var lexiconJournal = new KeywordLexiconJournal(journalFile.toFile()); + try (var anotherLexicon = new KeywordLexicon(lexiconJournal)) { + assertEquals(a, anotherLexicon.getReadOnly("aaa")); + assertEquals(b, anotherLexicon.getReadOnly("bbb")); + assertEquals(c, anotherLexicon.getReadOnly("ccc")); + } + catch (Exception ex) { + Assertions.fail("???", ex); + } + } +} diff --git a/index/readme.md b/index/readme.md new file mode 100644 index 00000000..dce903d2 --- /dev/null +++ b/index/readme.md @@ -0,0 +1,17 @@ +# Index + +These are components that offer functionality for the [index-service](../services-core/index-service). + +## Indexes + +There are two indexes with accompanying tools for constructing them. + +* [index-forward](index-forward/) is the `document->word` index containing metadata +about each word, such as its position. +* [index-reverse](index-reverse/) is the `word->document` index. + +These indices rely heavily on the [libraries/btree](../libraries/btree) and [libraries/array](../libraries/array) components. +# Libraries +* [index-query](index-query/) contains structures for evaluating search queries. +* [index-journal](index-journal/) contains tools for writing and reading index data. +* [lexicon](lexicon/) contains a mapping between words' string representation and a unique integer identifier.
\ No newline at end of file diff --git a/libraries/array/build.gradle b/libraries/array/build.gradle new file mode 100644 index 00000000..65574dfd --- /dev/null +++ b/libraries/array/build.gradle @@ -0,0 +1,30 @@ +plugins { + id 'java' +} + +java { + toolchain { + languageVersion.set(JavaLanguageVersion.of(17)) + } +} + +dependencies { + implementation project(':third-party') + + implementation libs.lombok + annotationProcessor libs.lombok + implementation libs.bundles.slf4j + + implementation libs.notnull + implementation libs.commons.lang3 + implementation libs.fastutil + implementation libs.lz4 + + testImplementation libs.bundles.slf4j.test + testImplementation libs.bundles.junit + testImplementation libs.mockito +} + +test { + useJUnitPlatform() +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/array/IntArray.java b/libraries/array/src/main/java/nu/marginalia/array/IntArray.java similarity index 78% rename from marginalia_nu/src/main/java/nu/marginalia/util/array/IntArray.java rename to libraries/array/src/main/java/nu/marginalia/array/IntArray.java index f86d536f..7f882164 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/array/IntArray.java +++ b/libraries/array/src/main/java/nu/marginalia/array/IntArray.java @@ -1,14 +1,14 @@ -package nu.marginalia.util.array; +package nu.marginalia.array; import com.upserve.uppend.blobs.NativeIO; -import nu.marginalia.util.array.algo.IntArrayBase; -import nu.marginalia.util.array.algo.IntArraySearch; -import nu.marginalia.util.array.algo.IntArraySort; -import nu.marginalia.util.array.algo.IntArrayTransformations; -import nu.marginalia.util.array.delegate.ShiftedIntArray; -import nu.marginalia.util.array.page.IntArrayPage; -import nu.marginalia.util.array.page.PagingIntArray; -import nu.marginalia.util.array.scheme.ArrayPartitioningScheme; +import nu.marginalia.array.algo.IntArrayBase; +import nu.marginalia.array.algo.IntArraySearch; +import nu.marginalia.array.algo.IntArraySort; +import 
nu.marginalia.array.algo.IntArrayTransformations; +import nu.marginalia.array.delegate.ShiftedIntArray; +import nu.marginalia.array.page.IntArrayPage; +import nu.marginalia.array.page.PagingIntArray; +import nu.marginalia.array.scheme.ArrayPartitioningScheme; import java.io.IOException; import java.nio.file.Files; diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/array/LongArray.java b/libraries/array/src/main/java/nu/marginalia/array/LongArray.java similarity index 78% rename from marginalia_nu/src/main/java/nu/marginalia/util/array/LongArray.java rename to libraries/array/src/main/java/nu/marginalia/array/LongArray.java index 82543f4a..82a43bb1 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/array/LongArray.java +++ b/libraries/array/src/main/java/nu/marginalia/array/LongArray.java @@ -1,14 +1,14 @@ -package nu.marginalia.util.array; +package nu.marginalia.array; import com.upserve.uppend.blobs.NativeIO; -import nu.marginalia.util.array.algo.LongArrayBase; -import nu.marginalia.util.array.algo.LongArraySearch; -import nu.marginalia.util.array.algo.LongArraySort; -import nu.marginalia.util.array.algo.LongArrayTransformations; -import nu.marginalia.util.array.delegate.ShiftedLongArray; -import nu.marginalia.util.array.page.LongArrayPage; -import nu.marginalia.util.array.page.PagingLongArray; -import nu.marginalia.util.array.scheme.ArrayPartitioningScheme; +import nu.marginalia.array.algo.LongArrayBase; +import nu.marginalia.array.algo.LongArraySearch; +import nu.marginalia.array.algo.LongArraySort; +import nu.marginalia.array.algo.LongArrayTransformations; +import nu.marginalia.array.delegate.ShiftedLongArray; +import nu.marginalia.array.page.LongArrayPage; +import nu.marginalia.array.page.PagingLongArray; +import nu.marginalia.array.scheme.ArrayPartitioningScheme; import java.io.IOException; import java.nio.file.Files; diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/array/algo/BulkTransferArray.java 
b/libraries/array/src/main/java/nu/marginalia/array/algo/BulkTransferArray.java similarity index 83% rename from marginalia_nu/src/main/java/nu/marginalia/util/array/algo/BulkTransferArray.java rename to libraries/array/src/main/java/nu/marginalia/array/algo/BulkTransferArray.java index bf0df57d..00e8f44f 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/array/algo/BulkTransferArray.java +++ b/libraries/array/src/main/java/nu/marginalia/array/algo/BulkTransferArray.java @@ -1,4 +1,4 @@ -package nu.marginalia.util.array.algo; +package nu.marginalia.array.algo; public interface BulkTransferArray { diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/array/algo/IntArrayBase.java b/libraries/array/src/main/java/nu/marginalia/array/algo/IntArrayBase.java similarity index 97% rename from marginalia_nu/src/main/java/nu/marginalia/util/array/algo/IntArrayBase.java rename to libraries/array/src/main/java/nu/marginalia/array/algo/IntArrayBase.java index 94e462b7..e487cbbb 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/array/algo/IntArrayBase.java +++ b/libraries/array/src/main/java/nu/marginalia/array/algo/IntArrayBase.java @@ -1,4 +1,4 @@ -package nu.marginalia.util.array.algo; +package nu.marginalia.array.algo; import java.io.IOException; import java.nio.IntBuffer; diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/array/algo/IntArraySearch.java b/libraries/array/src/main/java/nu/marginalia/array/algo/IntArraySearch.java similarity index 93% rename from marginalia_nu/src/main/java/nu/marginalia/util/array/algo/IntArraySearch.java rename to libraries/array/src/main/java/nu/marginalia/array/algo/IntArraySearch.java index 104c5800..e4f925af 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/array/algo/IntArraySearch.java +++ b/libraries/array/src/main/java/nu/marginalia/array/algo/IntArraySearch.java @@ -1,8 +1,6 @@ -package nu.marginalia.util.array.algo; +package nu.marginalia.array.algo; -import 
nu.marginalia.util.array.buffer.IntQueryBuffer; - -import static nu.marginalia.util.array.algo.LongArraySearch.encodeSearchMiss; +import nu.marginalia.array.buffer.IntQueryBuffer; public interface IntArraySearch extends IntArrayBase { @@ -18,7 +16,7 @@ public interface IntArraySearch extends IntArrayBase { if (val > key) break; } - return encodeSearchMiss(pos - 1); + return LongArraySearch.encodeSearchMiss(pos - 1); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/array/algo/IntArraySort.java b/libraries/array/src/main/java/nu/marginalia/array/algo/IntArraySort.java similarity index 99% rename from marginalia_nu/src/main/java/nu/marginalia/util/array/algo/IntArraySort.java rename to libraries/array/src/main/java/nu/marginalia/array/algo/IntArraySort.java index e6ba6c87..c965b229 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/array/algo/IntArraySort.java +++ b/libraries/array/src/main/java/nu/marginalia/array/algo/IntArraySort.java @@ -1,4 +1,4 @@ -package nu.marginalia.util.array.algo; +package nu.marginalia.array.algo; import java.io.IOException; import java.nio.IntBuffer; diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/array/algo/IntArrayTransformations.java b/libraries/array/src/main/java/nu/marginalia/array/algo/IntArrayTransformations.java similarity index 77% rename from marginalia_nu/src/main/java/nu/marginalia/util/array/algo/IntArrayTransformations.java rename to libraries/array/src/main/java/nu/marginalia/array/algo/IntArrayTransformations.java index c087f60e..85125d4f 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/array/algo/IntArrayTransformations.java +++ b/libraries/array/src/main/java/nu/marginalia/array/algo/IntArrayTransformations.java @@ -1,9 +1,9 @@ -package nu.marginalia.util.array.algo; +package nu.marginalia.array.algo; -import nu.marginalia.util.array.functional.IntBinaryIOOperation; -import nu.marginalia.util.array.functional.IntIOTransformer; -import 
nu.marginalia.util.array.functional.IntTransformer; -import nu.marginalia.util.array.functional.LongIntConsumer; +import nu.marginalia.array.functional.IntBinaryIOOperation; +import nu.marginalia.array.functional.IntIOTransformer; +import nu.marginalia.array.functional.IntTransformer; +import nu.marginalia.array.functional.LongIntConsumer; import java.io.IOException; diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/array/algo/LongArrayBase.java b/libraries/array/src/main/java/nu/marginalia/array/algo/LongArrayBase.java similarity index 97% rename from marginalia_nu/src/main/java/nu/marginalia/util/array/algo/LongArrayBase.java rename to libraries/array/src/main/java/nu/marginalia/array/algo/LongArrayBase.java index 216e089b..d51bf107 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/array/algo/LongArrayBase.java +++ b/libraries/array/src/main/java/nu/marginalia/array/algo/LongArrayBase.java @@ -1,4 +1,4 @@ -package nu.marginalia.util.array.algo; +package nu.marginalia.array.algo; import java.io.IOException; import java.nio.LongBuffer; diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/array/algo/LongArraySearch.java b/libraries/array/src/main/java/nu/marginalia/array/algo/LongArraySearch.java similarity index 98% rename from marginalia_nu/src/main/java/nu/marginalia/util/array/algo/LongArraySearch.java rename to libraries/array/src/main/java/nu/marginalia/array/algo/LongArraySearch.java index 2f2579b2..d9020a6e 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/array/algo/LongArraySearch.java +++ b/libraries/array/src/main/java/nu/marginalia/array/algo/LongArraySearch.java @@ -1,6 +1,6 @@ -package nu.marginalia.util.array.algo; +package nu.marginalia.array.algo; -import nu.marginalia.util.array.buffer.LongQueryBuffer; +import nu.marginalia.array.buffer.LongQueryBuffer; public interface LongArraySearch extends LongArrayBase { diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/array/algo/LongArraySort.java 
b/libraries/array/src/main/java/nu/marginalia/array/algo/LongArraySort.java similarity index 99% rename from marginalia_nu/src/main/java/nu/marginalia/util/array/algo/LongArraySort.java rename to libraries/array/src/main/java/nu/marginalia/array/algo/LongArraySort.java index 5c1fb10f..3ca5987f 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/array/algo/LongArraySort.java +++ b/libraries/array/src/main/java/nu/marginalia/array/algo/LongArraySort.java @@ -1,4 +1,4 @@ -package nu.marginalia.util.array.algo; +package nu.marginalia.array.algo; import java.io.IOException; import java.nio.LongBuffer; diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/array/algo/LongArrayTransformations.java b/libraries/array/src/main/java/nu/marginalia/array/algo/LongArrayTransformations.java similarity index 77% rename from marginalia_nu/src/main/java/nu/marginalia/util/array/algo/LongArrayTransformations.java rename to libraries/array/src/main/java/nu/marginalia/array/algo/LongArrayTransformations.java index 3ff4b82f..7d59466a 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/array/algo/LongArrayTransformations.java +++ b/libraries/array/src/main/java/nu/marginalia/array/algo/LongArrayTransformations.java @@ -1,9 +1,9 @@ -package nu.marginalia.util.array.algo; +package nu.marginalia.array.algo; -import nu.marginalia.util.array.functional.LongBinaryIOOperation; -import nu.marginalia.util.array.functional.LongIOTransformer; -import nu.marginalia.util.array.functional.LongLongConsumer; -import nu.marginalia.util.array.functional.LongTransformer; +import nu.marginalia.array.functional.LongBinaryIOOperation; +import nu.marginalia.array.functional.LongIOTransformer; +import nu.marginalia.array.functional.LongLongConsumer; +import nu.marginalia.array.functional.LongTransformer; import java.io.IOException; diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/array/algo/SortingContext.java 
b/libraries/array/src/main/java/nu/marginalia/array/algo/SortingContext.java similarity index 71% rename from marginalia_nu/src/main/java/nu/marginalia/util/array/algo/SortingContext.java rename to libraries/array/src/main/java/nu/marginalia/array/algo/SortingContext.java index 0bd436fb..aae1ed04 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/array/algo/SortingContext.java +++ b/libraries/array/src/main/java/nu/marginalia/array/algo/SortingContext.java @@ -1,4 +1,4 @@ -package nu.marginalia.util.array.algo; +package nu.marginalia.array.algo; import java.nio.file.Path; diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/array/buffer/IntQueryBuffer.java b/libraries/array/src/main/java/nu/marginalia/array/buffer/IntQueryBuffer.java similarity index 98% rename from marginalia_nu/src/main/java/nu/marginalia/util/array/buffer/IntQueryBuffer.java rename to libraries/array/src/main/java/nu/marginalia/array/buffer/IntQueryBuffer.java index 75d829cd..bd88ccc0 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/array/buffer/IntQueryBuffer.java +++ b/libraries/array/src/main/java/nu/marginalia/array/buffer/IntQueryBuffer.java @@ -1,4 +1,4 @@ -package nu.marginalia.util.array.buffer; +package nu.marginalia.array.buffer; import java.util.Arrays; diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/array/buffer/LongQueryBuffer.java b/libraries/array/src/main/java/nu/marginalia/array/buffer/LongQueryBuffer.java similarity index 98% rename from marginalia_nu/src/main/java/nu/marginalia/util/array/buffer/LongQueryBuffer.java rename to libraries/array/src/main/java/nu/marginalia/array/buffer/LongQueryBuffer.java index 6bea19ff..ed0f36fb 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/array/buffer/LongQueryBuffer.java +++ b/libraries/array/src/main/java/nu/marginalia/array/buffer/LongQueryBuffer.java @@ -1,4 +1,4 @@ -package nu.marginalia.util.array.buffer; +package nu.marginalia.array.buffer; import java.util.Arrays; diff --git 
a/marginalia_nu/src/main/java/nu/marginalia/util/array/delegate/ReferenceImplIntArrayDelegate.java b/libraries/array/src/main/java/nu/marginalia/array/delegate/ReferenceImplIntArrayDelegate.java similarity index 93% rename from marginalia_nu/src/main/java/nu/marginalia/util/array/delegate/ReferenceImplIntArrayDelegate.java rename to libraries/array/src/main/java/nu/marginalia/array/delegate/ReferenceImplIntArrayDelegate.java index bdb4eccb..ed2767e5 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/array/delegate/ReferenceImplIntArrayDelegate.java +++ b/libraries/array/src/main/java/nu/marginalia/array/delegate/ReferenceImplIntArrayDelegate.java @@ -1,7 +1,7 @@ -package nu.marginalia.util.array.delegate; +package nu.marginalia.array.delegate; import com.upserve.uppend.blobs.NativeIO; -import nu.marginalia.util.array.IntArray; +import nu.marginalia.array.IntArray; import java.io.IOException; import java.nio.channels.FileChannel; diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/array/delegate/ReferenceImplLongArrayDelegate.java b/libraries/array/src/main/java/nu/marginalia/array/delegate/ReferenceImplLongArrayDelegate.java similarity index 93% rename from marginalia_nu/src/main/java/nu/marginalia/util/array/delegate/ReferenceImplLongArrayDelegate.java rename to libraries/array/src/main/java/nu/marginalia/array/delegate/ReferenceImplLongArrayDelegate.java index f33f565c..15d8761f 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/array/delegate/ReferenceImplLongArrayDelegate.java +++ b/libraries/array/src/main/java/nu/marginalia/array/delegate/ReferenceImplLongArrayDelegate.java @@ -1,7 +1,7 @@ -package nu.marginalia.util.array.delegate; +package nu.marginalia.array.delegate; import com.upserve.uppend.blobs.NativeIO; -import nu.marginalia.util.array.LongArray; +import nu.marginalia.array.LongArray; import java.io.IOException; import java.nio.channels.FileChannel; diff --git 
a/marginalia_nu/src/main/java/nu/marginalia/util/array/delegate/ShiftedIntArray.java b/libraries/array/src/main/java/nu/marginalia/array/delegate/ShiftedIntArray.java similarity index 93% rename from marginalia_nu/src/main/java/nu/marginalia/util/array/delegate/ShiftedIntArray.java rename to libraries/array/src/main/java/nu/marginalia/array/delegate/ShiftedIntArray.java index a920c99a..7e1974c1 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/array/delegate/ShiftedIntArray.java +++ b/libraries/array/src/main/java/nu/marginalia/array/delegate/ShiftedIntArray.java @@ -1,13 +1,13 @@ -package nu.marginalia.util.array.delegate; +package nu.marginalia.array.delegate; import com.upserve.uppend.blobs.NativeIO; -import nu.marginalia.util.array.IntArray; -import nu.marginalia.util.array.algo.SortingContext; -import nu.marginalia.util.array.buffer.IntQueryBuffer; -import nu.marginalia.util.array.functional.IntBinaryIOOperation; -import nu.marginalia.util.array.functional.IntIOTransformer; -import nu.marginalia.util.array.functional.IntTransformer; -import nu.marginalia.util.array.functional.LongIntConsumer; +import nu.marginalia.array.IntArray; +import nu.marginalia.array.algo.SortingContext; +import nu.marginalia.array.buffer.IntQueryBuffer; +import nu.marginalia.array.functional.IntBinaryIOOperation; +import nu.marginalia.array.functional.IntIOTransformer; +import nu.marginalia.array.functional.IntTransformer; +import nu.marginalia.array.functional.LongIntConsumer; import java.io.IOException; import java.nio.IntBuffer; diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/array/delegate/ShiftedLongArray.java b/libraries/array/src/main/java/nu/marginalia/array/delegate/ShiftedLongArray.java similarity index 94% rename from marginalia_nu/src/main/java/nu/marginalia/util/array/delegate/ShiftedLongArray.java rename to libraries/array/src/main/java/nu/marginalia/array/delegate/ShiftedLongArray.java index 53a4f89b..eb681f26 100644 --- 
a/marginalia_nu/src/main/java/nu/marginalia/util/array/delegate/ShiftedLongArray.java +++ b/libraries/array/src/main/java/nu/marginalia/array/delegate/ShiftedLongArray.java @@ -1,14 +1,14 @@ -package nu.marginalia.util.array.delegate; +package nu.marginalia.array.delegate; import com.upserve.uppend.blobs.NativeIO; -import nu.marginalia.util.array.LongArray; -import nu.marginalia.util.array.algo.LongArraySearch; -import nu.marginalia.util.array.algo.SortingContext; -import nu.marginalia.util.array.buffer.LongQueryBuffer; -import nu.marginalia.util.array.functional.LongBinaryIOOperation; -import nu.marginalia.util.array.functional.LongIOTransformer; -import nu.marginalia.util.array.functional.LongLongConsumer; -import nu.marginalia.util.array.functional.LongTransformer; +import nu.marginalia.array.LongArray; +import nu.marginalia.array.algo.LongArraySearch; +import nu.marginalia.array.algo.SortingContext; +import nu.marginalia.array.buffer.LongQueryBuffer; +import nu.marginalia.array.functional.LongBinaryIOOperation; +import nu.marginalia.array.functional.LongIOTransformer; +import nu.marginalia.array.functional.LongLongConsumer; +import nu.marginalia.array.functional.LongTransformer; import java.io.IOException; import java.nio.LongBuffer; diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/array/functional/AddressRangeCall.java b/libraries/array/src/main/java/nu/marginalia/array/functional/AddressRangeCall.java similarity index 65% rename from marginalia_nu/src/main/java/nu/marginalia/util/array/functional/AddressRangeCall.java rename to libraries/array/src/main/java/nu/marginalia/array/functional/AddressRangeCall.java index 5f96462d..90ed5ad8 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/array/functional/AddressRangeCall.java +++ b/libraries/array/src/main/java/nu/marginalia/array/functional/AddressRangeCall.java @@ -1,4 +1,4 @@ -package nu.marginalia.util.array.functional; +package nu.marginalia.array.functional; public interface 
AddressRangeCall { void apply(T array, int start, int end); diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/array/functional/AddressRangeCallIO.java b/libraries/array/src/main/java/nu/marginalia/array/functional/AddressRangeCallIO.java similarity index 75% rename from marginalia_nu/src/main/java/nu/marginalia/util/array/functional/AddressRangeCallIO.java rename to libraries/array/src/main/java/nu/marginalia/array/functional/AddressRangeCallIO.java index a7fa2867..a52c1941 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/array/functional/AddressRangeCallIO.java +++ b/libraries/array/src/main/java/nu/marginalia/array/functional/AddressRangeCallIO.java @@ -1,4 +1,4 @@ -package nu.marginalia.util.array.functional; +package nu.marginalia.array.functional; import java.io.IOException; diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/array/functional/AddressRangeIntFunction.java b/libraries/array/src/main/java/nu/marginalia/array/functional/AddressRangeIntFunction.java similarity index 67% rename from marginalia_nu/src/main/java/nu/marginalia/util/array/functional/AddressRangeIntFunction.java rename to libraries/array/src/main/java/nu/marginalia/array/functional/AddressRangeIntFunction.java index 93b3b58f..30113292 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/array/functional/AddressRangeIntFunction.java +++ b/libraries/array/src/main/java/nu/marginalia/array/functional/AddressRangeIntFunction.java @@ -1,4 +1,4 @@ -package nu.marginalia.util.array.functional; +package nu.marginalia.array.functional; public interface AddressRangeIntFunction { int apply(T array, int start, int end); diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/array/functional/AddressRangeLongFunction.java b/libraries/array/src/main/java/nu/marginalia/array/functional/AddressRangeLongFunction.java similarity index 67% rename from marginalia_nu/src/main/java/nu/marginalia/util/array/functional/AddressRangeLongFunction.java rename to 
libraries/array/src/main/java/nu/marginalia/array/functional/AddressRangeLongFunction.java index ef214419..d97564be 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/array/functional/AddressRangeLongFunction.java +++ b/libraries/array/src/main/java/nu/marginalia/array/functional/AddressRangeLongFunction.java @@ -1,4 +1,4 @@ -package nu.marginalia.util.array.functional; +package nu.marginalia.array.functional; public interface AddressRangeLongFunction { long apply(T array, int start, int end); diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/array/functional/IntBinaryIOOperation.java b/libraries/array/src/main/java/nu/marginalia/array/functional/IntBinaryIOOperation.java similarity index 73% rename from marginalia_nu/src/main/java/nu/marginalia/util/array/functional/IntBinaryIOOperation.java rename to libraries/array/src/main/java/nu/marginalia/array/functional/IntBinaryIOOperation.java index 6761f633..5e3082ed 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/array/functional/IntBinaryIOOperation.java +++ b/libraries/array/src/main/java/nu/marginalia/array/functional/IntBinaryIOOperation.java @@ -1,4 +1,4 @@ -package nu.marginalia.util.array.functional; +package nu.marginalia.array.functional; import java.io.IOException; diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/array/functional/IntIOTransformer.java b/libraries/array/src/main/java/nu/marginalia/array/functional/IntIOTransformer.java similarity index 73% rename from marginalia_nu/src/main/java/nu/marginalia/util/array/functional/IntIOTransformer.java rename to libraries/array/src/main/java/nu/marginalia/array/functional/IntIOTransformer.java index 96f84477..02675e27 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/array/functional/IntIOTransformer.java +++ b/libraries/array/src/main/java/nu/marginalia/array/functional/IntIOTransformer.java @@ -1,4 +1,4 @@ -package nu.marginalia.util.array.functional; +package nu.marginalia.array.functional; import 
java.io.IOException; diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/array/functional/IntTransformer.java b/libraries/array/src/main/java/nu/marginalia/array/functional/IntTransformer.java similarity index 62% rename from marginalia_nu/src/main/java/nu/marginalia/util/array/functional/IntTransformer.java rename to libraries/array/src/main/java/nu/marginalia/array/functional/IntTransformer.java index c1ba44e6..8001299c 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/array/functional/IntTransformer.java +++ b/libraries/array/src/main/java/nu/marginalia/array/functional/IntTransformer.java @@ -1,4 +1,4 @@ -package nu.marginalia.util.array.functional; +package nu.marginalia.array.functional; public interface IntTransformer { int transform(long pos, int old); diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/array/functional/LongBinaryIOOperation.java b/libraries/array/src/main/java/nu/marginalia/array/functional/LongBinaryIOOperation.java similarity index 74% rename from marginalia_nu/src/main/java/nu/marginalia/util/array/functional/LongBinaryIOOperation.java rename to libraries/array/src/main/java/nu/marginalia/array/functional/LongBinaryIOOperation.java index c097c016..8141d624 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/array/functional/LongBinaryIOOperation.java +++ b/libraries/array/src/main/java/nu/marginalia/array/functional/LongBinaryIOOperation.java @@ -1,4 +1,4 @@ -package nu.marginalia.util.array.functional; +package nu.marginalia.array.functional; import java.io.IOException; diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/array/functional/LongIOTransformer.java b/libraries/array/src/main/java/nu/marginalia/array/functional/LongIOTransformer.java similarity index 73% rename from marginalia_nu/src/main/java/nu/marginalia/util/array/functional/LongIOTransformer.java rename to libraries/array/src/main/java/nu/marginalia/array/functional/LongIOTransformer.java index 997bcfd8..62fe490a 100644 --- 
a/marginalia_nu/src/main/java/nu/marginalia/util/array/functional/LongIOTransformer.java +++ b/libraries/array/src/main/java/nu/marginalia/array/functional/LongIOTransformer.java @@ -1,4 +1,4 @@ -package nu.marginalia.util.array.functional; +package nu.marginalia.array.functional; import java.io.IOException; diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/array/functional/LongIntConsumer.java b/libraries/array/src/main/java/nu/marginalia/array/functional/LongIntConsumer.java similarity index 62% rename from marginalia_nu/src/main/java/nu/marginalia/util/array/functional/LongIntConsumer.java rename to libraries/array/src/main/java/nu/marginalia/array/functional/LongIntConsumer.java index 781ebe9e..a8a4ab2b 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/array/functional/LongIntConsumer.java +++ b/libraries/array/src/main/java/nu/marginalia/array/functional/LongIntConsumer.java @@ -1,4 +1,4 @@ -package nu.marginalia.util.array.functional; +package nu.marginalia.array.functional; public interface LongIntConsumer { void accept(long pos, int val); diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/array/functional/LongLongConsumer.java b/libraries/array/src/main/java/nu/marginalia/array/functional/LongLongConsumer.java similarity index 62% rename from marginalia_nu/src/main/java/nu/marginalia/util/array/functional/LongLongConsumer.java rename to libraries/array/src/main/java/nu/marginalia/array/functional/LongLongConsumer.java index 6390d59e..4707673d 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/array/functional/LongLongConsumer.java +++ b/libraries/array/src/main/java/nu/marginalia/array/functional/LongLongConsumer.java @@ -1,4 +1,4 @@ -package nu.marginalia.util.array.functional; +package nu.marginalia.array.functional; public interface LongLongConsumer { void accept(long pos, long val); diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/array/functional/LongTransformer.java 
b/libraries/array/src/main/java/nu/marginalia/array/functional/LongTransformer.java similarity index 63% rename from marginalia_nu/src/main/java/nu/marginalia/util/array/functional/LongTransformer.java rename to libraries/array/src/main/java/nu/marginalia/array/functional/LongTransformer.java index 4f998646..8ab0ae7e 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/array/functional/LongTransformer.java +++ b/libraries/array/src/main/java/nu/marginalia/array/functional/LongTransformer.java @@ -1,4 +1,4 @@ -package nu.marginalia.util.array.functional; +package nu.marginalia.array.functional; public interface LongTransformer { long transform(long pos, long old); diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/array/functor/IntIOFolder.java b/libraries/array/src/main/java/nu/marginalia/array/functor/IntIOFolder.java similarity index 67% rename from marginalia_nu/src/main/java/nu/marginalia/util/array/functor/IntIOFolder.java rename to libraries/array/src/main/java/nu/marginalia/array/functor/IntIOFolder.java index b3ee83ce..ee749020 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/array/functor/IntIOFolder.java +++ b/libraries/array/src/main/java/nu/marginalia/array/functor/IntIOFolder.java @@ -1,8 +1,8 @@ -package nu.marginalia.util.array.functor; +package nu.marginalia.array.functor; -import nu.marginalia.util.array.functional.AddressRangeCallIO; -import nu.marginalia.util.array.functional.IntBinaryIOOperation; -import nu.marginalia.util.array.page.IntArrayPage; +import nu.marginalia.array.functional.AddressRangeCallIO; +import nu.marginalia.array.functional.IntBinaryIOOperation; +import nu.marginalia.array.page.IntArrayPage; import java.io.IOException; diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/array/functor/LongIOFolder.java b/libraries/array/src/main/java/nu/marginalia/array/functor/LongIOFolder.java similarity index 67% rename from marginalia_nu/src/main/java/nu/marginalia/util/array/functor/LongIOFolder.java 
rename to libraries/array/src/main/java/nu/marginalia/array/functor/LongIOFolder.java index ce9e796f..2f65d54e 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/array/functor/LongIOFolder.java +++ b/libraries/array/src/main/java/nu/marginalia/array/functor/LongIOFolder.java @@ -1,8 +1,8 @@ -package nu.marginalia.util.array.functor; +package nu.marginalia.array.functor; -import nu.marginalia.util.array.functional.AddressRangeCallIO; -import nu.marginalia.util.array.functional.LongBinaryIOOperation; -import nu.marginalia.util.array.page.LongArrayPage; +import nu.marginalia.array.functional.AddressRangeCallIO; +import nu.marginalia.array.functional.LongBinaryIOOperation; +import nu.marginalia.array.page.LongArrayPage; import java.io.IOException; diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/array/page/AbstractPagingArray.java b/libraries/array/src/main/java/nu/marginalia/array/page/AbstractPagingArray.java similarity index 87% rename from marginalia_nu/src/main/java/nu/marginalia/util/array/page/AbstractPagingArray.java rename to libraries/array/src/main/java/nu/marginalia/array/page/AbstractPagingArray.java index c772d43e..27b35dc5 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/array/page/AbstractPagingArray.java +++ b/libraries/array/src/main/java/nu/marginalia/array/page/AbstractPagingArray.java @@ -1,14 +1,14 @@ -package nu.marginalia.util.array.page; +package nu.marginalia.array.page; -import nu.marginalia.util.array.algo.BulkTransferArray; -import nu.marginalia.util.array.functional.AddressRangeCall; -import nu.marginalia.util.array.functional.AddressRangeCallIO; -import nu.marginalia.util.array.scheme.ArrayPartitioningScheme; +import nu.marginalia.array.algo.BulkTransferArray; +import nu.marginalia.array.functional.AddressRangeCall; +import nu.marginalia.array.functional.AddressRangeCallIO; +import nu.marginalia.array.scheme.ArrayPartitioningScheme; import java.io.IOException; -import static 
nu.marginalia.util.array.algo.LongArraySearch.decodeSearchMiss; -import static nu.marginalia.util.array.algo.LongArraySearch.encodeSearchMiss; +import static nu.marginalia.array.algo.LongArraySearch.decodeSearchMiss; +import static nu.marginalia.array.algo.LongArraySearch.encodeSearchMiss; public class AbstractPagingArray, B> { final T[] pages; diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/array/page/IntArrayPage.java b/libraries/array/src/main/java/nu/marginalia/array/page/IntArrayPage.java similarity index 97% rename from marginalia_nu/src/main/java/nu/marginalia/util/array/page/IntArrayPage.java rename to libraries/array/src/main/java/nu/marginalia/array/page/IntArrayPage.java index b2270c8c..acb29259 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/array/page/IntArrayPage.java +++ b/libraries/array/src/main/java/nu/marginalia/array/page/IntArrayPage.java @@ -1,7 +1,7 @@ -package nu.marginalia.util.array.page; +package nu.marginalia.array.page; import com.upserve.uppend.blobs.NativeIO; -import nu.marginalia.util.array.IntArray; +import nu.marginalia.array.IntArray; import java.io.IOException; import java.nio.ByteBuffer; diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/array/page/LongArrayPage.java b/libraries/array/src/main/java/nu/marginalia/array/page/LongArrayPage.java similarity index 96% rename from marginalia_nu/src/main/java/nu/marginalia/util/array/page/LongArrayPage.java rename to libraries/array/src/main/java/nu/marginalia/array/page/LongArrayPage.java index ed9e3c96..b27df533 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/array/page/LongArrayPage.java +++ b/libraries/array/src/main/java/nu/marginalia/array/page/LongArrayPage.java @@ -1,8 +1,8 @@ -package nu.marginalia.util.array.page; +package nu.marginalia.array.page; import com.upserve.uppend.blobs.NativeIO; -import nu.marginalia.util.array.LongArray; -import nu.marginalia.util.array.trace.ArrayTrace; +import nu.marginalia.array.LongArray; +import 
nu.marginalia.array.trace.ArrayTrace; import java.io.IOException; import java.nio.ByteBuffer; diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/array/page/PagingIntArray.java b/libraries/array/src/main/java/nu/marginalia/array/page/PagingIntArray.java similarity index 95% rename from marginalia_nu/src/main/java/nu/marginalia/util/array/page/PagingIntArray.java rename to libraries/array/src/main/java/nu/marginalia/array/page/PagingIntArray.java index 6b44fecb..fff17c46 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/array/page/PagingIntArray.java +++ b/libraries/array/src/main/java/nu/marginalia/array/page/PagingIntArray.java @@ -1,16 +1,16 @@ -package nu.marginalia.util.array.page; +package nu.marginalia.array.page; import com.upserve.uppend.blobs.NativeIO; -import nu.marginalia.util.array.IntArray; -import nu.marginalia.util.array.algo.SortingContext; -import nu.marginalia.util.array.buffer.IntQueryBuffer; -import nu.marginalia.util.array.delegate.ReferenceImplIntArrayDelegate; -import nu.marginalia.util.array.functional.IntBinaryIOOperation; -import nu.marginalia.util.array.functional.IntIOTransformer; -import nu.marginalia.util.array.functional.IntTransformer; -import nu.marginalia.util.array.functional.LongIntConsumer; -import nu.marginalia.util.array.functor.IntIOFolder; -import nu.marginalia.util.array.scheme.ArrayPartitioningScheme; +import nu.marginalia.array.IntArray; +import nu.marginalia.array.algo.SortingContext; +import nu.marginalia.array.buffer.IntQueryBuffer; +import nu.marginalia.array.delegate.ReferenceImplIntArrayDelegate; +import nu.marginalia.array.functional.IntBinaryIOOperation; +import nu.marginalia.array.functional.IntIOTransformer; +import nu.marginalia.array.functional.IntTransformer; +import nu.marginalia.array.functional.LongIntConsumer; +import nu.marginalia.array.functor.IntIOFolder; +import nu.marginalia.array.scheme.ArrayPartitioningScheme; import java.io.IOException; import java.nio.IntBuffer; diff --git 
a/marginalia_nu/src/main/java/nu/marginalia/util/array/page/PagingLongArray.java b/libraries/array/src/main/java/nu/marginalia/array/page/PagingLongArray.java similarity index 96% rename from marginalia_nu/src/main/java/nu/marginalia/util/array/page/PagingLongArray.java rename to libraries/array/src/main/java/nu/marginalia/array/page/PagingLongArray.java index 597979ef..7aa3b1d3 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/array/page/PagingLongArray.java +++ b/libraries/array/src/main/java/nu/marginalia/array/page/PagingLongArray.java @@ -1,16 +1,16 @@ -package nu.marginalia.util.array.page; +package nu.marginalia.array.page; import com.upserve.uppend.blobs.NativeIO; -import nu.marginalia.util.array.LongArray; -import nu.marginalia.util.array.algo.SortingContext; -import nu.marginalia.util.array.buffer.LongQueryBuffer; -import nu.marginalia.util.array.delegate.ReferenceImplLongArrayDelegate; -import nu.marginalia.util.array.functional.LongBinaryIOOperation; -import nu.marginalia.util.array.functional.LongIOTransformer; -import nu.marginalia.util.array.functional.LongLongConsumer; -import nu.marginalia.util.array.functional.LongTransformer; -import nu.marginalia.util.array.functor.LongIOFolder; -import nu.marginalia.util.array.scheme.ArrayPartitioningScheme; +import nu.marginalia.array.LongArray; +import nu.marginalia.array.algo.SortingContext; +import nu.marginalia.array.buffer.LongQueryBuffer; +import nu.marginalia.array.delegate.ReferenceImplLongArrayDelegate; +import nu.marginalia.array.functional.LongBinaryIOOperation; +import nu.marginalia.array.functional.LongIOTransformer; +import nu.marginalia.array.functional.LongLongConsumer; +import nu.marginalia.array.functional.LongTransformer; +import nu.marginalia.array.functor.LongIOFolder; +import nu.marginalia.array.scheme.ArrayPartitioningScheme; import java.io.IOException; import java.nio.LongBuffer; diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/array/page/PartitionPage.java 
b/libraries/array/src/main/java/nu/marginalia/array/page/PartitionPage.java similarity index 91% rename from marginalia_nu/src/main/java/nu/marginalia/util/array/page/PartitionPage.java rename to libraries/array/src/main/java/nu/marginalia/array/page/PartitionPage.java index c324157c..56afd660 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/array/page/PartitionPage.java +++ b/libraries/array/src/main/java/nu/marginalia/array/page/PartitionPage.java @@ -1,4 +1,4 @@ -package nu.marginalia.util.array.page; +package nu.marginalia.array.page; import java.io.IOException; import java.nio.ByteBuffer; diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/array/scheme/ArrayPartitioningScheme.java b/libraries/array/src/main/java/nu/marginalia/array/scheme/ArrayPartitioningScheme.java similarity index 97% rename from marginalia_nu/src/main/java/nu/marginalia/util/array/scheme/ArrayPartitioningScheme.java rename to libraries/array/src/main/java/nu/marginalia/array/scheme/ArrayPartitioningScheme.java index a8063a17..e705d0a3 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/array/scheme/ArrayPartitioningScheme.java +++ b/libraries/array/src/main/java/nu/marginalia/array/scheme/ArrayPartitioningScheme.java @@ -1,4 +1,4 @@ -package nu.marginalia.util.array.scheme; +package nu.marginalia.array.scheme; public interface ArrayPartitioningScheme { diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/array/scheme/PowerOf2PartitioningScheme.java b/libraries/array/src/main/java/nu/marginalia/array/scheme/PowerOf2PartitioningScheme.java similarity index 97% rename from marginalia_nu/src/main/java/nu/marginalia/util/array/scheme/PowerOf2PartitioningScheme.java rename to libraries/array/src/main/java/nu/marginalia/array/scheme/PowerOf2PartitioningScheme.java index 20bb453e..98dfac8f 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/array/scheme/PowerOf2PartitioningScheme.java +++ 
b/libraries/array/src/main/java/nu/marginalia/array/scheme/PowerOf2PartitioningScheme.java @@ -1,4 +1,4 @@ -package nu.marginalia.util.array.scheme; +package nu.marginalia.array.scheme; public class PowerOf2PartitioningScheme implements ArrayPartitioningScheme { final int partitionSize; diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/array/scheme/SequentialPartitioningScheme.java b/libraries/array/src/main/java/nu/marginalia/array/scheme/SequentialPartitioningScheme.java similarity index 97% rename from marginalia_nu/src/main/java/nu/marginalia/util/array/scheme/SequentialPartitioningScheme.java rename to libraries/array/src/main/java/nu/marginalia/array/scheme/SequentialPartitioningScheme.java index 19af52d1..7bd3f0a7 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/array/scheme/SequentialPartitioningScheme.java +++ b/libraries/array/src/main/java/nu/marginalia/array/scheme/SequentialPartitioningScheme.java @@ -1,4 +1,4 @@ -package nu.marginalia.util.array.scheme; +package nu.marginalia.array.scheme; public class SequentialPartitioningScheme implements ArrayPartitioningScheme { diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/array/trace/ArrayTrace.java b/libraries/array/src/main/java/nu/marginalia/array/trace/ArrayTrace.java similarity index 86% rename from marginalia_nu/src/main/java/nu/marginalia/util/array/trace/ArrayTrace.java rename to libraries/array/src/main/java/nu/marginalia/array/trace/ArrayTrace.java index 38e08ede..bd3f47b7 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/array/trace/ArrayTrace.java +++ b/libraries/array/src/main/java/nu/marginalia/array/trace/ArrayTrace.java @@ -1,6 +1,6 @@ -package nu.marginalia.util.array.trace; +package nu.marginalia.array.trace; -import nu.marginalia.util.array.LongArray; +import nu.marginalia.array.LongArray; import java.nio.file.Path; import java.util.Optional; diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/array/trace/ArrayTraceViz.java 
b/libraries/array/src/main/java/nu/marginalia/array/trace/ArrayTraceViz.java similarity index 99% rename from marginalia_nu/src/main/java/nu/marginalia/util/array/trace/ArrayTraceViz.java rename to libraries/array/src/main/java/nu/marginalia/array/trace/ArrayTraceViz.java index babf727f..87527ce1 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/array/trace/ArrayTraceViz.java +++ b/libraries/array/src/main/java/nu/marginalia/array/trace/ArrayTraceViz.java @@ -1,4 +1,4 @@ -package nu.marginalia.util.array.trace; +package nu.marginalia.array.trace; import javax.imageio.ImageIO; import java.awt.image.BufferedImage; diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/array/trace/FileTrace.java b/libraries/array/src/main/java/nu/marginalia/array/trace/FileTrace.java similarity index 94% rename from marginalia_nu/src/main/java/nu/marginalia/util/array/trace/FileTrace.java rename to libraries/array/src/main/java/nu/marginalia/array/trace/FileTrace.java index b1fe9c57..3779ea1a 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/array/trace/FileTrace.java +++ b/libraries/array/src/main/java/nu/marginalia/array/trace/FileTrace.java @@ -1,6 +1,6 @@ -package nu.marginalia.util.array.trace; +package nu.marginalia.array.trace; -import nu.marginalia.util.array.LongArray; +import nu.marginalia.array.LongArray; import java.io.IOException; import java.io.PrintStream; diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/array/trace/NullTrace.java b/libraries/array/src/main/java/nu/marginalia/array/trace/NullTrace.java similarity index 80% rename from marginalia_nu/src/main/java/nu/marginalia/util/array/trace/NullTrace.java rename to libraries/array/src/main/java/nu/marginalia/array/trace/NullTrace.java index 20e2125f..52bca8c9 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/array/trace/NullTrace.java +++ b/libraries/array/src/main/java/nu/marginalia/array/trace/NullTrace.java @@ -1,4 +1,4 @@ -package nu.marginalia.util.array.trace; 
+package nu.marginalia.array.trace; public class NullTrace implements ArrayTrace { diff --git a/marginalia_nu/src/test/java/nu/marginalia/util/array/IntLowBitPartitioningSchemeTest.java b/libraries/array/src/test/java/nu/marginalia/array/IntLowBitPartitioningSchemeTest.java similarity index 83% rename from marginalia_nu/src/test/java/nu/marginalia/util/array/IntLowBitPartitioningSchemeTest.java rename to libraries/array/src/test/java/nu/marginalia/array/IntLowBitPartitioningSchemeTest.java index 25c42338..cc5b8cd3 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/util/array/IntLowBitPartitioningSchemeTest.java +++ b/libraries/array/src/test/java/nu/marginalia/array/IntLowBitPartitioningSchemeTest.java @@ -1,6 +1,6 @@ -package nu.marginalia.util.array; +package nu.marginalia.array; -import nu.marginalia.util.array.scheme.SequentialPartitioningScheme; +import nu.marginalia.array.scheme.SequentialPartitioningScheme; import org.junit.jupiter.api.Test; class IntLowBitPartitioningSchemeTest { diff --git a/marginalia_nu/src/test/java/nu/marginalia/util/array/PagingIntArrayTest.java b/libraries/array/src/test/java/nu/marginalia/array/PagingIntArrayTest.java similarity index 93% rename from marginalia_nu/src/test/java/nu/marginalia/util/array/PagingIntArrayTest.java rename to libraries/array/src/test/java/nu/marginalia/array/PagingIntArrayTest.java index e6acfac8..6b86d8e5 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/util/array/PagingIntArrayTest.java +++ b/libraries/array/src/test/java/nu/marginalia/array/PagingIntArrayTest.java @@ -1,8 +1,8 @@ -package nu.marginalia.util.array; +package nu.marginalia.array; -import nu.marginalia.util.array.page.PagingIntArray; -import nu.marginalia.util.array.page.PagingLongArray; -import nu.marginalia.util.array.scheme.SequentialPartitioningScheme; +import nu.marginalia.array.page.PagingIntArray; +import nu.marginalia.array.page.PagingLongArray; +import nu.marginalia.array.scheme.SequentialPartitioningScheme; import 
nu.marginalia.util.test.TestUtil; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; diff --git a/marginalia_nu/src/test/java/nu/marginalia/util/array/algo/IntArraySearchTest.java b/libraries/array/src/test/java/nu/marginalia/array/algo/IntArraySearchTest.java similarity index 93% rename from marginalia_nu/src/test/java/nu/marginalia/util/array/algo/IntArraySearchTest.java rename to libraries/array/src/test/java/nu/marginalia/array/algo/IntArraySearchTest.java index c000c16f..48eea698 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/util/array/algo/IntArraySearchTest.java +++ b/libraries/array/src/test/java/nu/marginalia/array/algo/IntArraySearchTest.java @@ -1,9 +1,9 @@ -package nu.marginalia.util.array.algo; +package nu.marginalia.array.algo; -import nu.marginalia.util.array.IntArray; -import nu.marginalia.util.array.buffer.IntQueryBuffer; -import nu.marginalia.util.array.page.PagingIntArray; -import nu.marginalia.util.array.scheme.PowerOf2PartitioningScheme; +import nu.marginalia.array.IntArray; +import nu.marginalia.array.buffer.IntQueryBuffer; +import nu.marginalia.array.page.PagingIntArray; +import nu.marginalia.array.scheme.PowerOf2PartitioningScheme; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; diff --git a/marginalia_nu/src/test/java/nu/marginalia/util/array/algo/IntArraySortTest.java b/libraries/array/src/test/java/nu/marginalia/array/algo/IntArraySortTest.java similarity index 84% rename from marginalia_nu/src/test/java/nu/marginalia/util/array/algo/IntArraySortTest.java rename to libraries/array/src/test/java/nu/marginalia/array/algo/IntArraySortTest.java index 6bcda5b2..67a1ab6b 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/util/array/algo/IntArraySortTest.java +++ b/libraries/array/src/test/java/nu/marginalia/array/algo/IntArraySortTest.java @@ -1,9 +1,9 @@ -package nu.marginalia.util.array.algo; +package nu.marginalia.array.algo; -import nu.marginalia.util.array.IntArray; 
-import nu.marginalia.util.array.page.IntArrayPage; -import nu.marginalia.util.array.page.PagingIntArray; -import nu.marginalia.util.array.scheme.PowerOf2PartitioningScheme; +import nu.marginalia.array.IntArray; +import nu.marginalia.array.page.IntArrayPage; +import nu.marginalia.array.page.PagingIntArray; +import nu.marginalia.array.scheme.PowerOf2PartitioningScheme; import nu.marginalia.util.test.TestUtil; import org.apache.commons.lang3.ArrayUtils; import org.junit.jupiter.api.Assertions; @@ -76,6 +76,7 @@ class IntArraySortTest { for (int i = 0; i < values.length; i++) { values[i] = i; } + ArrayUtils.shuffle(values); int sentinelA = 0xFEEDBEEF; @@ -117,36 +118,36 @@ class IntArraySortTest { @Test void insertionSort() { basic.insertionSort(0, size); - Assertions.assertTrue(basic.isSorted(0, 128)); + assertTrue(basic.isSorted(0, 128)); paged.insertionSort(0, size); - Assertions.assertTrue(paged.isSorted(0, 128)); + assertTrue(paged.isSorted(0, 128)); shifted.insertionSort(0, size); - Assertions.assertTrue(shifted.isSorted(0, 128)); + assertTrue(shifted.isSorted(0, 128)); } @Test void quickSort() { basic.quickSort(0, size); - Assertions.assertTrue(basic.isSorted(0, size)); + assertTrue(basic.isSorted(0, size)); paged.quickSort(0, size); - Assertions.assertTrue(paged.isSorted(0, size)); + assertTrue(paged.isSorted(0, size)); shifted.quickSort(0, size); - Assertions.assertTrue(shifted.isSorted(0, 128)); + assertTrue(shifted.isSorted(0, 128)); } @Test void mergeSort() throws IOException { basic.mergeSort(0, size, Path.of("/tmp")); - Assertions.assertTrue(basic.isSorted(0, size)); + assertTrue(basic.isSorted(0, size)); paged.mergeSort(0, size, Path.of("/tmp")); - Assertions.assertTrue(paged.isSorted(0, size)); + assertTrue(paged.isSorted(0, size)); shifted.mergeSort(0, size, Path.of("/tmp")); - Assertions.assertTrue(shifted.isSorted(0, 128)); + assertTrue(shifted.isSorted(0, 128)); } } \ No newline at end of file diff --git 
a/marginalia_nu/src/test/java/nu/marginalia/util/array/algo/IntArrayTransformationsTest.java b/libraries/array/src/test/java/nu/marginalia/array/algo/IntArrayTransformationsTest.java similarity index 89% rename from marginalia_nu/src/test/java/nu/marginalia/util/array/algo/IntArrayTransformationsTest.java rename to libraries/array/src/test/java/nu/marginalia/array/algo/IntArrayTransformationsTest.java index 8b9b9773..bd821d69 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/util/array/algo/IntArrayTransformationsTest.java +++ b/libraries/array/src/test/java/nu/marginalia/array/algo/IntArrayTransformationsTest.java @@ -1,9 +1,9 @@ -package nu.marginalia.util.array.algo; +package nu.marginalia.array.algo; -import nu.marginalia.util.array.IntArray; -import nu.marginalia.util.array.page.IntArrayPage; -import nu.marginalia.util.array.page.PagingIntArray; -import nu.marginalia.util.array.scheme.PowerOf2PartitioningScheme; +import nu.marginalia.array.IntArray; +import nu.marginalia.array.page.IntArrayPage; +import nu.marginalia.array.page.PagingIntArray; +import nu.marginalia.array.scheme.PowerOf2PartitioningScheme; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; diff --git a/marginalia_nu/src/test/java/nu/marginalia/util/array/algo/LongArraySearchTest.java b/libraries/array/src/test/java/nu/marginalia/array/algo/LongArraySearchTest.java similarity index 94% rename from marginalia_nu/src/test/java/nu/marginalia/util/array/algo/LongArraySearchTest.java rename to libraries/array/src/test/java/nu/marginalia/array/algo/LongArraySearchTest.java index f3cdae16..512355a3 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/util/array/algo/LongArraySearchTest.java +++ b/libraries/array/src/test/java/nu/marginalia/array/algo/LongArraySearchTest.java @@ -1,9 +1,9 @@ -package nu.marginalia.util.array.algo; +package nu.marginalia.array.algo; -import nu.marginalia.util.array.LongArray; -import nu.marginalia.util.array.buffer.LongQueryBuffer; -import 
nu.marginalia.util.array.page.PagingLongArray; -import nu.marginalia.util.array.scheme.PowerOf2PartitioningScheme; +import nu.marginalia.array.LongArray; +import nu.marginalia.array.buffer.LongQueryBuffer; +import nu.marginalia.array.page.PagingLongArray; +import nu.marginalia.array.scheme.PowerOf2PartitioningScheme; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; diff --git a/marginalia_nu/src/test/java/nu/marginalia/util/array/algo/LongArraySortTest.java b/libraries/array/src/test/java/nu/marginalia/array/algo/LongArraySortTest.java similarity index 78% rename from marginalia_nu/src/test/java/nu/marginalia/util/array/algo/LongArraySortTest.java rename to libraries/array/src/test/java/nu/marginalia/array/algo/LongArraySortTest.java index e417020f..c81c1f4c 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/util/array/algo/LongArraySortTest.java +++ b/libraries/array/src/test/java/nu/marginalia/array/algo/LongArraySortTest.java @@ -1,9 +1,9 @@ -package nu.marginalia.util.array.algo; +package nu.marginalia.array.algo; -import nu.marginalia.util.array.LongArray; -import nu.marginalia.util.array.page.LongArrayPage; -import nu.marginalia.util.array.page.PagingLongArray; -import nu.marginalia.util.array.scheme.PowerOf2PartitioningScheme; +import nu.marginalia.array.LongArray; +import nu.marginalia.array.page.LongArrayPage; +import nu.marginalia.array.page.PagingLongArray; +import nu.marginalia.array.scheme.PowerOf2PartitioningScheme; import nu.marginalia.util.test.TestUtil; import org.apache.commons.lang3.ArrayUtils; import org.junit.jupiter.api.Assertions; @@ -116,72 +116,72 @@ class LongArraySortTest { @Test void insertionSort() { basic.insertionSort(0, size); - Assertions.assertTrue(basic.isSorted(0, 128)); + assertTrue(basic.isSorted(0, 128)); paged.insertionSort(0, size); - Assertions.assertTrue(paged.isSorted(0, 128)); + assertTrue(paged.isSorted(0, 128)); shifted.insertionSort(0, size); - Assertions.assertTrue(shifted.isSorted(0, 
128)); + assertTrue(shifted.isSorted(0, 128)); } @Test void insertionSortN() { basic.insertionSortN(2, 0, size); - Assertions.assertTrue(basic.isSortedN(2, 0, size)); + assertTrue(basic.isSortedN(2, 0, size)); paged.insertionSortN(2, 0, size); - Assertions.assertTrue(paged.isSortedN(2, 0, size)); + assertTrue(paged.isSortedN(2, 0, size)); shifted.insertionSortN(2, 0, size); - Assertions.assertTrue(shifted.isSortedN(2, 0, size)); + assertTrue(shifted.isSortedN(2, 0, size)); } @Test void quickSort() { basic.quickSort(0, size); - Assertions.assertTrue(basic.isSorted(0, size)); + assertTrue(basic.isSorted(0, size)); paged.quickSort(0, size); - Assertions.assertTrue(paged.isSorted(0, size)); + assertTrue(paged.isSorted(0, size)); shifted.quickSort(0, size); - Assertions.assertTrue(shifted.isSorted(0, size)); + assertTrue(shifted.isSorted(0, size)); } @Test void quickSortN() { basic.quickSortN(2, 0, size); - Assertions.assertTrue(basic.isSortedN(2, 0, size)); + assertTrue(basic.isSortedN(2, 0, size)); paged.quickSortN(2, 0, size); - Assertions.assertTrue(paged.isSortedN(2, 0, size)); + assertTrue(paged.isSortedN(2, 0, size)); shifted.quickSortN(2, 0, size); - Assertions.assertTrue(shifted.isSortedN(2, 0, size)); + assertTrue(shifted.isSortedN(2, 0, size)); } @Test void mergeSortN() throws IOException { basic.mergeSortN(2, 0, size, Path.of("/tmp")); - Assertions.assertTrue(basic.isSortedN(2, 0, size)); + assertTrue(basic.isSortedN(2, 0, size)); paged.mergeSortN(2, 0, size, Path.of("/tmp")); - Assertions.assertTrue(paged.isSortedN(2, 0, size)); + assertTrue(paged.isSortedN(2, 0, size)); shifted.mergeSortN(2, 0, size, Path.of("/tmp")); - Assertions.assertTrue(shifted.isSortedN(2, 0, size)); + assertTrue(shifted.isSortedN(2, 0, size)); } @Test void mergeSort() throws IOException { basic.mergeSort(0, size, Path.of("/tmp")); - Assertions.assertTrue(basic.isSorted(0, size)); + assertTrue(basic.isSorted(0, size)); paged.mergeSort(0, size, Path.of("/tmp")); - 
Assertions.assertTrue(paged.isSorted(0, size)); + assertTrue(paged.isSorted(0, size)); shifted.mergeSort(0, size, Path.of("/tmp")); - Assertions.assertTrue(shifted.isSorted(0, size)); + assertTrue(shifted.isSorted(0, size)); } } \ No newline at end of file diff --git a/marginalia_nu/src/test/java/nu/marginalia/util/array/algo/LongArrayTransformationsTest.java b/libraries/array/src/test/java/nu/marginalia/array/algo/LongArrayTransformationsTest.java similarity index 89% rename from marginalia_nu/src/test/java/nu/marginalia/util/array/algo/LongArrayTransformationsTest.java rename to libraries/array/src/test/java/nu/marginalia/array/algo/LongArrayTransformationsTest.java index 47f33bb3..b75d17a3 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/util/array/algo/LongArrayTransformationsTest.java +++ b/libraries/array/src/test/java/nu/marginalia/array/algo/LongArrayTransformationsTest.java @@ -1,9 +1,9 @@ -package nu.marginalia.util.array.algo; +package nu.marginalia.array.algo; -import nu.marginalia.util.array.LongArray; -import nu.marginalia.util.array.page.LongArrayPage; -import nu.marginalia.util.array.page.PagingLongArray; -import nu.marginalia.util.array.scheme.PowerOf2PartitioningScheme; +import nu.marginalia.array.LongArray; +import nu.marginalia.array.page.LongArrayPage; +import nu.marginalia.array.page.PagingLongArray; +import nu.marginalia.array.scheme.PowerOf2PartitioningScheme; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; diff --git a/marginalia_nu/src/test/java/nu/marginalia/util/array/scheme/ArrayPartitioningSchemeTest.java b/libraries/array/src/test/java/nu/marginalia/array/scheme/ArrayPartitioningSchemeTest.java similarity index 93% rename from marginalia_nu/src/test/java/nu/marginalia/util/array/scheme/ArrayPartitioningSchemeTest.java rename to libraries/array/src/test/java/nu/marginalia/array/scheme/ArrayPartitioningSchemeTest.java index 6c7cdd40..c0506df0 100644 --- 
a/marginalia_nu/src/test/java/nu/marginalia/util/array/scheme/ArrayPartitioningSchemeTest.java +++ b/libraries/array/src/test/java/nu/marginalia/array/scheme/ArrayPartitioningSchemeTest.java @@ -1,4 +1,4 @@ -package nu.marginalia.util.array.scheme; +package nu.marginalia.array.scheme; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Test; diff --git a/marginalia_nu/src/test/java/nu/marginalia/util/test/TestUtil.java b/libraries/array/src/test/java/nu/marginalia/util/test/TestUtil.java similarity index 69% rename from marginalia_nu/src/test/java/nu/marginalia/util/test/TestUtil.java rename to libraries/array/src/test/java/nu/marginalia/util/test/TestUtil.java index f3a40e4c..c8f3735a 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/util/test/TestUtil.java +++ b/libraries/array/src/test/java/nu/marginalia/util/test/TestUtil.java @@ -1,6 +1,5 @@ package nu.marginalia.util.test; -import lombok.SneakyThrows; import java.io.File; import java.io.IOException; @@ -12,7 +11,7 @@ public class TestUtil { private static boolean isTempDir(Path dir) { return dir.startsWith("/tmp") || dir.toString().contains("tmp"); } - @SneakyThrows + public static void clearTempDir(Path dir) { if (!isTempDir(dir)) { throw new IllegalArgumentException("Refusing to recursively delete directory with that name"); @@ -31,13 +30,18 @@ public class TestUtil { dir.toFile().delete(); } - private static String fileSize(Path path) throws IOException { - long sizeBytes = Files.size(path); + private static String fileSize(Path path) { + try { + long sizeBytes = Files.size(path); - if (sizeBytes > 1024*1024*1024) return round(sizeBytes / 1073741824.) + "Gb"; - if (sizeBytes > 1024*1024) return round(sizeBytes / 1048576.) + "Mb"; - if (sizeBytes > 1024) return round(sizeBytes / 1024.) + "Kb"; - return sizeBytes + "b"; + if (sizeBytes > 1024 * 1024 * 1024) return round(sizeBytes / 1073741824.) + "Gb"; + if (sizeBytes > 1024 * 1024) return round(sizeBytes / 1048576.) 
+ "Mb"; + if (sizeBytes > 1024) return round(sizeBytes / 1024.) + "Kb"; + return sizeBytes + "b"; + } + catch (IOException ex) { + throw new RuntimeException(ex); + } } private static String round(double d) { diff --git a/libraries/btree/build.gradle b/libraries/btree/build.gradle new file mode 100644 index 00000000..270ef2f9 --- /dev/null +++ b/libraries/btree/build.gradle @@ -0,0 +1,26 @@ +plugins { + id 'java' +} + +java { + toolchain { + languageVersion.set(JavaLanguageVersion.of(17)) + } +} + +dependencies { + implementation project(':third-party') + implementation project(':libraries:array') + + implementation libs.lombok + annotationProcessor libs.lombok + implementation libs.bundles.slf4j + + testImplementation libs.bundles.slf4j.test + testImplementation libs.bundles.junit + testImplementation libs.mockito +} + +test { + useJUnitPlatform() +} diff --git a/libraries/btree/src/main/java/nu/marginalia/btree/BTreeDogEar.java b/libraries/btree/src/main/java/nu/marginalia/btree/BTreeDogEar.java new file mode 100644 index 00000000..c3e1e345 --- /dev/null +++ b/libraries/btree/src/main/java/nu/marginalia/btree/BTreeDogEar.java @@ -0,0 +1,46 @@ +package nu.marginalia.btree; + +import nu.marginalia.array.LongArray; +import nu.marginalia.btree.model.BTreeContext; +import nu.marginalia.btree.model.BTreeHeader; + +/* + * End-of-block mark that's used as a sentinel to verify that + * the BTreeWriter's caller actually writes as much as they say + * they want to. 
(Failing to do so will corrupt the tree) + * + */ +class BTreeDogEar { + + private final LongArray sentinelSlice; + + public static BTreeDogEar empty() { + return new BTreeDogEar(null); + } + + public static BTreeDogEar create(BTreeContext ctx, BTreeHeader header, LongArray base) { + + if (header.numEntries() > 3) { + var sentinelSlice = base.range( + (long) header.numEntries() * ctx.entrySize() - 3, + (long) header.numEntries() * ctx.entrySize()); + sentinelSlice.set(0, 4L); + sentinelSlice.set(1, 5L); + sentinelSlice.set(2, 1L); + return new BTreeDogEar(sentinelSlice); + } + + return BTreeDogEar.empty(); + } + private BTreeDogEar(LongArray sentinelSlice) { + this.sentinelSlice = sentinelSlice; + } + + public boolean verify() { + if (sentinelSlice == null) + return true; + + return 4 != sentinelSlice.get(0) || 5 != sentinelSlice.get(1) || 1 != sentinelSlice.get(2); + } + +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeReader.java b/libraries/btree/src/main/java/nu/marginalia/btree/BTreeReader.java similarity index 69% rename from marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeReader.java rename to libraries/btree/src/main/java/nu/marginalia/btree/BTreeReader.java index f8bdd1f6..47e6e69b 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeReader.java +++ b/libraries/btree/src/main/java/nu/marginalia/btree/BTreeReader.java @@ -1,16 +1,15 @@ -package nu.marginalia.util.btree; +package nu.marginalia.btree; -import lombok.SneakyThrows; -import nu.marginalia.util.array.LongArray; -import nu.marginalia.util.array.algo.LongArraySearch; -import nu.marginalia.util.array.buffer.LongQueryBuffer; -import nu.marginalia.util.array.delegate.ShiftedLongArray; -import nu.marginalia.util.btree.model.BTreeContext; -import nu.marginalia.util.btree.model.BTreeHeader; +import nu.marginalia.array.LongArray; +import nu.marginalia.array.algo.LongArraySearch; +import nu.marginalia.array.buffer.LongQueryBuffer; +import 
nu.marginalia.array.delegate.ShiftedLongArray; +import nu.marginalia.btree.model.BTreeContext; +import nu.marginalia.btree.model.BTreeHeader; import static java.lang.Math.min; -public class BTreeReader implements BTreeReaderIf { +public class BTreeReader { private final LongArray index; private final ShiftedLongArray data; @@ -22,7 +21,7 @@ public class BTreeReader implements BTreeReaderIf { public BTreeReader(LongArray file, BTreeContext ctx, long offset) { this.ctx = ctx; - this.header = createHeader(file, offset); + this.header = readHeader(file, offset); dataBlockEnd = (long) ctx.entrySize() * header.numEntries(); index = file.range(header.indexOffsetLongs(), header.dataOffsetLongs()); @@ -30,10 +29,8 @@ public class BTreeReader implements BTreeReaderIf { } - public static BTreeHeader createHeader(LongArray file, long fileOffset) { - long[] parts = new long[3]; - file.get(fileOffset, fileOffset+3, parts); - return new BTreeHeader(parts[0], parts[1], parts[2]); + public static BTreeHeader readHeader(LongArray file, long fileOffset) { + return new BTreeHeader(file, fileOffset); } public BTreeHeader getHeader() { @@ -44,7 +41,7 @@ public class BTreeReader implements BTreeReaderIf { return header.numEntries(); } - @SneakyThrows + /** Keeps all items in buffer that exist in the btree */ public void retainEntries(LongQueryBuffer buffer) { if (header.layers() == 0) { BTreePointer pointer = new BTreePointer(header); @@ -55,7 +52,7 @@ public class BTreeReader implements BTreeReaderIf { retainSingle(buffer); } - @SneakyThrows + /** Removes all items in buffer that exist in the btree */ public void rejectEntries(LongQueryBuffer buffer) { if (header.layers() == 0) { BTreePointer pointer = new BTreePointer(header); @@ -100,7 +97,7 @@ public class BTreeReader implements BTreeReaderIf { } - /** + /** Locate entry in btree * * @return file offset of entry matching keyRaw, negative if absent */ @@ -118,43 +115,72 @@ public class BTreeReader implements BTreeReaderIf { 
data.get(pos, pos + n, buf); } + /** Used for querying interlaced data in the btree. + *

+ * If the entry size is e.g. 2 and the data is laid out as [key1, data1, key2, data2, key3, data3], + * then given keys=[key1, key3] and offset=1 (i.e. look 1 step to the right), the return value will be + * [data1, data3]. + *

+ * For each item in the keys array where the key is not found in the btree, the value will be zero. + *

+ * Caveat: The keys are assumed to be sorted. + */ public long[] queryData(long[] keys, int offset) { - BTreePointer pointer = new BTreePointer(header); - long[] ret = new long[keys.length]; - - // this function could be re-written like retain() and would be - // much faster + if (getClass().desiredAssertionStatus()) { + assert(isSorted(keys)); + } if (header.layers() == 0) { - long searchStart = 0; - for (int i = 0; i < keys.length; i++) { - long key = keys[i]; - searchStart = data.binarySearchN(ctx.entrySize(), key, searchStart, data.size); - if (searchStart < 0) { - searchStart = LongArraySearch.decodeSearchMiss(searchStart); - } - else { - ret[i] = data.get(searchStart + offset); - } - } - + return queryJustIndex(keys, offset); } else { - for (int i = 0; i < keys.length; i++) { - if (i > 0) { - pointer.resetToRoot(); - } + return queryBtree(keys, offset); + } + } - if (pointer.walkToData(keys[i])) { - long dataAddress = pointer.findData(keys[i]); - if (dataAddress >= 0) { - ret[i] = data.get(dataAddress + offset); - } + private boolean isSorted(long[] keys) { + for (int i = 1; i < keys.length; i++) { + if (keys[i] < keys[i-1]) + return false; + } + return true; + } + + private long[] queryJustIndex(long[] keys, int offset) { + long[] ret = new long[keys.length]; + + long searchStart = 0; + for (int i = 0; i < keys.length; i++) { + long key = keys[i]; + searchStart = data.binarySearchN(ctx.entrySize(), key, searchStart, data.size); + if (searchStart < 0) { + searchStart = LongArraySearch.decodeSearchMiss(searchStart); + } + else { + ret[i] = data.get(searchStart + offset); + } + } + return ret; + } + + private long[] queryBtree(long[] keys, int offset) { + BTreePointer pointer = new BTreePointer(header); + long[] ret = new long[keys.length]; + + // FIXME: this function could be re-written like retain() and would be much faster + for (int i = 0; i < keys.length; i++) { + if (i > 0) { + pointer.resetToRoot(); + } + + if (pointer.walkToData(keys[i])) { + long 
dataAddress = pointer.findData(keys[i]); + if (dataAddress >= 0) { + ret[i] = data.get(dataAddress + offset); } } } - return ret; } @@ -192,11 +218,11 @@ public class BTreeReader implements BTreeReaderIf { final long searchStart = layerOffsets[layer] + offset; - final long nextLayerOffset = (int) index.binarySearchUpperBound(key, searchStart, searchStart + ctx.BLOCK_SIZE_WORDS()) - searchStart; + final long nextLayerOffset = (int) index.binarySearchUpperBound(key, searchStart, searchStart + ctx.blockSizeWords()) - searchStart; layer --; boundary = index.get(searchStart + nextLayerOffset); - offset = ctx.BLOCK_SIZE_WORDS() * (offset + nextLayerOffset); + offset = ctx.blockSizeWords() * (offset + nextLayerOffset); return true; } @@ -229,7 +255,7 @@ public class BTreeReader implements BTreeReaderIf { remainingBlock = (layerOffsets.length == 0) ? remainingTotal - : (long) ctx.BLOCK_SIZE_WORDS() * ctx.entrySize(); + : (long) ctx.blockSizeWords() * ctx.entrySize(); long searchEnd = searchStart + (int) min(remainingTotal, remainingBlock); @@ -247,7 +273,7 @@ public class BTreeReader implements BTreeReaderIf { long relOffset = dataOffset - blockBase; long remainingTotal = dataBlockEnd - dataOffset; - long remainingBlock = ctx.BLOCK_SIZE_WORDS() - relOffset; + long remainingBlock = ctx.blockSizeWords() - relOffset; long searchEnd = dataOffset + (int) min(remainingTotal, remainingBlock); @@ -271,7 +297,7 @@ public class BTreeReader implements BTreeReaderIf { long relOffset = dataOffset - blockBase; long remainingTotal = dataBlockEnd - dataOffset; - long remainingBlock = ctx.BLOCK_SIZE_WORDS() - relOffset; + long remainingBlock = ctx.blockSizeWords() - relOffset; long searchEnd = dataOffset + (int) min(remainingTotal, remainingBlock); diff --git a/libraries/btree/src/main/java/nu/marginalia/btree/BTreeWriteCallback.java b/libraries/btree/src/main/java/nu/marginalia/btree/BTreeWriteCallback.java new file mode 100644 index 00000000..2e63bd86 --- /dev/null +++ 
b/libraries/btree/src/main/java/nu/marginalia/btree/BTreeWriteCallback.java @@ -0,0 +1,9 @@ +package nu.marginalia.btree; + +import nu.marginalia.array.LongArray; + +import java.io.IOException; + +public interface BTreeWriteCallback { + void write(LongArray slice) throws IOException; +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeWriter.java b/libraries/btree/src/main/java/nu/marginalia/btree/BTreeWriter.java similarity index 64% rename from marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeWriter.java rename to libraries/btree/src/main/java/nu/marginalia/btree/BTreeWriter.java index a6ad6f91..a5512e03 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeWriter.java +++ b/libraries/btree/src/main/java/nu/marginalia/btree/BTreeWriter.java @@ -1,10 +1,9 @@ -package nu.marginalia.util.btree; +package nu.marginalia.btree; -import nu.marginalia.util.array.LongArray; -import nu.marginalia.util.btree.model.BTreeContext; -import nu.marginalia.util.btree.model.BTreeHeader; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; +import nu.marginalia.array.LongArray; +import nu.marginalia.array.delegate.ShiftedLongArray; +import nu.marginalia.btree.model.BTreeContext; +import nu.marginalia.btree.model.BTreeHeader; import java.io.IOException; @@ -12,7 +11,6 @@ import java.io.IOException; public class BTreeWriter { private final BTreeContext ctx; private final LongArray map; - private final Logger logger = LoggerFactory.getLogger(getClass()); public BTreeWriter(LongArray map, BTreeContext ctx) { this.map = map; @@ -35,24 +33,24 @@ public class BTreeWriter { * * @return The size of the written data */ - public long write(long offset, int numEntries, WriteCallback writeIndexCallback) + public long write(long offset, int numEntries, BTreeWriteCallback writeIndexCallback) throws IOException { BTreeHeader header = makeHeader(offset, numEntries); - header.write(map, offset); + writeHeader(header, map, offset); final long 
startRange = header.dataOffsetLongs(); final long endRange = startRange + (long) numEntries * ctx.entrySize(); var slice = map.range(startRange, endRange); - BTreeDogEar dogEar = new BTreeDogEar(ctx, header, slice); + final BTreeDogEar dogEar = createDogEar(ctx, header, slice); writeIndexCallback.write(slice); if (!dogEar.verify()) { - logger.error("Dog ear was not overwritten: {}", header); + throw new IllegalStateException("Dog ear was not overwritten: " + header); } if (header.layers() >= 1) { // Omit layer if data fits within a single block @@ -62,10 +60,25 @@ public class BTreeWriter { return ctx.calculateSize(numEntries); } + private void writeHeader(BTreeHeader header, LongArray map, long offset) { + map.set(offset, ((long) header.layers() << 32L) | ((long)header.numEntries() & 0xFFFF_FFFFL)); + map.set(offset+1, header.indexOffsetLongs()); + map.set(offset+2, header.dataOffsetLongs()); + } + + private BTreeDogEar createDogEar(BTreeContext ctx, BTreeHeader header, ShiftedLongArray slice) { + if (BTreeWriter.class.desiredAssertionStatus()) { + return BTreeDogEar.create(ctx, header, slice); + } + else { + return BTreeDogEar.empty(); + } + } + public static BTreeHeader makeHeader(BTreeContext ctx, long offset, int numEntries) { final int numLayers = ctx.numIndexLayers(numEntries); - final int padding = BTreeHeader.getPadding(ctx, offset, numLayers); + final int padding = getHeaderPadding(ctx, offset, numLayers); final long indexOffset = offset + BTreeHeader.BTreeHeaderSizeLongs + padding; final long dataOffset = indexOffset + indexSize(ctx, numEntries, numLayers); @@ -73,6 +86,22 @@ public class BTreeWriter { return new BTreeHeader(numLayers, numEntries, indexOffset, dataOffset); } + + private static int getHeaderPadding(BTreeContext ctx, long offset, int numLayers) { + final int padding; + if (numLayers == 0) { + padding = 0; + } + else { + /* If this the amount of data is big enough to be a b-tree and not just + * a sorted list, there needs to be padding 
between the header and the index + * in order to get aligned blocks + */ + padding = (int) (ctx.blockSizeWords() - ((offset + BTreeHeader.BTreeHeaderSizeLongs) % ctx.blockSizeWords())); + } + return padding; + } + public BTreeHeader makeHeader(long offset, int numEntries) { return makeHeader(ctx, offset, numEntries); } @@ -81,7 +110,7 @@ public class BTreeWriter { private void writeIndex(BTreeHeader header) { var layerOffsets = header.getRelativeLayerOffsets(ctx); - long indexedDataStepSize = ctx.BLOCK_SIZE_WORDS(); + long indexedDataStepSize = ctx.blockSizeWords(); /* Index layer 0 indexes the data itself Index layer 1 indexes layer 0 @@ -89,7 +118,7 @@ public class BTreeWriter { And so on */ for (int layer = 0; layer < header.layers(); layer++, - indexedDataStepSize*=ctx.BLOCK_SIZE_WORDS()) { + indexedDataStepSize*=ctx.blockSizeWords()) { writeIndexLayer(header, layerOffsets, indexedDataStepSize, layer); } @@ -124,8 +153,8 @@ public class BTreeWriter { final long trailerStart = indexOffsetBase + indexWord; final long trailerEnd = trailerStart - + ctx.BLOCK_SIZE_WORDS() - - (int) (indexWord % ctx.BLOCK_SIZE_WORDS()); + + ctx.blockSizeWords() + - (int) (indexWord % ctx.blockSizeWords()); if (trailerStart < trailerEnd) { map.fill(trailerStart, trailerEnd, Long.MAX_VALUE); diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/btree/model/BTreeContext.java b/libraries/btree/src/main/java/nu/marginalia/btree/model/BTreeContext.java similarity index 63% rename from marginalia_nu/src/main/java/nu/marginalia/util/btree/model/BTreeContext.java rename to libraries/btree/src/main/java/nu/marginalia/btree/model/BTreeContext.java index d335d320..350c1ff9 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/btree/model/BTreeContext.java +++ b/libraries/btree/src/main/java/nu/marginalia/btree/model/BTreeContext.java @@ -1,11 +1,11 @@ -package nu.marginalia.util.btree.model; +package nu.marginalia.btree.model; -import nu.marginalia.util.btree.BTreeWriter; +import 
nu.marginalia.btree.BTreeWriter; -public record BTreeContext(int MAX_LAYERS, +public record BTreeContext(int maxLayers, int entrySize, - int BLOCK_SIZE_BITS, - int BLOCK_SIZE_WORDS) { + int blockSizeBits, + int blockSizeWords) { // 8 pages is the breaking point where using a B-tree is actually advantageous // over just binary searching in a sorted list. Above 8 pages, binary search will @@ -25,26 +25,26 @@ public record BTreeContext(int MAX_LAYERS, } public int numIndexLayers(int numEntries) { - if (numEntries <= BLOCK_SIZE_WORDS*MIN_PAGES_FOR_BTREE/entrySize) { + if (numEntries <= blockSizeWords *MIN_PAGES_FOR_BTREE/entrySize) { return 0; } - for (int i = 1; i < MAX_LAYERS; i++) { - long div = (1L << (BLOCK_SIZE_BITS*i)); + for (int i = 1; i < maxLayers; i++) { + long div = (1L << (blockSizeBits *i)); long frq = numEntries / div; - if (frq < (1L << BLOCK_SIZE_BITS)) { + if (frq < (1L << blockSizeBits)) { if (numEntries == (numEntries & div)) { return i; } return i+1; } } - return MAX_LAYERS; + return maxLayers; } public long indexLayerSize(int numWords, int level) { - final long layerSize = 1L<<(BLOCK_SIZE_BITS*(level+1)); + final long layerSize = 1L<<(blockSizeBits *(level+1)); - return BLOCK_SIZE_WORDS * (numWords / layerSize + Long.signum(numWords % layerSize)); + return blockSizeWords * (numWords / layerSize + Long.signum(numWords % layerSize)); } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/btree/model/BTreeHeader.java b/libraries/btree/src/main/java/nu/marginalia/btree/model/BTreeHeader.java similarity index 60% rename from marginalia_nu/src/main/java/nu/marginalia/util/btree/model/BTreeHeader.java rename to libraries/btree/src/main/java/nu/marginalia/btree/model/BTreeHeader.java index a0dc3be3..e400c5b0 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/btree/model/BTreeHeader.java +++ b/libraries/btree/src/main/java/nu/marginalia/btree/model/BTreeHeader.java @@ -1,6 +1,6 @@ -package nu.marginalia.util.btree.model; +package 
nu.marginalia.btree.model; -import nu.marginalia.util.array.LongArray; +import nu.marginalia.array.LongArray; public record BTreeHeader(int layers, int numEntries, long indexOffsetLongs, long dataOffsetLongs) { public BTreeHeader { @@ -16,30 +16,17 @@ public record BTreeHeader(int layers, int numEntries, long indexOffsetLongs, lon public BTreeHeader(long a, long b, long c) { this((int)(a >>> 32), (int)(a & 0xFFFF_FFFFL), b, c); } - - public static int getPadding(BTreeContext ctx, long offset, int numLayers) { - final int padding; - if (numLayers == 0) { - padding = 0; - } - else { - padding = (int) (ctx.BLOCK_SIZE_WORDS() - ((offset + BTreeHeader.BTreeHeaderSizeLongs) % ctx.BLOCK_SIZE_WORDS())); - } - return padding; + public BTreeHeader(LongArray array, long offset) { + this(array.get(offset), array.get(offset+1), array.get(offset+2)); } - public void write(LongArray dest, long offset) { - dest.set(offset, ((long) layers << 32L) | ((long)numEntries & 0xFFFF_FFFFL)); - dest.set(offset+1, indexOffsetLongs); - dest.set(offset+2, dataOffsetLongs); - } - - public long relativeIndexLayerOffset(BTreeContext ctx, int n) { long offset = 0; + for (int i = n+1; i < layers; i++) { offset += ctx.indexLayerSize( numEntries, i); } + return offset; } diff --git a/marginalia_nu/src/test/java/nu/marginalia/util/btree/BTreeWriterTest.java b/libraries/btree/src/test/java/nu/marginalia/btree/BTreeWriterTest.java similarity index 56% rename from marginalia_nu/src/test/java/nu/marginalia/util/btree/BTreeWriterTest.java rename to libraries/btree/src/test/java/nu/marginalia/btree/BTreeWriterTest.java index 611d3ddd..af4798f9 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/util/btree/BTreeWriterTest.java +++ b/libraries/btree/src/test/java/nu/marginalia/btree/BTreeWriterTest.java @@ -1,39 +1,43 @@ -package nu.marginalia.util.btree; +package nu.marginalia.btree; -import nu.marginalia.util.array.LongArray; -import nu.marginalia.util.btree.model.BTreeContext; -import 
nu.marginalia.util.btree.model.BTreeHeader; +import nu.marginalia.array.LongArray; +import nu.marginalia.btree.model.BTreeContext; +import nu.marginalia.btree.model.BTreeHeader; import org.junit.jupiter.api.Test; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; -import java.util.HashSet; import java.util.Set; import java.util.StringJoiner; +import java.util.stream.Collectors; +import java.util.stream.IntStream; +import java.util.stream.LongStream; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertTrue; class BTreeWriterTest { - final BTreeContext ctx = new BTreeContext(4, 2, 3); - final BTreeWriter writer = new BTreeWriter(null, ctx); - Logger logger = LoggerFactory.getLogger(getClass()); + @Test void testSmallDataBlock() { - var header = writer.makeHeader(1024, ctx.BLOCK_SIZE_WORDS()/2); + BTreeContext ctx = new BTreeContext(4, 2, 3); + BTreeWriter writer = new BTreeWriter(null, ctx); + + var header = writer.makeHeader(1024, ctx.blockSizeWords()/2); assertEquals(1024 + BTreeHeader.BTreeHeaderSizeLongs, header.dataOffsetLongs()); assertEquals(header.dataOffsetLongs(), header.indexOffsetLongs()); } @Test void testLayerCount() { - int wsq = ctx.BLOCK_SIZE_WORDS()*ctx.BLOCK_SIZE_WORDS(); - int wcub = ctx.BLOCK_SIZE_WORDS()*ctx.BLOCK_SIZE_WORDS()*ctx.BLOCK_SIZE_WORDS(); + BTreeContext ctx = new BTreeContext(4, 2, 3); + BTreeWriter writer = new BTreeWriter(null, ctx); + + int wsq = ctx.blockSizeWords()*ctx.blockSizeWords(); + int wcub = ctx.blockSizeWords()*ctx.blockSizeWords()*ctx.blockSizeWords(); assertEquals(2, writer.makeHeader(1024, wsq-1).layers()); assertEquals(2, writer.makeHeader(1024, wsq).layers()); @@ -46,7 +50,10 @@ class BTreeWriterTest { @Test void testLayerOffset() { - int wcub = ctx.BLOCK_SIZE_WORDS()*ctx.BLOCK_SIZE_WORDS()*ctx.BLOCK_SIZE_WORDS(); + BTreeContext ctx = new BTreeContext(4, 2, 3); + 
BTreeWriter writer = new BTreeWriter(null, ctx); + + int wcub = ctx.blockSizeWords()*ctx.blockSizeWords()*ctx.blockSizeWords(); System.out.println(writer.makeHeader(1025, wcub).relativeIndexLayerOffset(ctx, 0)); System.out.println(writer.makeHeader(1025, wcub).relativeIndexLayerOffset(ctx, 1)); System.out.println(writer.makeHeader(1025, wcub).relativeIndexLayerOffset(ctx, 2)); @@ -58,7 +65,7 @@ class BTreeWriterTest { printTreeLayout(i, header, ctx); if (header.layers() >= 1) { - assertEquals(1, ctx.indexLayerSize(i, header.layers() - 1) / ctx.BLOCK_SIZE_WORDS()); + assertEquals(1, ctx.indexLayerSize(i, header.layers() - 1) / ctx.blockSizeWords()); } } } @@ -66,44 +73,30 @@ class BTreeWriterTest { private void printTreeLayout(int numEntries, BTreeHeader header, BTreeContext ctx) { StringJoiner sj = new StringJoiner(","); for (int l = 0; l < header.layers(); l++) { - sj.add(""+ctx.indexLayerSize(numEntries, l)/ctx.BLOCK_SIZE_WORDS()); + sj.add(""+ctx.indexLayerSize(numEntries, l)/ctx.blockSizeWords()); } System.out.println(numEntries + ":" + sj); } @Test public void testWriteEntrySize2() throws IOException { + BTreeContext ctx = new BTreeContext(4, 2, 3); var tempFile = Files.createTempFile(Path.of("/tmp"), "tst", "dat"); - Set toPut = new HashSet<>(); - for (int i = 0; i < 64; i++) { - while (!toPut.add((int)(Integer.MAX_VALUE * Math.random()))); - } - - int[] data = toPut.stream().mapToInt(Integer::valueOf).sorted().toArray(); + int[] data = generateItems32(64); try { LongArray longArray = LongArray.allocate(10000); - { - var writer = new BTreeWriter(longArray, ctx); - writer.write(0, toPut.size(), (slice) -> { - for (int i = 0; i < data.length; i++) { - slice.set(2L*i, data[i]); - slice.set( 2L*i + 1, i); - } - }); - } + writeIntEntrySize2(data, ctx, longArray); - { - var reader = new BTreeReader(longArray, ctx, 0); - for (int i = 0; i < data.length; i++) { - long offset = reader.findEntry(data[i]); - assertTrue(offset >= 0, "Negative offset for " + i + " -> " + 
offset); - offset += reader.getHeader().dataOffsetLongs(); - assertEquals(i, longArray.get(offset+1)); - } + var reader = new BTreeReader(longArray, ctx, 0); + for (int i = 0; i < data.length; i++) { + long offset = reader.findEntry(data[i]); + assertTrue(offset >= 0, "Negative offset for " + i + " -> " + offset); + offset += reader.getHeader().dataOffsetLongs(); + assertEquals(i, longArray.get(offset+1)); } } catch (Exception e) { e.printStackTrace(); @@ -114,23 +107,15 @@ class BTreeWriterTest { @Test public void testWriteEntrySize2Small() throws IOException { + BTreeContext ctx = new BTreeContext(4, 2, 3); - var tempFile = Files.createTempFile(Path.of("/tmp"), "tst", "dat"); - Set toPut = new HashSet<>(); + int[] data = generateItems32(5); + Set items = IntStream.of(data).boxed().collect(Collectors.toSet()); - for (int i = 0; i < 5; i++) { - while (!toPut.add((int)(Integer.MAX_VALUE * Math.random()))); - } - - int[] data = toPut.stream().mapToInt(Integer::valueOf).sorted().toArray(); LongArray array = LongArray.allocate(22000); - var writer = new BTreeWriter(array, ctx); - writer.write( 0, toPut.size(), (slice) -> { - for (int i = 0; i < data.length; i++) { - slice.set(2L*i, data[i]); - slice.set(2L*i + 1, i); - } - }); + + writeIntEntrySize2(data, ctx, array); + var reader = new BTreeReader(array, ctx, 0); for (int i = 0; i < data.length; i++) { long offset = reader.findEntry(data[i]); @@ -141,7 +126,7 @@ class BTreeWriterTest { for (int i = 0; i < 500; i++) { long val = (long)(Long.MAX_VALUE * Math.random()); - while (toPut.contains((int)val)) val = (long)(Long.MAX_VALUE * Math.random()); + while (items.contains((int)val)) val = (long)(Long.MAX_VALUE * Math.random()); assertTrue(reader.findEntry( val) < 0); } } @@ -150,28 +135,16 @@ class BTreeWriterTest { @Test public void testWriteEqualityNotMasked() throws IOException { for (int bs = 2; bs <= 4; bs++) { - var tempFile = Files.createTempFile(Path.of("/tmp"), "tst", "dat"); - Set toPut = new HashSet<>(); - 
var ctx = new BTreeContext(5, 1, bs); - - for (int i = 0; i < 500; i++) { - while (!toPut.add((long) (Long.MAX_VALUE * Math.random()))) ; - } - - long[] data = toPut.stream().mapToLong(Long::valueOf).sorted().toArray(); + long[] data = generateItems64(500); + Set items = LongStream.of(data).boxed().collect(Collectors.toSet()); LongArray array = LongArray.allocate(22000); - var writer = new BTreeWriter(array, ctx); - writer.write(0, toPut.size(), (slice) -> { - for (int i = 0; i < data.length; i++) { - slice.set(i, data[i]); - } - }); + writeLongEntrySize1(data, ctx, array); var reader = new BTreeReader(array, ctx, 0); - printTreeLayout(toPut.size(), reader.getHeader(), ctx); + printTreeLayout(data.length, reader.getHeader(), ctx); for (int i = 0; i < data.length; i++) { long offset = reader.findEntry(data[i]); @@ -182,10 +155,38 @@ class BTreeWriterTest { for (int i = 0; i < 500; i++) { long val = (long) (Long.MAX_VALUE * Math.random()); - while (toPut.contains(val)) val = (long) (Long.MAX_VALUE * Math.random()); + while (items.contains(val)) val = (long) (Long.MAX_VALUE * Math.random()); assertTrue(reader.findEntry( val) < 0); } } } + public int[] generateItems32(int n) { + return IntStream.generate(() -> (int) (Integer.MAX_VALUE * Math.random())).distinct().limit(n).sorted().toArray(); + } + + public long[] generateItems64(int n) { + return LongStream.generate(() -> (long) (Long.MAX_VALUE * Math.random())).distinct().limit(n).sorted().toArray(); + } + + private void writeIntEntrySize2(int[] data, BTreeContext ctx, LongArray array) throws IOException { + var writer = new BTreeWriter(array, ctx); + writer.write(0, data.length, (slice) -> { + for (int i = 0; i < data.length; i++) { + slice.set(2L*i, data[i]); + slice.set(2L*i + 1, i); + } + }); + } + + private void writeLongEntrySize1(long[] data, BTreeContext ctx, LongArray array) throws IOException { + var writer = new BTreeWriter(array, ctx); + writer.write(0, data.length, (slice) -> { + for (int i = 0; i < 
data.length; i++) { + slice.set(i, data[i]); + } + }); + } + + } \ No newline at end of file diff --git a/libraries/language-processing/build.gradle b/libraries/language-processing/build.gradle new file mode 100644 index 00000000..ae614d46 --- /dev/null +++ b/libraries/language-processing/build.gradle @@ -0,0 +1,50 @@ +plugins { + id 'java' + id "io.freefair.lombok" version "5.3.3.3" + + id "me.champeau.jmh" version "0.6.6" + id "de.undercouch.download" version "5.1.0" + + id 'jvm-test-suite' +} + +java { + toolchain { + languageVersion.set(JavaLanguageVersion.of(17)) + } +} + +dependencies { + implementation project(':third-party') + implementation project(':protocol') + implementation project(':common:model') + implementation project(':common:config') + implementation project(':libraries:misc') + + implementation libs.lombok + annotationProcessor libs.lombok + implementation libs.bundles.slf4j + implementation libs.notnull + + implementation libs.guice + implementation libs.jsoup + implementation libs.trove + + implementation libs.bundles.nlp + implementation libs.commons.lang3 + + testImplementation libs.bundles.slf4j.test + testImplementation libs.bundles.junit + testImplementation libs.mockito +} + + +test { + useJUnitPlatform() +} + +task fastTests(type: Test) { + useJUnitPlatform { + excludeTags "slow" + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/language/LanguageFilter.java b/libraries/language-processing/src/main/java/nu/marginalia/language/LanguageFilter.java similarity index 95% rename from marginalia_nu/src/main/java/nu/marginalia/util/language/LanguageFilter.java rename to libraries/language-processing/src/main/java/nu/marginalia/language/LanguageFilter.java index 7649201b..b4ba2793 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/language/LanguageFilter.java +++ b/libraries/language-processing/src/main/java/nu/marginalia/language/LanguageFilter.java @@ -1,6 +1,7 @@ -package nu.marginalia.util.language; +package 
nu.marginalia.language; -import nu.marginalia.util.language.processing.model.DocumentLanguageData; +import nu.marginalia.language.encoding.UnicodeRanges; +import nu.marginalia.language.model.DocumentLanguageData; import org.jsoup.nodes.Document; import org.slf4j.Logger; import org.slf4j.LoggerFactory; diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/language/WordPatterns.java b/libraries/language-processing/src/main/java/nu/marginalia/language/WordPatterns.java similarity index 99% rename from marginalia_nu/src/main/java/nu/marginalia/util/language/WordPatterns.java rename to libraries/language-processing/src/main/java/nu/marginalia/language/WordPatterns.java index c2fc0045..270d6810 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/language/WordPatterns.java +++ b/libraries/language-processing/src/main/java/nu/marginalia/language/WordPatterns.java @@ -1,4 +1,4 @@ -package nu.marginalia.util.language; +package nu.marginalia.language; import org.apache.commons.lang3.StringUtils; diff --git a/libraries/language-processing/src/main/java/nu/marginalia/language/encoding/AsciiFlattener.java b/libraries/language-processing/src/main/java/nu/marginalia/language/encoding/AsciiFlattener.java new file mode 100644 index 00000000..77cc1196 --- /dev/null +++ b/libraries/language-processing/src/main/java/nu/marginalia/language/encoding/AsciiFlattener.java @@ -0,0 +1,130 @@ +package nu.marginalia.language.encoding; + +public class AsciiFlattener { + /** Maps curly quotes and accented Latin letters to ASCII equivalents and drops any other non-ASCII character; plain-ASCII input is returned as the same instance. */ + public static String flattenUnicode(String s) { + + if (isPlainAscii(s)) { + return s; + } + + StringBuilder sb = new StringBuilder(s.length()); + + // Advance by UTF-16 index so a surrogate pair is read as one code point and no trailing chars are skipped + + // Falsehoods programmers believe about the latin alphabet ;-) + + for (int i = 0; i < s.length(); i += Character.charCount(s.codePointAt(i))) { + int c = s.codePointAt(i); + + if ("\u201C\u201D".indexOf(c) >= 0) { + sb.append('"'); + } + else if ("áâàȁăåäāǟãąą̊ḁẚⱥ".indexOf(c) >= 0) { + sb.append('a'); + } + else if ("ḃḅḇƀɓ".indexOf(c) >= 0) { + sb.append('b'); +
} + else if ("ćĉčçḉċƈȼ".indexOf(c) >= 0) { + sb.append('c'); + } + else if ("ɗḓďḋḍḏḑđðɖḏ".indexOf(c) >= 0) { + sb.append('d'); + } + else if ("éêèȅěëēẽĕęėẹȇḕḗḙḛḝɇ".indexOf(c) >= 0) { + sb.append('e'); + } + else if ("ḟƒ".indexOf(c) >= 0) { + sb.append('f'); + } + else if ("ǵĝǧğġģɠḡǥ".indexOf(c) >= 0) { + sb.append('g'); + } + else if ("ĥȟḧḣḥẖḩḫħⱨ".indexOf(c) >= 0) { + sb.append('h'); + } + else if ("iıíîìȉïḯīĩįịḭ".indexOf(c) >= 0) { + sb.append('i'); + } + else if ("ĵǰɉ".indexOf(c) >= 0) { + sb.append('j'); + } + else if ("ḱǩķḳḵƙⱪ".indexOf(c) >= 0) { + sb.append('k'); + } + else if ("ĺłḽľļḷḹḻƚɫⱡ".indexOf(c) >= 0) { + sb.append('l'); + } + else if ("ḿṁṃ".indexOf(c) >= 0) { + sb.append('m'); + } + else if ("ŋńǹñṋňṅṇṉʼnn̈ņ".indexOf(c) >= 0) { + sb.append('n'); + } + else if ("óőôòȍŏȯȱöȫōṓṑõṍṏȭøǿǫǭọȏơ".indexOf(c) >= 0) { + sb.append('o'); + } + else if ("ṕṗƥᵽ".indexOf(c) >= 0) { + sb.append('p'); + } + else if ("ꝗ".indexOf(c) >= 0) { + sb.append('q'); + } + else if ("ŕȑřŗṙṛṝṟɍɽ".indexOf(c) >= 0) { + sb.append('r'); + } + else if ("śṥŝšṧşșṡṣṩ".indexOf(c) >= 0) { + sb.append('s'); + } + else if ("ťṱẗţțŧṫṭṯⱦ".indexOf(c) >= 0) { + sb.append('t'); + } + else if ("úùûŭưűüūṻųůũṹụṳṵṷʉ".indexOf(c) >= 0) { + sb.append('u'); + } + else if ("ṽṿʋỽ".indexOf(c) >= 0) { + sb.append('v'); + } + else if ("ẃŵẁẅẘẇẉⱳ".indexOf(c) >= 0) { + sb.append('w'); + } + else if ("x̂ẍẋ".indexOf(c) >= 0) { + sb.append('x'); + } + else if ("ƴýŷỳÿȳỹẙẏy̨ɏỿ".indexOf(c) >= 0) { + sb.append('y'); + } + else if ("źẑžżẓẕƶȥ".indexOf(c) >= 0) { + sb.append('z'); + } + else if ("Þþ".indexOf(c) >= 0) { + sb.append("th"); + } + else if ('ß' == c) { + sb.append("ss"); + } + else if (isAscii(c)) { + sb.append((char) c); + } + } + + return sb.toString(); + } + + private static boolean isPlainAscii(String s) { + int i; + + int numCp = s.codePointCount(0, s.length()); + + for (i = 0; i < numCp && isAscii(s.codePointAt(i)); i++); + + return i == s.length(); + } + + private static boolean isAscii(int c) { + return (c &
~0x7f) == 0; + } + + +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/HtmlTagCleaner.java b/libraries/language-processing/src/main/java/nu/marginalia/language/encoding/HtmlTagCleaner.java similarity index 96% rename from marginalia_nu/src/main/java/nu/marginalia/util/language/processing/HtmlTagCleaner.java rename to libraries/language-processing/src/main/java/nu/marginalia/language/encoding/HtmlTagCleaner.java index 17156c3b..c2865f14 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/HtmlTagCleaner.java +++ b/libraries/language-processing/src/main/java/nu/marginalia/language/encoding/HtmlTagCleaner.java @@ -1,4 +1,4 @@ -package nu.marginalia.util.language.processing; +package nu.marginalia.language.encoding; import org.jsoup.nodes.Document; import org.jsoup.nodes.TextNode; diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/language/UnicodeRanges.java b/libraries/language-processing/src/main/java/nu/marginalia/language/encoding/UnicodeRanges.java similarity index 98% rename from marginalia_nu/src/main/java/nu/marginalia/util/language/UnicodeRanges.java rename to libraries/language-processing/src/main/java/nu/marginalia/language/encoding/UnicodeRanges.java index bd1d3043..d690269b 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/language/UnicodeRanges.java +++ b/libraries/language-processing/src/main/java/nu/marginalia/language/encoding/UnicodeRanges.java @@ -1,4 +1,4 @@ -package nu.marginalia.util.language; +package nu.marginalia.language.encoding; public enum UnicodeRanges { GREEK(false, 0x0370,0x03FF), diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/KeywordExtractor.java b/libraries/language-processing/src/main/java/nu/marginalia/language/keywords/KeywordExtractor.java similarity index 96% rename from marginalia_nu/src/main/java/nu/marginalia/util/language/processing/KeywordExtractor.java rename to 
libraries/language-processing/src/main/java/nu/marginalia/language/keywords/KeywordExtractor.java index 8673ac4c..a00df4e7 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/KeywordExtractor.java +++ b/libraries/language-processing/src/main/java/nu/marginalia/language/keywords/KeywordExtractor.java @@ -1,9 +1,9 @@ -package nu.marginalia.util.language.processing; +package nu.marginalia.language.keywords; -import nu.marginalia.util.language.WordPatterns; -import nu.marginalia.util.language.processing.model.DocumentSentence; -import nu.marginalia.util.language.processing.model.WordSpan; -import nu.marginalia.util.language.processing.model.tag.WordSeparator; +import nu.marginalia.language.WordPatterns; +import nu.marginalia.language.model.DocumentSentence; +import nu.marginalia.language.model.WordSpan; +import nu.marginalia.language.model.WordSeparator; import java.lang.ref.SoftReference; import java.util.ArrayList; diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/model/DocumentLanguageData.java b/libraries/language-processing/src/main/java/nu/marginalia/language/model/DocumentLanguageData.java similarity index 68% rename from marginalia_nu/src/main/java/nu/marginalia/util/language/processing/model/DocumentLanguageData.java rename to libraries/language-processing/src/main/java/nu/marginalia/language/model/DocumentLanguageData.java index 89b95fd0..a40fd637 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/model/DocumentLanguageData.java +++ b/libraries/language-processing/src/main/java/nu/marginalia/language/model/DocumentLanguageData.java @@ -1,8 +1,9 @@ -package nu.marginalia.util.language.processing.model; +package nu.marginalia.language.model; import gnu.trove.map.hash.TObjectIntHashMap; import lombok.AllArgsConstructor; -import nu.marginalia.util.language.processing.sentence.SentenceExtractor; +import nu.marginalia.language.sentence.SentenceExtractor; +import 
nu.marginalia.lsh.EasyLSH; import java.util.Arrays; import java.util.stream.Stream; @@ -31,4 +32,15 @@ public class DocumentLanguageData { public Stream stream() { return Arrays.stream(sentences).map(sent -> sent.words).flatMap(Arrays::stream); } + + public long localitySensitiveHashCode() { + var hash = new EasyLSH(); + + for (var sent : sentences) { + for (var word : sent) { + hash.addUnordered(word.word()); + } + } + return hash.get(); + } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/model/DocumentSentence.java b/libraries/language-processing/src/main/java/nu/marginalia/language/model/DocumentSentence.java similarity index 94% rename from marginalia_nu/src/main/java/nu/marginalia/util/language/processing/model/DocumentSentence.java rename to libraries/language-processing/src/main/java/nu/marginalia/language/model/DocumentSentence.java index 0f0ae0aa..cf5d7488 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/model/DocumentSentence.java +++ b/libraries/language-processing/src/main/java/nu/marginalia/language/model/DocumentSentence.java @@ -1,7 +1,7 @@ -package nu.marginalia.util.language.processing.model; +package nu.marginalia.language.model; -import nu.marginalia.util.language.WordPatterns; +import nu.marginalia.language.WordPatterns; import org.jetbrains.annotations.NotNull; import java.lang.ref.SoftReference; @@ -52,7 +52,37 @@ public class DocumentSentence implements Iterable{ return words.length; } - private String removeJunk(String s) { + public String constructWordFromSpan(WordSpan span) { + if (span.size() == 1) { + return trimJunkCharacters(wordsLowerCase[span.start]); + } + else { + StringJoiner sj = new StringJoiner("_"); + for (int i = span.start; i < span.end; i++) { + sj.add(wordsLowerCase[i]); + } + return trimJunkCharacters(sj.toString()); + } + } + + public String constructStemmedWordFromSpan(WordSpan span) { + if (span.size() > 1) { + + StringJoiner sj = new StringJoiner("_"); + 
for (int i = span.start; i < span.end; i++) { + if (includeInStemming(i)) + sj.add(normalizeJoiner(stemmedWords[i])); + + } + return sj.toString(); + } + else if (includeInStemming(span.start)) { + return normalizeJoiner(stemmedWords[span.start]); + } + else return ""; + } + + private String trimJunkCharacters(String s) { int start = 0; int end = s.length(); @@ -73,21 +103,6 @@ public class DocumentSentence implements Iterable{ return s; } } - - public String constructWordFromSpan(WordSpan span) { - if (span.size() == 1) { - return removeJunk(wordsLowerCase[span.start]); - } - else { - StringJoiner sj = new StringJoiner("_"); - for (int i = span.start; i < span.end; i++) { - sj.add(wordsLowerCase[i]); - } - return removeJunk(sj.toString()); - } - } - - private String normalizeJoiner(String s) { if (s.indexOf('+') >= 0) { @@ -101,22 +116,6 @@ public class DocumentSentence implements Iterable{ } return s; } - public String constructStemmedWordFromSpan(WordSpan span) { - if (span.size() > 1) { - - StringJoiner sj = new StringJoiner("_"); - for (int i = span.start; i < span.end; i++) { - if (includeInStemming(i)) - sj.add(normalizeJoiner(stemmedWords[i])); - - } - return sj.toString(); - } - else if (includeInStemming(span.start)) { - return normalizeJoiner(stemmedWords[span.start]); - } - else return ""; - } private boolean includeInStemming(int i) { if (posTags[i].equals("IN") || posTags[i].equals("TO") || posTags[i].equals("CC") || posTags[i].equals("DT")) { @@ -125,7 +124,6 @@ public class DocumentSentence implements Iterable{ return true; } - @Override public String toString() { return IntStream.range(0, length()).mapToObj(i -> String.format("%s[%s]", words[i], posTags[i])).collect(Collectors.joining(" ")); diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/model/KeywordMetadata.java b/libraries/language-processing/src/main/java/nu/marginalia/language/model/KeywordMetadata.java similarity index 72% rename from 
marginalia_nu/src/main/java/nu/marginalia/util/language/processing/model/KeywordMetadata.java rename to libraries/language-processing/src/main/java/nu/marginalia/language/model/KeywordMetadata.java index 58e53551..b44b510d 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/model/KeywordMetadata.java +++ b/libraries/language-processing/src/main/java/nu/marginalia/language/model/KeywordMetadata.java @@ -1,8 +1,7 @@ -package nu.marginalia.util.language.processing.model; +package nu.marginalia.language.model; -import nu.marginalia.util.language.processing.KeywordCounter; -import nu.marginalia.wmsa.edge.index.model.EdgePageWordFlags; -import nu.marginalia.wmsa.edge.index.model.EdgePageWordMetadata; +import nu.marginalia.model.idx.EdgePageWordMetadata; +import nu.marginalia.model.crawl.EdgePageWordFlags; import java.util.EnumSet; import java.util.HashMap; @@ -11,7 +10,7 @@ import java.util.HashSet; public record KeywordMetadata(HashSet titleKeywords, HashSet subjectKeywords, HashSet namesKeywords, - HashMap wordsTfIdf, + HashMap wordsTfIdf, HashMap positionMask, EnumSet wordFlagsTemplate ) @@ -28,10 +27,10 @@ public record KeywordMetadata(HashSet titleKeywords, this(EnumSet.noneOf(EdgePageWordFlags.class)); } - private static final KeywordCounter.WordFrequencyData empty = new KeywordCounter.WordFrequencyData(0, 0); + private static final WordFrequencyData empty = new WordFrequencyData(0, 0); public long getMetadataForWord(EnumSet flagsTemplate, String stemmed) { - KeywordCounter.WordFrequencyData tfidf = wordsTfIdf.getOrDefault(stemmed, empty); + WordFrequencyData tfidf = wordsTfIdf.getOrDefault(stemmed, empty); EnumSet flags = flagsTemplate.clone(); if (subjectKeywords.contains(stemmed)) diff --git a/libraries/language-processing/src/main/java/nu/marginalia/language/model/WordFrequencyData.java b/libraries/language-processing/src/main/java/nu/marginalia/language/model/WordFrequencyData.java new file mode 100644 index 00000000..3435a702 --- 
/dev/null +++ b/libraries/language-processing/src/main/java/nu/marginalia/language/model/WordFrequencyData.java @@ -0,0 +1,4 @@ +package nu.marginalia.language.model; + + +public record WordFrequencyData(int count, int tfIdfNormalized) { } \ No newline at end of file diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/model/WordRep.java b/libraries/language-processing/src/main/java/nu/marginalia/language/model/WordRep.java similarity index 95% rename from marginalia_nu/src/main/java/nu/marginalia/util/language/processing/model/WordRep.java rename to libraries/language-processing/src/main/java/nu/marginalia/language/model/WordRep.java index 5f87894f..541539f7 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/model/WordRep.java +++ b/libraries/language-processing/src/main/java/nu/marginalia/language/model/WordRep.java @@ -1,4 +1,4 @@ -package nu.marginalia.util.language.processing.model; +package nu.marginalia.language.model; import lombok.AllArgsConstructor; import lombok.Getter; diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/model/tag/WordSeparator.java b/libraries/language-processing/src/main/java/nu/marginalia/language/model/WordSeparator.java similarity index 66% rename from marginalia_nu/src/main/java/nu/marginalia/util/language/processing/model/tag/WordSeparator.java rename to libraries/language-processing/src/main/java/nu/marginalia/language/model/WordSeparator.java index 231ea7cd..3476073f 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/model/tag/WordSeparator.java +++ b/libraries/language-processing/src/main/java/nu/marginalia/language/model/WordSeparator.java @@ -1,4 +1,4 @@ -package nu.marginalia.util.language.processing.model.tag; +package nu.marginalia.language.model; public final class WordSeparator { public static final int COMMA = 0; diff --git 
a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/model/WordSpan.java b/libraries/language-processing/src/main/java/nu/marginalia/language/model/WordSpan.java similarity index 96% rename from marginalia_nu/src/main/java/nu/marginalia/util/language/processing/model/WordSpan.java rename to libraries/language-processing/src/main/java/nu/marginalia/language/model/WordSpan.java index 44c20a7f..82ea4e48 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/model/WordSpan.java +++ b/libraries/language-processing/src/main/java/nu/marginalia/language/model/WordSpan.java @@ -1,4 +1,4 @@ -package nu.marginalia.util.language.processing.model; +package nu.marginalia.language.model; import lombok.AllArgsConstructor; import lombok.EqualsAndHashCode; diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/sentence/SentenceExtractor.java b/libraries/language-processing/src/main/java/nu/marginalia/language/sentence/SentenceExtractor.java similarity index 94% rename from marginalia_nu/src/main/java/nu/marginalia/util/language/processing/sentence/SentenceExtractor.java rename to libraries/language-processing/src/main/java/nu/marginalia/language/sentence/SentenceExtractor.java index 2957eaa9..abb4ec8a 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/sentence/SentenceExtractor.java +++ b/libraries/language-processing/src/main/java/nu/marginalia/language/sentence/SentenceExtractor.java @@ -1,17 +1,17 @@ -package nu.marginalia.util.language.processing.sentence; +package nu.marginalia.language.sentence; import com.github.datquocnguyen.RDRPOSTagger; -import com.github.jknack.handlebars.internal.lang3.StringUtils; import gnu.trove.map.hash.TObjectIntHashMap; import lombok.SneakyThrows; +import nu.marginalia.LanguageModels; +import nu.marginalia.language.encoding.HtmlTagCleaner; import nu.marginalia.util.StringPool; -import nu.marginalia.util.language.conf.LanguageModels; -import 
nu.marginalia.util.language.processing.HtmlTagCleaner; -import nu.marginalia.util.language.processing.model.DocumentLanguageData; -import nu.marginalia.util.language.processing.model.DocumentSentence; +import nu.marginalia.language.model.DocumentLanguageData; +import nu.marginalia.language.model.DocumentSentence; import opennlp.tools.sentdetect.SentenceDetectorME; import opennlp.tools.sentdetect.SentenceModel; import opennlp.tools.stemmer.PorterStemmer; +import org.apache.commons.lang3.StringUtils; import org.jetbrains.annotations.NotNull; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; @@ -81,7 +81,7 @@ public class SentenceExtractor { Optional.ofNullable(doc.getElementsByTag("h1").first()).map(Element::text).orElse(""); if (title.trim().length() < 3) { - title = Optional.ofNullable(doc.getElementsByTag("h2").first()).map(Element::text).orElse(""); + title = doc.getElementsByTag("h2").text(); } if (title.trim().length() < 3 && textSentences.length > 0) { diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/sentence/SentenceExtractorStringUtils.java b/libraries/language-processing/src/main/java/nu/marginalia/language/sentence/SentenceExtractorStringUtils.java similarity index 97% rename from marginalia_nu/src/main/java/nu/marginalia/util/language/processing/sentence/SentenceExtractorStringUtils.java rename to libraries/language-processing/src/main/java/nu/marginalia/language/sentence/SentenceExtractorStringUtils.java index 08a1605c..41f27c24 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/sentence/SentenceExtractorStringUtils.java +++ b/libraries/language-processing/src/main/java/nu/marginalia/language/sentence/SentenceExtractorStringUtils.java @@ -1,4 +1,4 @@ -package nu.marginalia.util.language.processing.sentence; +package nu.marginalia.language.sentence; import java.util.Arrays; import java.util.Objects; diff --git 
a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/sentence/SentenceSegmentSplitter.java b/libraries/language-processing/src/main/java/nu/marginalia/language/sentence/SentenceSegmentSplitter.java similarity index 92% rename from marginalia_nu/src/main/java/nu/marginalia/util/language/processing/sentence/SentenceSegmentSplitter.java rename to libraries/language-processing/src/main/java/nu/marginalia/language/sentence/SentenceSegmentSplitter.java index 6a4516cf..4eb0dccf 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/sentence/SentenceSegmentSplitter.java +++ b/libraries/language-processing/src/main/java/nu/marginalia/language/sentence/SentenceSegmentSplitter.java @@ -1,14 +1,14 @@ -package nu.marginalia.util.language.processing.sentence; +package nu.marginalia.language.sentence; import gnu.trove.list.array.TIntArrayList; import lombok.AllArgsConstructor; import lombok.Getter; -import nu.marginalia.util.language.processing.model.tag.WordSeparator; +import nu.marginalia.language.model.WordSeparator; import java.util.ArrayList; import java.util.List; -import static nu.marginalia.util.language.WordPatterns.*; +import static nu.marginalia.language.WordPatterns.*; public class SentenceSegmentSplitter { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/EnglishDictionary.java b/libraries/language-processing/src/main/java/nu/marginalia/language/statistics/EnglishDictionary.java similarity index 97% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/EnglishDictionary.java rename to libraries/language-processing/src/main/java/nu/marginalia/language/statistics/EnglishDictionary.java index 9a9fefd0..f861fdda 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/EnglishDictionary.java +++ b/libraries/language-processing/src/main/java/nu/marginalia/language/statistics/EnglishDictionary.java @@ -1,7 +1,6 @@ -package nu.marginalia.wmsa.edge.search.query; 
+package nu.marginalia.language.statistics; import com.google.inject.Inject; -import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict; import org.slf4j.Logger; import org.slf4j.LoggerFactory; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/dict/NGramBloomFilter.java b/libraries/language-processing/src/main/java/nu/marginalia/language/statistics/NGramBloomFilter.java similarity index 61% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/dict/NGramBloomFilter.java rename to libraries/language-processing/src/main/java/nu/marginalia/language/statistics/NGramBloomFilter.java index cb463c61..c842ee5b 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/dict/NGramBloomFilter.java +++ b/libraries/language-processing/src/main/java/nu/marginalia/language/statistics/NGramBloomFilter.java @@ -1,22 +1,18 @@ -package nu.marginalia.wmsa.edge.assistant.dict; +package nu.marginalia.language.statistics; import ca.rmen.porterstemmer.PorterStemmer; import com.google.common.hash.HashFunction; import com.google.common.hash.Hashing; import com.google.inject.Inject; +import nu.marginalia.LanguageModels; import nu.marginalia.util.DenseBitMap; -import nu.marginalia.util.language.conf.LanguageModels; -import nu.marginalia.wmsa.configuration.WmsaHome; -import nu.marginalia.wmsa.edge.index.lexicon.journal.KeywordLexiconJournalFile; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.io.File; import java.io.IOException; import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.nio.file.Path; -import java.util.concurrent.atomic.AtomicInteger; import java.util.regex.Pattern; public class NGramBloomFilter { @@ -27,10 +23,6 @@ public class NGramBloomFilter { private static final Logger logger = LoggerFactory.getLogger(NGramBloomFilter.class); @Inject - public NGramBloomFilter() throws IOException { - this(WmsaHome.getLanguageModels()); - } - public 
NGramBloomFilter(LanguageModels lm) throws IOException { this(loadSafely(lm.ngramBloomFilter)); } @@ -55,29 +47,29 @@ public class NGramBloomFilter { return bitMap.get(bit); } - public static void main(String... args) throws IOException { - var filter = convertFromDictionaryFile(new File(args[0])); - filter.bitMap.writeToFile(Path.of(args[1])); - } +// public static void main(String... args) throws IOException { +// var filter = convertFromDictionaryFile(new File(args[0])); +// filter.bitMap.writeToFile(Path.of(args[1])); +// } public static NGramBloomFilter load(Path file) throws IOException { return new NGramBloomFilter(DenseBitMap.loadFromFile(file)); } - public static NGramBloomFilter convertFromDictionaryFile(File file) throws IOException { - DenseBitMap bitMap = new DenseBitMap(1024*1024*1024L); - AtomicInteger popCount = new AtomicInteger(); - try (var f = new KeywordLexiconJournalFile(file)) { - f.loadFile(data -> { - long bit = bitForWord(new String(data), bitMap.cardinality); - if (!bitMap.set(bit)) - popCount.incrementAndGet(); - }); - } - - System.out.println("popcount = " + popCount.get()); - return new NGramBloomFilter(bitMap); - } +// public static NGramBloomFilter convertFromDictionaryFile(File file) throws IOException { +// DenseBitMap bitMap = new DenseBitMap(1024*1024*1024L); +// AtomicInteger popCount = new AtomicInteger(); +// try (var f = new KeywordLexiconJournalFile(file)) { +// f.loadFile(data -> { +// long bit = bitForWord(new String(data), bitMap.cardinality); +// if (!bitMap.set(bit)) +// popCount.incrementAndGet(); +// }); +// } +// +// System.out.println("popcount = " + popCount.get()); +// return new NGramBloomFilter(bitMap); +// } private static final Pattern underscore = Pattern.compile("_"); diff --git a/libraries/language-processing/src/main/java/nu/marginalia/language/statistics/TermFrequencyDict.java b/libraries/language-processing/src/main/java/nu/marginalia/language/statistics/TermFrequencyDict.java new file mode 100644 index 
00000000..d96a9625 --- /dev/null +++ b/libraries/language-processing/src/main/java/nu/marginalia/language/statistics/TermFrequencyDict.java @@ -0,0 +1,206 @@ +package nu.marginalia.language.statistics; + +import ca.rmen.porterstemmer.PorterStemmer; +import gnu.trove.map.hash.TLongIntHashMap; +import nu.marginalia.LanguageModels; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import javax.annotation.Nullable; +import javax.inject.Inject; +import javax.inject.Singleton; +import java.io.*; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.Arrays; +import java.util.regex.Pattern; +import java.util.stream.Collectors; + +@Singleton +public class TermFrequencyDict { + + private final TLongIntHashMap wordRates = new TLongIntHashMap(1_000_000, 0.5f, 0, 0); + + private final Logger logger = LoggerFactory.getLogger(getClass()); + private static final Pattern separator = Pattern.compile("[_ ]+"); + private static final PorterStemmer ps = new PorterStemmer(); + + private static final long DOC_COUNT_KEY = ~0L; + private static long fileSize(Path p) throws IOException { + return Files.size(p); + } + + @Inject + public TermFrequencyDict(@Nullable LanguageModels models) { + if (models == null) { + return; + } + + if (models.termFrequencies != null) { + + try (var frequencyData = new DataInputStream(new BufferedInputStream(new FileInputStream(models.termFrequencies.toFile())))) { + + wordRates.ensureCapacity((int)(fileSize(models.termFrequencies)/16)); + + for (;;) { + wordRates.put(frequencyData.readLong(), (int) frequencyData.readLong()); + } + } catch (EOFException eof) { + // ok + } catch (IOException e) { + logger.error("IO Exception reading " + models.termFrequencies, e); + } + } + + logger.info("Read {} N-grams frequencies", wordRates.size()); + } + + + public int docCount() { + int cnt = wordRates.get(DOC_COUNT_KEY); + + if (cnt == 0) { + cnt = 11820118; // legacy + } + return cnt; + } + +// +// public static void main(String... 
args) throws IOException, InterruptedException { +// if (args.length != 2) { +// System.err.println("Expected arguments: plan.yaml out-file"); +// } +// String outFile = args[1]; +// +// var plan = new CrawlPlanLoader().load(Path.of(args[0])); +// +// ThreadLocal se = ThreadLocal.withInitial(() -> new SentenceExtractor(WmsaHome.getLanguageModels())); +// LanguageFilter lf = new LanguageFilter(); +// +// TLongIntHashMap counts = new TLongIntHashMap(100_000_000, 0.7f, -1, -1); +// +// ForkJoinPool fjp = new ForkJoinPool(24); +// AtomicInteger docCount = new AtomicInteger(); +// +// for (var domain : plan.domainsIterable()) { // leaks file descriptor, is fine +// +// if (domain.doc == null) +// continue; +// +// fjp.execute(() -> { +// +// TLongHashSet words = new TLongHashSet(10_000); +// +// for (var doc : domain.doc) { +// +// if (doc.documentBody == null) +// continue; +// docCount.incrementAndGet(); +// +// Document parsed = Jsoup.parse(doc.documentBody.decode()); +// parsed.body().filter(new DomPruningFilter(0.5)); +// +// DocumentLanguageData dld = se.get().extractSentences(parsed); +// +// if (lf.dictionaryAgreement(dld) < 0.1) { +// return; +// } +// +// for (var sent : dld.sentences) { +// for (var word : sent) { +// words.add(longHash(word.stemmed().getBytes(StandardCharsets.UTF_8))); +// } +// } +// +// synchronized (counts) { +// words.forEach(w -> { +// counts.adjustOrPutValue(w, 1, 1); +// return true; +// }); +// } +// +// words.clear(); +// } +// +// System.out.println(domain.domain + "\t" + counts.size()); +// }); +// +// +// } +// +// fjp.shutdown(); +// fjp.awaitTermination(10, TimeUnit.DAYS); +// +// try (var dos = new DataOutputStream(Files.newOutputStream(Path.of(outFile)))) { +// synchronized (counts) { +// counts.put(DOC_COUNT_KEY, docCount.get()); +// +// counts.forEachEntry((hash, cnt) -> { +// try { +// dos.writeLong(hash); +// dos.writeLong(cnt); +// } catch (IOException e) { +// throw new RuntimeException(e); +// } +// return true; +// 
}); +// } +// } +// +// System.out.println(docCount.get()); +// } + + public static long getStringHash(String s) { + String[] strings = separator.split(s); + if (s.length() > 1) { + byte[][] parts = new byte[strings.length][]; + for (int i = 0; i < parts.length; i++) { + parts[i] = ps.stemWord(strings[i]).getBytes(); + } + return longHash(parts); + } + else { + return longHash(s.getBytes()); + } + } + public long getTermFreqHash(long hash) { + return wordRates.get(hash); + } + public long getTermFreq(String s) { + return wordRates.get(getStringHash(s)); + } + public long getTermFreqStemmed(String s) { + return wordRates.get(longHash(s.getBytes())); + } + + public static String getStemmedString(String s) { + String[] strings = separator.split(s); + if (s.length() > 1) { + return Arrays.stream(strings).map(ps::stemWord).collect(Collectors.joining("_")); + } + else { + return s; + } + + } + + public static long longHash(byte[]... bytesSets) { + if (bytesSets == null || bytesSets.length == 0) + return 0; + + // https://cp-algorithms.com/string/string-hashing.html + int p = 127; + long m = (1L<<61)-1; + long p_power = 1; + long hash_val = 0; + + for (byte[] bytes: bytesSets) { + for (byte element : bytes) { + hash_val = (hash_val + (element + 1) * p_power) % m; + p_power = (p_power * p) % m; + } + } + return hash_val; + } + +} diff --git a/marginalia_nu/src/main/resources/dictionary/en-1000 b/libraries/language-processing/src/main/resources/dictionary/en-1000 similarity index 100% rename from marginalia_nu/src/main/resources/dictionary/en-1000 rename to libraries/language-processing/src/main/resources/dictionary/en-1000 diff --git a/marginalia_nu/src/main/resources/dictionary/en-stopwords b/libraries/language-processing/src/main/resources/dictionary/en-stopwords similarity index 100% rename from marginalia_nu/src/main/resources/dictionary/en-stopwords rename to libraries/language-processing/src/main/resources/dictionary/en-stopwords diff --git 
a/marginalia_nu/src/main/resources/dictionary/en-words b/libraries/language-processing/src/main/resources/dictionary/en-words similarity index 100% rename from marginalia_nu/src/main/resources/dictionary/en-words rename to libraries/language-processing/src/main/resources/dictionary/en-words diff --git a/marginalia_nu/src/main/resources/dictionary/latin-1000 b/libraries/language-processing/src/main/resources/dictionary/latin-1000 similarity index 100% rename from marginalia_nu/src/main/resources/dictionary/latin-1000 rename to libraries/language-processing/src/main/resources/dictionary/latin-1000 diff --git a/marginalia_nu/src/main/resources/dictionary/swe-1000 b/libraries/language-processing/src/main/resources/dictionary/swe-1000 similarity index 100% rename from marginalia_nu/src/main/resources/dictionary/swe-1000 rename to libraries/language-processing/src/main/resources/dictionary/swe-1000 diff --git a/marginalia_nu/src/main/resources/dictionary/word-frequency b/libraries/language-processing/src/main/resources/dictionary/word-frequency similarity index 100% rename from marginalia_nu/src/main/resources/dictionary/word-frequency rename to libraries/language-processing/src/main/resources/dictionary/word-frequency diff --git a/libraries/language-processing/src/test/java/nu/marginalia/language/encoding/AsciiFlattenerTest.java b/libraries/language-processing/src/test/java/nu/marginalia/language/encoding/AsciiFlattenerTest.java new file mode 100644 index 00000000..afa65a36 --- /dev/null +++ b/libraries/language-processing/src/test/java/nu/marginalia/language/encoding/AsciiFlattenerTest.java @@ -0,0 +1,38 @@ +package nu.marginalia.language.encoding; + +import org.junit.jupiter.api.Test; + +import static org.junit.jupiter.api.Assertions.*; + +class AsciiFlattenerTest { + + @Test + void flattenUnicodePlainAscii() { + String s = "abc"; + + // If the string is ascii, we don't want to allocate a copy + + assertSame(s, AsciiFlattener.flattenUnicode(s)); + } + + @Test + void 
flattenUnicode() { + String s = "Stülpnagelstraße"; + + assertEquals("Stulpnagelstrasse", AsciiFlattener.flattenUnicode(s)); + } + + @Test + void flattenUnicode2() { + String s = "Koncevičius"; + + assertEquals("Koncevicius", AsciiFlattener.flattenUnicode(s)); + } + + @Test + void omitNonFlattenable() { + String s = "[アグレッシブ烈子]"; + + assertEquals("[]", AsciiFlattener.flattenUnicode(s)); + } +} \ No newline at end of file diff --git a/libraries/misc/build.gradle b/libraries/misc/build.gradle new file mode 100644 index 00000000..366df5c0 --- /dev/null +++ b/libraries/misc/build.gradle @@ -0,0 +1,30 @@ +plugins { + id 'java' +} + +java { + toolchain { + languageVersion.set(JavaLanguageVersion.of(17)) + } +} + +dependencies { + implementation project(':third-party') + + + implementation libs.lombok + annotationProcessor libs.lombok + implementation libs.bundles.slf4j + + implementation libs.notnull + implementation libs.lz4 + implementation libs.fastutil + + testImplementation libs.bundles.slf4j.test + testImplementation libs.bundles.junit + testImplementation libs.mockito +} + +test { + useJUnitPlatform() +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/bigstring/BigString.java b/libraries/misc/src/main/java/nu/marginalia/bigstring/BigString.java similarity index 89% rename from marginalia_nu/src/main/java/nu/marginalia/util/bigstring/BigString.java rename to libraries/misc/src/main/java/nu/marginalia/bigstring/BigString.java index 48c4c053..5bf88180 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/bigstring/BigString.java +++ b/libraries/misc/src/main/java/nu/marginalia/bigstring/BigString.java @@ -1,4 +1,4 @@ -package nu.marginalia.util.bigstring; +package nu.marginalia.bigstring; public interface BigString { static BigString encode(String stringValue) { diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/bigstring/CompressedBigString.java b/libraries/misc/src/main/java/nu/marginalia/bigstring/CompressedBigString.java similarity 
index 96% rename from marginalia_nu/src/main/java/nu/marginalia/util/bigstring/CompressedBigString.java rename to libraries/misc/src/main/java/nu/marginalia/bigstring/CompressedBigString.java index 1b84e576..dabd84a9 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/bigstring/CompressedBigString.java +++ b/libraries/misc/src/main/java/nu/marginalia/bigstring/CompressedBigString.java @@ -1,4 +1,4 @@ -package nu.marginalia.util.bigstring; +package nu.marginalia.bigstring; import net.jpountz.lz4.LZ4Compressor; import net.jpountz.lz4.LZ4Factory; diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/bigstring/PlainBigString.java b/libraries/misc/src/main/java/nu/marginalia/bigstring/PlainBigString.java similarity index 92% rename from marginalia_nu/src/main/java/nu/marginalia/util/bigstring/PlainBigString.java rename to libraries/misc/src/main/java/nu/marginalia/bigstring/PlainBigString.java index 5af3a5c8..a90fae67 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/bigstring/PlainBigString.java +++ b/libraries/misc/src/main/java/nu/marginalia/bigstring/PlainBigString.java @@ -1,4 +1,4 @@ -package nu.marginalia.util.bigstring; +package nu.marginalia.bigstring; import java.nio.charset.StandardCharsets; diff --git a/libraries/misc/src/main/java/nu/marginalia/dict/DictionaryData.java b/libraries/misc/src/main/java/nu/marginalia/dict/DictionaryData.java new file mode 100644 index 00000000..830ed4a7 --- /dev/null +++ b/libraries/misc/src/main/java/nu/marginalia/dict/DictionaryData.java @@ -0,0 +1,39 @@ +package nu.marginalia.dict; + +import java.util.ArrayList; + +public class DictionaryData { + private final int bankSize; + + private final ArrayList banks = new ArrayList<>(100); + + public DictionaryData(int bankSize) { + this.bankSize = bankSize; + + banks.add(new DictionaryDataBank(0, bankSize)); + } + + public int add(long key) { + var activeBank = banks.get(banks.size()-1); + int rb = activeBank.add(key); + + if (rb == -1) { + int end = 
activeBank.getEnd(); + var newBank = new DictionaryDataBank(end, bankSize); + rb = newBank.add(key); + + banks.add(newBank); + } + + return rb; + } + + + public long getKey(int offset) { + return banks.get(offset/ bankSize).getKey(offset); + } + public boolean keyEquals(int offset, long otherKey) { + return banks.get(offset/ bankSize).keyEquals(offset, otherKey); + } + +} diff --git a/libraries/misc/src/main/java/nu/marginalia/dict/DictionaryDataBank.java b/libraries/misc/src/main/java/nu/marginalia/dict/DictionaryDataBank.java new file mode 100644 index 00000000..75798dcb --- /dev/null +++ b/libraries/misc/src/main/java/nu/marginalia/dict/DictionaryDataBank.java @@ -0,0 +1,63 @@ +package nu.marginalia.dict; + +import java.nio.ByteBuffer; +import java.nio.LongBuffer; + +class DictionaryDataBank { + + private final int start_idx; + + // Humongous long-lived arrays seem to sometimes yield considerable memory overhead and + // can make the GC behave poorly. Using off-heap memory seems preferred when their + // lifetime is "forever" + + private final LongBuffer keys; + + private int size; + private final int capacity; + + + public DictionaryDataBank(int start_idx, int sz) { + this.start_idx = start_idx; + this.capacity = sz; + + keys = ByteBuffer.allocateDirect(8 * capacity).asLongBuffer(); + size = 0; + } + + public int getStart() { + return start_idx; + } + + public int getEnd() { + return start_idx + size; + } + + public long getKey(int idx) { + if (idx < start_idx || idx - start_idx >= size) { + throw new IndexOutOfBoundsException(idx); + } + return keys.get(idx - start_idx); + } + + public boolean keyEquals(int idx, long other) { + if (idx < start_idx || idx - start_idx >= size) { + throw new IndexOutOfBoundsException(idx); + } + + return keys.get(idx - start_idx) == other; + } + + public int add(long newKey) { + if (size >= capacity) + return -1; + + keys.put(size, newKey); + + return start_idx + size++; + } + + public int getSize() { + return size; + } +} diff 
--git a/marginalia_nu/src/main/java/nu/marginalia/util/dict/DictionaryMap.java b/libraries/misc/src/main/java/nu/marginalia/dict/DictionaryMap.java similarity index 80% rename from marginalia_nu/src/main/java/nu/marginalia/util/dict/DictionaryMap.java rename to libraries/misc/src/main/java/nu/marginalia/dict/DictionaryMap.java index fb13893e..64bd8030 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/dict/DictionaryMap.java +++ b/libraries/misc/src/main/java/nu/marginalia/dict/DictionaryMap.java @@ -1,10 +1,10 @@ -package nu.marginalia.util.dict; +package nu.marginalia.dict; public interface DictionaryMap { int NO_VALUE = Integer.MIN_VALUE; static DictionaryMap create() { - if (Boolean.getBoolean("small-ram")) { + if (!Boolean.getBoolean("large-ram")) { return new OnHeapDictionaryMap(); } else { diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/dict/OffHeapDictionaryHashMap.java b/libraries/misc/src/main/java/nu/marginalia/dict/OffHeapDictionaryHashMap.java similarity index 76% rename from marginalia_nu/src/main/java/nu/marginalia/util/dict/OffHeapDictionaryHashMap.java rename to libraries/misc/src/main/java/nu/marginalia/dict/OffHeapDictionaryHashMap.java index f906c45a..781c3b5c 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/dict/OffHeapDictionaryHashMap.java +++ b/libraries/misc/src/main/java/nu/marginalia/dict/OffHeapDictionaryHashMap.java @@ -1,30 +1,22 @@ -package nu.marginalia.util.dict; +package nu.marginalia.dict; -import io.prometheus.client.Gauge; import nu.marginalia.util.PrimeUtil; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; import java.nio.ByteBuffer; import java.nio.IntBuffer; import java.util.concurrent.atomic.AtomicInteger; import static java.lang.Math.round; -import static nu.marginalia.util.FileSizeUtil.readableSize; /** * Spiritually influenced by GNU Trove's hash maps * LGPL 2.1 */ public class OffHeapDictionaryHashMap implements DictionaryMap { - private static final Logger logger = 
LoggerFactory.getLogger(OffHeapDictionaryHashMap.class); - private static final Gauge probe_count_metrics - = Gauge.build("wmsa_dictionary_hash_map_probe_count", "Probing Count") - .register(); private final int bufferCount; - private final IntBuffer[] buffers; + private final IntBuffer[] buffers; private final DictionaryData dictionaryData; private final long hashTableSize; @@ -47,22 +39,8 @@ public class OffHeapDictionaryHashMap implements DictionaryMap { bufferSizeBytes = intSize*intsPerBuffer; maxProbeLength = sizeMemory/10; - logger.info("Allocating dictionary hash map of size {}, capacity: {}", - readableSize((long) bufferCount * bufferSizeBytes), - hashTableSize); - - logger.info("available-size:{} memory-size:{} buffer-count: {}, buffer-size:{} ints-per-buffer:{} max-probe-length:{}", - hashTableSize, sizeMemory, bufferCount, bufferSizeBytes, intsPerBuffer, maxProbeLength); - if (((long) bufferCount * intsPerBuffer) < sizeMemory) { - logger.error("Buffer memory is less than requested memory: {}*{} = {} < {}; this data structure is not safe to use", - bufferCount, - bufferSizeBytes, (long) bufferCount * bufferSizeBytes, - sizeMemory); - throw new Error("Irrecoverable logic error"); - } - else { - logger.debug("Buffer size sanity checked passed"); + throw new Error("Buffer memory is less than requested memory; this data structure is not safe to use"); } dictionaryData = new DictionaryData((int)Math.min(1<<27, Math.max(32L, sizeMemory/4))); @@ -124,8 +102,6 @@ public class OffHeapDictionaryHashMap implements DictionaryMap { final int val = getCell(idx); if (val == NO_VALUE) { - probe_count_metrics.set(j); - return setValue(key, idx); } else if (dictionaryData.keyEquals(val, key)) { diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/dict/OnHeapDictionaryMap.java b/libraries/misc/src/main/java/nu/marginalia/dict/OnHeapDictionaryMap.java similarity index 93% rename from marginalia_nu/src/main/java/nu/marginalia/util/dict/OnHeapDictionaryMap.java rename 
to libraries/misc/src/main/java/nu/marginalia/dict/OnHeapDictionaryMap.java index a9f4063f..067c2cdc 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/dict/OnHeapDictionaryMap.java +++ b/libraries/misc/src/main/java/nu/marginalia/dict/OnHeapDictionaryMap.java @@ -1,4 +1,4 @@ -package nu.marginalia.util.dict; +package nu.marginalia.dict; import it.unimi.dsi.fastutil.longs.Long2IntOpenHashMap; diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/gregex/GuardedRegex.java b/libraries/misc/src/main/java/nu/marginalia/gregex/GuardedRegex.java similarity index 73% rename from marginalia_nu/src/main/java/nu/marginalia/util/gregex/GuardedRegex.java rename to libraries/misc/src/main/java/nu/marginalia/gregex/GuardedRegex.java index 7ba29096..85353f0c 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/gregex/GuardedRegex.java +++ b/libraries/misc/src/main/java/nu/marginalia/gregex/GuardedRegex.java @@ -1,4 +1,4 @@ -package nu.marginalia.util.gregex; +package nu.marginalia.gregex; import java.util.function.Predicate; diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/gregex/GuardedRegexFactory.java b/libraries/misc/src/main/java/nu/marginalia/gregex/GuardedRegexFactory.java similarity index 98% rename from marginalia_nu/src/main/java/nu/marginalia/util/gregex/GuardedRegexFactory.java rename to libraries/misc/src/main/java/nu/marginalia/gregex/GuardedRegexFactory.java index 800fc621..50131bf7 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/gregex/GuardedRegexFactory.java +++ b/libraries/misc/src/main/java/nu/marginalia/gregex/GuardedRegexFactory.java @@ -1,4 +1,4 @@ -package nu.marginalia.util.gregex; +package nu.marginalia.gregex; import org.intellij.lang.annotations.Language; diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/EasyLSH.java b/libraries/misc/src/main/java/nu/marginalia/lsh/EasyLSH.java similarity index 81% rename from marginalia_nu/src/main/java/nu/marginalia/util/EasyLSH.java rename to 
libraries/misc/src/main/java/nu/marginalia/lsh/EasyLSH.java index 8b709da0..5b168f07 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/EasyLSH.java +++ b/libraries/misc/src/main/java/nu/marginalia/lsh/EasyLSH.java @@ -1,7 +1,4 @@ -package nu.marginalia.util; - -import java.util.List; -import java.util.Set; +package nu.marginalia.lsh; /** This is a very simple locality sensitive hash for collections of Java objects. *

@@ -36,30 +33,32 @@ public class EasyLSH { } public void addHashOrdered(int hashCode) { - hashCode = shingleHash(hashCode); addHashUnordered(shingleHash(hashCode)); } public void addHashUnordered(int hashCode) { - int value = 1-(hashCode & 2); + int value = 1- (hashCode & 2); // Try to extract all the remaining entropy // into selecting the field to update - int field = 63 & (((hashCode >>> 2) - ^ (hashCode >>> 10) - ^ (hashCode >>> 18) - ^ (hashCode >>> 26))); + int field = (hashCode >> 2) + ^ (hashCode >>> 8) + ^ (hashCode >>> 14) + ^ (hashCode >>> 20) + ^ (hashCode >>> 26); - fields[field] += value; + fields[field & 63] += value; } private int shingleHash(int nextHash) { prevHashes[prevHashIdx++ & (SHINGLING-1)] = nextHash; + int ret = 0; for (int hashPart : prevHashes) { - ret ^= hashPart; + ret = hashPart ^ ret; } + return ret; } @@ -73,10 +72,6 @@ public class EasyLSH { return val; } - public int hammingDistance(EasyLSH other) { - return hammingDistance(this, other); - } - public static int hammingDistance(long a, long b) { return Long.bitCount(a^b); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/FileSizeUtil.java b/libraries/misc/src/main/java/nu/marginalia/util/FileSizeUtil.java similarity index 100% rename from marginalia_nu/src/main/java/nu/marginalia/util/FileSizeUtil.java rename to libraries/misc/src/main/java/nu/marginalia/util/FileSizeUtil.java diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/PrimeUtil.java b/libraries/misc/src/main/java/nu/marginalia/util/PrimeUtil.java similarity index 100% rename from marginalia_nu/src/main/java/nu/marginalia/util/PrimeUtil.java rename to libraries/misc/src/main/java/nu/marginalia/util/PrimeUtil.java diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/RandomWriteFunnel.java b/libraries/misc/src/main/java/nu/marginalia/util/RandomWriteFunnel.java similarity index 96% rename from marginalia_nu/src/main/java/nu/marginalia/util/RandomWriteFunnel.java rename to 
libraries/misc/src/main/java/nu/marginalia/util/RandomWriteFunnel.java index 4e21c76e..6c0b03b0 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/RandomWriteFunnel.java +++ b/libraries/misc/src/main/java/nu/marginalia/util/RandomWriteFunnel.java @@ -1,6 +1,5 @@ package nu.marginalia.util; -import lombok.SneakyThrows; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -34,8 +33,7 @@ public class RandomWriteFunnel implements AutoCloseable { bins = new ArrayList<>(); } - @SneakyThrows - public void put(long address, long data) { + public void put(long address, long data) throws IOException { int bin = (int)(address / binSize); int offset = (int)(address%binSize); @@ -46,8 +44,7 @@ public class RandomWriteFunnel implements AutoCloseable { bins.get(bin).put(offset, data); } - @SneakyThrows - private void grow(int bin) { + private void grow(int bin) throws IOException { while (bins.size() <= bin) { bins.add(new DataBin(tempDir, binSize)); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/TransformList.java b/libraries/misc/src/main/java/nu/marginalia/util/TransformList.java similarity index 79% rename from marginalia_nu/src/main/java/nu/marginalia/util/TransformList.java rename to libraries/misc/src/main/java/nu/marginalia/util/TransformList.java index f8aac39e..352b39cb 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/TransformList.java +++ b/libraries/misc/src/main/java/nu/marginalia/util/TransformList.java @@ -5,6 +5,31 @@ import java.util.function.BiConsumer; import java.util.function.Consumer; import java.util.function.Predicate; +/** Recursive descent-style parser utility. + *

+ * Allows readable and easy mutation of a list. + *

+ * Examples: + * + *
+ *     tl.transformEach(entity -> {
+ *         if (foo(entity.value()))
+ *           entity.remove();
+ *         if (bar(entity.value()))
+ *           entity.replace(new Bar(10));
+ *     });
+ *
+ *     tl.transformEachPair((a,b) -> {
+ *          if ("-".equals(a.value().str) && Number.equals(b.value().type)) {
+ *              a.remove();
+ *              b.replace(new Number(-b.value().val));
+ *          }
+ *     });
+ *
+ *     list.scanAndTransform(TokenType.LPAREN, TokenType.RPAREN, itemsBetween -> { });
+ * 
+ *
+ */ public class TransformList { private final List backingList; @@ -85,12 +110,18 @@ public class TransformList { } + /** Represents a mutable item in the transform list */ public class Entity { - public T value; + private T value; private Action action; Entity(T value) { this.value = value; + this.action = Action.NO_OP; + } + + public T value() { + return value; } public void replace(T newValue) { diff --git a/marginalia_nu/src/test/java/nu/marginalia/util/EasyLSHTest.java b/libraries/misc/src/test/java/nu/marginalia/lsh/EasyLSHTest.java similarity index 93% rename from marginalia_nu/src/test/java/nu/marginalia/util/EasyLSHTest.java rename to libraries/misc/src/test/java/nu/marginalia/lsh/EasyLSHTest.java index e8c3f147..e86425f0 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/util/EasyLSHTest.java +++ b/libraries/misc/src/test/java/nu/marginalia/lsh/EasyLSHTest.java @@ -1,5 +1,6 @@ -package nu.marginalia.util; +package nu.marginalia.lsh; +import nu.marginalia.lsh.EasyLSH; import org.junit.jupiter.api.Test; import java.util.Arrays; @@ -52,14 +53,14 @@ class EasyLSHTest { """; EasyLSH hashA = new EasyLSH(); - Arrays.stream(sA.split("\\s")).forEach(hashA::addOrdered); + Arrays.stream(sA.split("\\s+")).forEach(hashA::addOrdered); EasyLSH hashB = new EasyLSH(); - Arrays.stream(sB.split("\\s")).forEach(hashB::addOrdered); + Arrays.stream(sB.split("\\s+")).forEach(hashB::addOrdered); EasyLSH hashC = new EasyLSH(); - Arrays.stream(sC.split("\\s")).forEach(hashC::addOrdered); + Arrays.stream(sC.split("\\s+")).forEach(hashC::addOrdered); EasyLSH hashD = new EasyLSH(); - Arrays.stream(sD.split("\\s")).forEach(hashD::addOrdered); + Arrays.stream(sD.split("\\s+")).forEach(hashD::addOrdered); System.out.println(Long.toBinaryString(hashA.get())); System.out.println(Long.toBinaryString(hashB.get())); diff --git a/libraries/misc/src/test/java/nu/marginalia/test/TestUtil.java b/libraries/misc/src/test/java/nu/marginalia/test/TestUtil.java new file mode 100644 index 
00000000..44a489bb --- /dev/null +++ b/libraries/misc/src/test/java/nu/marginalia/test/TestUtil.java @@ -0,0 +1,50 @@ +package nu.marginalia.test; + + +import java.io.File; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.Arrays; + +public class TestUtil { + private static boolean isTempDir(Path dir) { + return dir.startsWith("/tmp") || dir.toString().contains("tmp"); + } + + public static void clearTempDir(Path dir) { + if (!isTempDir(dir)) { + throw new IllegalArgumentException("Refusing to recursively delete directory with that name"); + } + if (Files.isDirectory(dir)) { + for (File f : dir.toFile().listFiles()) { + File[] files = f.listFiles(); + if (files != null) { + Arrays.stream(files).map(File::toPath).forEach(TestUtil::clearTempDir); + } + System.out.println("Deleting " + f + " (" + fileSize(f.toPath()) + ")"); + f.delete(); + } + } + System.out.println("Deleting " + dir); + dir.toFile().delete(); + } + + private static String fileSize(Path path) { + try { + long sizeBytes = Files.size(path); + + if (sizeBytes > 1024 * 1024 * 1024) return round(sizeBytes / 1073741824.) + "Gb"; + if (sizeBytes > 1024 * 1024) return round(sizeBytes / 1048576.) + "Mb"; + if (sizeBytes > 1024) return round(sizeBytes / 1024.) 
+ "Kb"; + return sizeBytes + "b"; + } + catch (IOException ex) { + throw new RuntimeException(ex); + } + } + + private static String round(double d) { + return String.format("%.2f", d); + } +} diff --git a/marginalia_nu/src/test/java/nu/marginalia/util/TransformListTest.java b/libraries/misc/src/test/java/nu/marginalia/util/TransformListTest.java similarity index 81% rename from marginalia_nu/src/test/java/nu/marginalia/util/TransformListTest.java rename to libraries/misc/src/test/java/nu/marginalia/util/TransformListTest.java index 15b1ccde..2a9ea325 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/util/TransformListTest.java +++ b/libraries/misc/src/test/java/nu/marginalia/util/TransformListTest.java @@ -15,7 +15,7 @@ class TransformListTest { List values = Stream.of(1,2,3,4).collect(Collectors.toList()); new TransformList<>(values).transformEach(e -> { - int v = e.value; + int v = e.value(); if (v == 1) e.remove(); if (v == 2) e.replace(5); if (v == 4) e.remove(); @@ -28,11 +28,11 @@ class TransformListTest { void transformEachPairRemoveReplace() { List values = Stream.of(1,2,3,4,5,6).collect(Collectors.toList()); new TransformList<>(values).transformEachPair((a,b) -> { - System.out.println(a.value + ":" + b.value); - int v = a.value; + System.out.println(a.value() + ":" + b.value()); + int v = a.value(); if (v == 1 || v == 3 || v == 5) { a.remove(); - b.replace(-b.value); + b.replace(-b.value()); } }); @@ -44,8 +44,8 @@ class TransformListTest { void transformEachPairRemoveRemove() { List values = Stream.of(1,2,3,4,5,6).collect(Collectors.toList()); new TransformList<>(values).transformEachPair((a,b) -> { - System.out.println(a.value + ":" + b.value); - int v = a.value; + System.out.println(a.value() + ":" + b.value()); + int v = a.value(); if (v == 1 || v == 3 || v == 5) { a.remove(); b.remove(); @@ -60,10 +60,10 @@ class TransformListTest { void transformEachPairReplaceRemove() { List values = Stream.of(1,2,3,4,5,6).collect(Collectors.toList()); new 
TransformList<>(values).transformEachPair((a,b) -> { - System.out.println(a.value + ":" + b.value); - int v = a.value; + System.out.println(a.value() + ":" + b.value()); + int v = a.value(); if (v == 1 || v == 3 || v == 5) { - a.replace(-a.value); + a.replace(-a.value()); b.remove(); } @@ -76,11 +76,11 @@ class TransformListTest { void transformEachPairReplaceReplace() { List values = Stream.of(1,2,3,4,5,6).collect(Collectors.toList()); new TransformList<>(values).transformEachPair((a,b) -> { - System.out.println(a.value + ":" + b.value); - int v = a.value; + System.out.println(a.value() + ":" + b.value()); + int v = a.value(); if (v == 1 || v == 3 || v == 5) { - a.replace(-a.value); - b.replace(-b.value); + a.replace(-a.value()); + b.replace(-b.value()); } }); @@ -92,7 +92,7 @@ class TransformListTest { void scanAndTransform() { List values = Stream.of(1,2,3,4,5,6,7,8,9,10).collect(Collectors.toList()); new TransformList<>(values).scanAndTransform(Integer.valueOf(3)::equals, Integer.valueOf(7)::equals, entity -> { - entity.replace(entity.value * 2); + entity.replace(entity.value() * 2); }); assertEquals(List.of(1,2,6,8,10,12,14,8,9,10), values); @@ -102,7 +102,7 @@ class TransformListTest { void scanAndTransformEndsAtEnd() { List values = Stream.of(1,2,3,4,5,6,7,8,9,10).collect(Collectors.toList()); new TransformList<>(values).scanAndTransform(Integer.valueOf(3)::equals, Integer.valueOf(10)::equals, entity -> { - entity.replace(entity.value * 2); + entity.replace(entity.value() * 2); }); assertEquals(List.of(1,2,6,8,10,12,14,16,18,20), values); @@ -112,7 +112,7 @@ class TransformListTest { void scanAndTransformOverlap() { List values = Stream.of(1,2,3,3,5,7,7,8,9,10).collect(Collectors.toList()); new TransformList<>(values).scanAndTransform(Integer.valueOf(3)::equals, Integer.valueOf(7)::equals, entity -> { - entity.replace(entity.value * 2); + entity.replace(entity.value() * 2); }); assertEquals(List.of(1, 2, 6, 6, 10, 14, 7, 8, 9, 10), values); diff --git 
a/libraries/readme.md b/libraries/readme.md new file mode 100644 index 00000000..1179db65 --- /dev/null +++ b/libraries/readme.md @@ -0,0 +1,9 @@ +# Libraries + +These are libraries that are not strongly coupled to the search engine. + +* The [array](array/) library is for memory mapping large memory-areas, which Java has +bad support for. It's designed to be able to easily replaced when *Java's Foreign Function And Memory API* is released. +* The [btree](btree/) library offers a static BTree implementation based on the array library. +* [language-processing](language-processing/) contains primitives for sentence extraction and POS-tagging. +* [misc](misc/) is just random bits and bobs that didn't fit anywhere. \ No newline at end of file diff --git a/marginalia_nu/data/.gitignore b/marginalia_nu/data/.gitignore deleted file mode 100644 index f59ec20a..00000000 --- a/marginalia_nu/data/.gitignore +++ /dev/null @@ -1 +0,0 @@ -* \ No newline at end of file diff --git a/marginalia_nu/data/models/.gitignore b/marginalia_nu/data/models/.gitignore deleted file mode 100644 index f59ec20a..00000000 --- a/marginalia_nu/data/models/.gitignore +++ /dev/null @@ -1 +0,0 @@ -* \ No newline at end of file diff --git a/marginalia_nu/data/test/.gitignore b/marginalia_nu/data/test/.gitignore deleted file mode 100644 index f59ec20a..00000000 --- a/marginalia_nu/data/test/.gitignore +++ /dev/null @@ -1 +0,0 @@ -* \ No newline at end of file diff --git a/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/E2ETestBase.java b/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/E2ETestBase.java deleted file mode 100644 index da40a7fc..00000000 --- a/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/E2ETestBase.java +++ /dev/null @@ -1,86 +0,0 @@ -package nu.marginalia.wmsa.edge; - -import nu.marginalia.wmsa.configuration.ServiceDescriptor; -import org.slf4j.LoggerFactory; -import org.testcontainers.containers.BindMode; -import org.testcontainers.containers.GenericContainer; -import 
org.testcontainers.containers.MariaDBContainer; -import org.testcontainers.containers.Network; -import org.testcontainers.containers.output.Slf4jLogConsumer; -import org.testcontainers.containers.wait.strategy.Wait; -import org.testcontainers.utility.MountableFile; - -import java.nio.file.Files; -import java.nio.file.Path; -import java.time.Duration; - -public abstract class E2ETestBase { - public static Network network = Network.newNetwork(); - - public static MariaDBContainer getMariaDBContainer() { - return new MariaDBContainer<>("mariadb") - .withDatabaseName("WMSA_prod") - .withUsername("wmsa") - .withPassword("wmsa") - .withInitScript("sql/edge-crawler-cache.sql") - .withNetwork(network) - .withNetworkAliases("mariadb"); - } - - public static GenericContainer forService(ServiceDescriptor service, GenericContainer mariaDB) { - return new GenericContainer<>("openjdk:17-alpine") - .dependsOn(mariaDB) - .withCopyFileToContainer(jarFile(), "/WMSA.jar") - .withCopyFileToContainer(MountableFile.forClasspathResource("init.sh"), "/init.sh") - .withExposedPorts(service.port) - .withFileSystemBind(modelsPath(), "/wmsa/model", BindMode.READ_ONLY) - .withNetwork(network) - .withNetworkAliases(service.name) - .withLogConsumer(new Slf4jLogConsumer(LoggerFactory.getLogger(service.name))) - .withCommand("sh", "init.sh", service.name) - .waitingFor(Wait.forHttp("/internal/ping") - .forPort(service.port) - .withReadTimeout(Duration.ofSeconds(15))) - ; - } - public static GenericContainer forService(ServiceDescriptor service, GenericContainer mariaDB, String setupScript) { - return new GenericContainer<>("openjdk:17-alpine") - .dependsOn(mariaDB) - .withCopyFileToContainer(jarFile(), "/WMSA.jar") - .withCopyFileToContainer(MountableFile.forClasspathResource(setupScript), "/" + setupScript) - .withExposedPorts(service.port) - .withFileSystemBind(modelsPath(), "/wmsa/model", BindMode.READ_ONLY) - .withNetwork(network) - .withNetworkAliases(service.name) - .withLogConsumer(new 
Slf4jLogConsumer(LoggerFactory.getLogger(service.name))) - .withCommand("sh", setupScript, service.name) - .waitingFor(Wait.forHttp("/internal/ping") - .forPort(service.port) - .withReadTimeout(Duration.ofSeconds(15))) - ; - } - - public static MountableFile jarFile() { - Path cwd = Path.of(System.getProperty("user.dir")); - - cwd = cwd.resolve(".."); - var jarFile = cwd.resolve("build/libs/wmsa-SNAPSHOT-all.jar"); - if (!Files.exists(jarFile)) { - System.err.println("Could not find jarFile " + jarFile); - throw new RuntimeException(); - } - else { - System.out.println("jar file = " + jarFile); - } - return MountableFile.forHostPath(jarFile); - } - - public static String modelsPath() { - Path modelsPath = Path.of(System.getProperty("user.dir")).resolve("data/models"); - if (!Files.isDirectory(modelsPath)) { - System.err.println("Could not find models, looked in " + modelsPath.toAbsolutePath()); - throw new RuntimeException(); - } - return modelsPath.toString(); - } -} diff --git a/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/EdgeCrawlBehaviorE2ETest.java b/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/EdgeCrawlBehaviorE2ETest.java deleted file mode 100644 index f8325d9d..00000000 --- a/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/EdgeCrawlBehaviorE2ETest.java +++ /dev/null @@ -1,105 +0,0 @@ -package nu.marginalia.wmsa.edge; - - -import nu.marginalia.wmsa.edge.crawling.CrawlJobExtractorMain; -import nu.marginalia.wmsa.edge.crawling.model.CrawlingSpecification; -import org.junit.jupiter.api.Tag; -import org.junit.jupiter.api.Test; -import org.slf4j.LoggerFactory; -import org.testcontainers.containers.BindMode; -import org.testcontainers.containers.GenericContainer; -import org.testcontainers.containers.output.Slf4jLogConsumer; -import org.testcontainers.containers.wait.strategy.Wait; -import org.testcontainers.junit.jupiter.Container; -import org.testcontainers.junit.jupiter.Testcontainers; -import org.testcontainers.utility.MountableFile; - -import 
java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; -import java.time.Duration; -import java.util.ArrayList; -import java.util.List; - -@Tag("e2e") -@Testcontainers -public class EdgeCrawlBehaviorE2ETest extends E2ETestBase { - @Container - public static GenericContainer mockContainer = new GenericContainer<>("openjdk:17-alpine") - .withCopyFileToContainer(jarFile(), "/WMSA.jar") - .withNetwork(network) - .withNetworkAliases("mock", "mock2") - .withExposedPorts(8080) - .withLogConsumer(new Slf4jLogConsumer(LoggerFactory.getLogger("mock"))) - .withCommand("java","-cp","WMSA.jar","nu.marginalia.wmsa.edge.crawling.CrawlerTestMain") - ; - - - @Container - public static GenericContainer crawlerContainer = new GenericContainer<>("openjdk:17-alpine") - .dependsOn(mockContainer) - .withNetwork(network) - .withLogConsumer(new Slf4jLogConsumer(LoggerFactory.getLogger("crawler"))) - .withFileSystemBind(modelsPath(), "/var/lib/wmsa/model", BindMode.READ_ONLY) - .withCopyFileToContainer(ipDatabasePath(), "/var/lib/wmsa/data/IP2LOCATION-LITE-DB1.CSV") - .withCopyFileToContainer(jarFile(), "/WMSA.jar") - .withCopyFileToContainer(MountableFile.forClasspathResource("crawl-mock.sh"), "/crawl-mock.sh") - .withFileSystemBind(getMockCrawlPath(), "/crawl/", BindMode.READ_WRITE) - .withCommand("sh", "crawl-mock.sh") - .waitingFor(Wait.forLogMessage(".*ALL DONE.*", 1).withStartupTimeout(Duration.ofMinutes(10))); - - - private static String getMockCrawlPath() { - Path crawlFiles = getCrawlPath(); - - - List urls = new ArrayList<>(); - try { - Files.createDirectories(crawlFiles); - - Files.writeString(crawlFiles.resolve("crawl.plan"), """ - jobSpec: "/crawl/crawl.spec" - crawl: - dir: "/crawl/crawl" - logName: "crawl.log" - process: - dir: "/crawl/process" - logName: "process.log" - """); - - Files.createDirectories(crawlFiles.resolve("crawl")); - Files.createDirectories(crawlFiles.resolve("process")); - 
Files.deleteIfExists(crawlFiles.resolve("process").resolve("process.log")); - Files.deleteIfExists(crawlFiles.resolve("crawl").resolve("crawl.log")); - - CrawlJobExtractorMain.writeSpec(crawlFiles.resolve("crawl.spec"), - new CrawlingSpecification("111111", 20, "mock", List.of("http://mock:8080/rate-limit/")), - new CrawlingSpecification("222222", 20, "mock2", List.of("http://mock2:8080/intermittent-error/"))); - } - catch (IOException ex) { - ex.printStackTrace(); - } - return crawlFiles.toString(); - } - - - public static MountableFile ipDatabasePath() { - Path modelsPath = Path.of(System.getProperty("user.dir")).resolve("data/models/IP2LOC/IP2LOCATION-LITE-DB1.CSV"); - if (!Files.isRegularFile(modelsPath)) { - System.err.println("Could not find models, looked in " + modelsPath.toAbsolutePath()); - throw new RuntimeException(); - } - return MountableFile.forHostPath(modelsPath.toString()); - } - - private static Path getCrawlPath() { - return Path.of(System.getProperty("user.dir")).resolve("build/tmp/crawl"); - } - - @Test - public void testRunTheThing() throws IOException { - // This is a test for examining the interaction between the crawler and various - // set-ups - } - -} diff --git a/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/EdgeSearchE2ETest.java b/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/EdgeSearchE2ETest.java deleted file mode 100644 index 1e1fad4b..00000000 --- a/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/EdgeSearchE2ETest.java +++ /dev/null @@ -1,284 +0,0 @@ -package nu.marginalia.wmsa.edge; - - -import nu.marginalia.util.test.TestUtil; -import nu.marginalia.wmsa.edge.crawling.CrawlJobExtractorMain; -import org.jsoup.Jsoup; -import org.junit.jupiter.api.Tag; -import org.junit.jupiter.api.Test; -import org.openqa.selenium.By; -import org.openqa.selenium.OutputType; -import org.openqa.selenium.chrome.ChromeOptions; -import org.openzim.ZIMTypes.ZIMFile; -import org.openzim.ZIMTypes.ZIMReader; -import org.slf4j.LoggerFactory; 
-import org.testcontainers.containers.BindMode; -import org.testcontainers.containers.BrowserWebDriverContainer; -import org.testcontainers.containers.GenericContainer; -import org.testcontainers.containers.NginxContainer; -import org.testcontainers.containers.output.Slf4jLogConsumer; -import org.testcontainers.containers.wait.strategy.Wait; -import org.testcontainers.junit.jupiter.Container; -import org.testcontainers.junit.jupiter.Testcontainers; -import org.testcontainers.utility.MountableFile; - -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; -import java.time.Duration; -import java.time.LocalDateTime; -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; - -import static nu.marginalia.wmsa.configuration.ServiceDescriptor.*; -import static org.junit.jupiter.api.Assertions.assertEquals; - -@Tag("e2e") -@Testcontainers -public class EdgeSearchE2ETest extends E2ETestBase { - @Container - public static GenericContainer mariaDB = getMariaDBContainer(); - - @Container - public static GenericContainer searchContainer = forService(EDGE_SEARCH, mariaDB); - @Container - public static GenericContainer assistantContainer = forService(EDGE_ASSISTANT, mariaDB); - @Container - public static GenericContainer indexContainer = forService(EDGE_INDEX, mariaDB); - - @Container - public static NginxContainer mockWikipedia = new NginxContainer<>("nginx:stable") - .dependsOn(searchContainer) - .withLogConsumer(new Slf4jLogConsumer(LoggerFactory.getLogger("wikipedia"))) - .withFileSystemBind(getWikipediaFiles(), "/usr/share/nginx/html/", BindMode.READ_ONLY) - .withNetwork(network) - .withNetworkAliases("wikipedia.local"); - - - @Container - public static BrowserWebDriverContainer chrome = new BrowserWebDriverContainer<>() - .withNetwork(network) - .withCapabilities(new ChromeOptions()); - - @Container - public static GenericContainer crawlerContainer = new GenericContainer<>("openjdk:17-alpine") - 
.dependsOn(mockWikipedia) - .dependsOn(indexContainer) - .withNetwork(network) - .withLogConsumer(new Slf4jLogConsumer(LoggerFactory.getLogger("crawler"))) - .withFileSystemBind(modelsPath(), "/var/lib/wmsa/model", BindMode.READ_ONLY) - .withCopyFileToContainer(ipDatabasePath(), "/var/lib/wmsa/data/IP2LOCATION-LITE-DB1.CSV") - .withCopyFileToContainer(jarFile(), "/WMSA.jar") - .withCopyFileToContainer(MountableFile.forClasspathResource("crawl.sh"), "/crawl.sh") - .withFileSystemBind(getCrawlPath().toString(), "/crawl/", BindMode.READ_WRITE) - .withCommand("sh", "crawl.sh") - .waitingFor(Wait.forLogMessage(".*ALL DONE.*", 1).withStartupTimeout(Duration.ofMinutes(10))); - - @Container - public static NginxContainer proxyNginx = new NginxContainer<>("nginx:stable") - .dependsOn(searchContainer) - .dependsOn(crawlerContainer) - .withLogConsumer(new Slf4jLogConsumer(LoggerFactory.getLogger("nginx"))) - .withCopyFileToContainer(MountableFile.forClasspathResource("nginx/search.conf"), "/etc/nginx/conf.d/default.conf") - .withNetwork(network) - .withNetworkAliases("proxyNginx"); - - public static MountableFile ipDatabasePath() { - Path modelsPath = Path.of(System.getProperty("user.dir")).resolve("data/models/IP2LOC/IP2LOCATION-LITE-DB1.CSV"); - if (!Files.isRegularFile(modelsPath)) { - System.err.println("Could not find models, looked in " + modelsPath.toAbsolutePath()); - throw new RuntimeException(); - } - return MountableFile.forHostPath(modelsPath.toString()); - } - - private static Path getCrawlPath() { - return Path.of(System.getProperty("user.dir")).resolve("build/tmp/crawl"); - } - - private static Path screenshotFilename(String operation) throws IOException { - var path = Path.of(System.getProperty("user.dir")).resolve("build/test/e2e/"); - Files.createDirectories(path); - - String name = String.format("test-%s-%s.png", operation, LocalDateTime.now()); - path = path.resolve(name); - - System.out.println("Screenshot in " + path); - return path; - } - - private 
static String getWikipediaFiles() { - Path wikipediaFiles = Path.of(System.getProperty("user.dir")).resolve("build/tmp/wikipedia"); - Path crawlFiles = getCrawlPath(); - Path zimFile = Path.of(System.getProperty("user.dir")).resolve("data/test/wikipedia_en_100_nopic.zim"); - - - List urls = new ArrayList<>(); - try { - TestUtil.clearTempDir(wikipediaFiles); - Files.createDirectories(wikipediaFiles); - Files.createDirectories(crawlFiles); - - Files.writeString(crawlFiles.resolve("crawl.plan"), """ - jobSpec: "/crawl/crawl.spec" - crawl: - dir: "/crawl/crawl" - logName: "crawl.log" - process: - dir: "/crawl/process" - logName: "process.log" - """); - - Files.createDirectories(crawlFiles.resolve("crawl")); - Files.createDirectories(crawlFiles.resolve("process")); - Files.deleteIfExists(crawlFiles.resolve("process").resolve("process.log")); - Files.deleteIfExists(crawlFiles.resolve("crawl").resolve("crawl.log")); - - var zr = new ZIMReader(new ZIMFile(zimFile.toString())); - zr.forEachArticles((url, art) -> { - urls.add("http://wikipedia.local/" + url + ".html"); - - if (art != null) { - try { - var doc = Jsoup.parse(art); - doc.getElementsByTag("script").remove(); - Files.writeString(wikipediaFiles.resolve(url+".html"), doc.html()); - } catch (IOException e) { - throw new RuntimeException(e); - } - } - }, pred -> true); - urls.forEach(System.out::println); - Files.writeString(wikipediaFiles.resolve("index.html"), ""); - CrawlJobExtractorMain.writeSpec(crawlFiles.resolve("crawl.spec"), "wikipedia.local", urls); - } - catch (IOException ex) { - ex.printStackTrace(); - } - return wikipediaFiles.toString(); - } - - private List getTitlesFromSearchResults(String html) { - List ret = new ArrayList<>(); - - for (var title : Jsoup.parse(html).select(".card.search-result > h2")) { - ret.add(title.text()); - } - - return ret; - } - - @Test - public void testFrontPage() throws IOException { - var driver = chrome.getWebDriver(); - - driver.get("http://proxyNginx/"); - 
System.out.println(driver.getTitle()); -// System.out.println(driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML")); - - Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("frontpage")); - } - - @Test - public void testQuery() throws IOException { - var driver = chrome.getWebDriver(); - - driver.get("http://proxyNginx/search?query=bird&profile=corpo"); - System.out.println(driver.getTitle()); - - var html = driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML"); - assertEquals(List.of("Bird"), getTitlesFromSearchResults(html)); - - Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("query")); - } - - @Test - public void testQueryYesJs() throws IOException { - var driver = chrome.getWebDriver(); - - driver.get("http://proxyNginx/search?query=bird&profile=corpo&js=yes-js"); - System.out.println(driver.getTitle()); - - var html = driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML"); - assertEquals(Collections.emptyList(), getTitlesFromSearchResults(html)); - - Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("query-yes-js")); - } - - @Test - public void testQueryNoJs() throws IOException { - var driver = chrome.getWebDriver(); - - driver.get("http://proxyNginx/search?query=bird&profile=corpo&js=no-js"); - System.out.println(driver.getTitle()); - - var html = driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML"); - assertEquals(List.of("Bird"), getTitlesFromSearchResults(html)); - - Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("query-no-js")); - } - @Test - public void testSiteInfo() throws IOException { - var driver = chrome.getWebDriver(); - - driver.get("http://proxyNginx/search?query=site:wikipedia.local"); - System.out.println(driver.getTitle()); - System.out.println(driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML")); - - 
Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("site-info")); - } - - @Test - public void testSiteSearch() throws IOException { - var driver = chrome.getWebDriver(); - - driver.get("http://proxyNginx/search?query=site:wikipedia.local%20frog"); - System.out.println(driver.getTitle()); - - var html = driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML"); - - Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("site-search")); - - assertEquals(List.of("Frog", "Amphibian"), getTitlesFromSearchResults(html)); - } - - @Test - public void testBrowse() throws IOException { - var driver = chrome.getWebDriver(); - - driver.get("http://proxyNginx/search?query=browse:wikipedia.local"); - System.out.println(driver.getTitle()); -// System.out.println(driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML")); - - Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("browse")); - } - @Test - public void testDefine() throws IOException { - var driver = chrome.getWebDriver(); - - driver.get("http://proxyNginx/search?query=define:adiabatic"); - System.out.println(driver.getTitle()); -// System.out.println(driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML")); - - Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("define")); - } - @Test - public void testEval() throws IOException { - var driver = chrome.getWebDriver(); - - driver.get("http://proxyNginx/search?query=3%2B3"); - System.out.println(driver.getTitle()); -// System.out.println(driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML")); - - Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("eval")); - } - @Test - public void testBang() throws IOException { - var driver = chrome.getWebDriver(); - - driver.get("http://proxyNginx/search?query=!g test"); - - Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), 
screenshotFilename("bang")); - } -} diff --git a/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/EncyclopediaE2ETest.java b/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/EncyclopediaE2ETest.java deleted file mode 100644 index 4afa18c4..00000000 --- a/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/EncyclopediaE2ETest.java +++ /dev/null @@ -1,154 +0,0 @@ -package nu.marginalia.wmsa.edge; - - -import com.google.gson.Gson; -import com.google.gson.GsonBuilder; -import nu.marginalia.wmsa.edge.assistant.dict.WikiArticles; -import okhttp3.OkHttpClient; -import okhttp3.Request; -import org.junit.jupiter.api.Assertions; -import org.junit.jupiter.api.Tag; -import org.junit.jupiter.api.Test; -import org.mariadb.jdbc.Driver; -import org.openqa.selenium.OutputType; -import org.openqa.selenium.chrome.ChromeOptions; -import org.slf4j.LoggerFactory; -import org.testcontainers.containers.*; -import org.testcontainers.containers.output.Slf4jLogConsumer; -import org.testcontainers.containers.wait.strategy.Wait; -import org.testcontainers.junit.jupiter.Container; -import org.testcontainers.junit.jupiter.Testcontainers; -import org.testcontainers.utility.MountableFile; - -import java.io.IOException; -import java.net.MalformedURLException; -import java.net.URL; -import java.nio.file.Files; -import java.nio.file.Path; -import java.sql.DriverManager; -import java.sql.SQLException; -import java.sql.Types; -import java.time.Duration; -import java.time.LocalDateTime; -import java.util.concurrent.TimeUnit; - -import static nu.marginalia.wmsa.configuration.ServiceDescriptor.ENCYCLOPEDIA; - -@Tag("e2e") -@Testcontainers -public class EncyclopediaE2ETest extends E2ETestBase { - @Container - public MariaDBContainer mariaDB = getMariaDBContainer(); - - @Container - public GenericContainer encyclopediaContainer = forService(ENCYCLOPEDIA, mariaDB); - @Container - public GenericContainer encyclopediaLoader = new GenericContainer<>("openjdk:17") - .dependsOn(encyclopediaContainer) - 
.dependsOn(mariaDB) - .withNetwork(network) - .withLogConsumer(new Slf4jLogConsumer(LoggerFactory.getLogger("encyclopedia-loader"))) - .withCopyFileToContainer(jarFile(), "/WMSA.jar") - .withCopyFileToContainer(MountableFile.forClasspathResource("load-encyclopedia.sh"), "/load-encyclopedia.sh") - .withFileSystemBind(getModelData().toString(), "/data", BindMode.READ_ONLY) - .withCommand("sh", "load-encyclopedia.sh") - .waitingFor(Wait.forLogMessage(".*ALL DONE.*", 1).withStartupTimeout(Duration.ofMinutes(10))); - - @Container - public NginxContainer proxyNginx = new NginxContainer<>("nginx:stable") - .dependsOn(encyclopediaLoader) - .dependsOn(encyclopediaContainer) - .withLogConsumer(new Slf4jLogConsumer(LoggerFactory.getLogger("nginx"))) - .withCopyFileToContainer(MountableFile.forClasspathResource("nginx/encyclopedia.conf"), "/etc/nginx/conf.d/default.conf") - .withNetwork(network) - .withNetworkAliases("proxyNginx"); - - @Container - public BrowserWebDriverContainer chrome = new BrowserWebDriverContainer<>() - .withNetwork(network) - .withCapabilities(new ChromeOptions()); - - private Gson gson = new GsonBuilder().create(); - private OkHttpClient httpClient = new OkHttpClient.Builder() - .connectTimeout(100, TimeUnit.MILLISECONDS) - .readTimeout(6000, TimeUnit.SECONDS) - .retryOnConnectionFailure(true) - .followRedirects(true) - .build(); - - private Path getModelData() { - return Path.of(System.getProperty("user.dir")).resolve("data/test"); - } - - private static Path screenshotFilename(String operation) throws IOException { - var path = Path.of(System.getProperty("user.dir")).resolve("build/test/e2e/"); - Files.createDirectories(path); - - String name = String.format("test-encyclopedia-%s-%s.png", operation, LocalDateTime.now()); - path = path.resolve(name); - - System.out.println("Screenshot in " + path); - return path; - } - - @Test - public void run() throws IOException { - new Driver(); - - try (var conn = DriverManager.getConnection(mariaDB.getJdbcUrl(), 
"wmsa", "wmsa"); - var stmt = conn.prepareStatement("INSERT IGNORE INTO REF_WIKI_ARTICLE(NAME,REF_NAME) VALUES (?,?)")) { - - stmt.setString(1, "Forg"); - stmt.setString(2, "Frog"); - stmt.executeUpdate(); - - stmt.setString(1, "Frog"); - stmt.setNull(2, Types.VARCHAR); - stmt.executeUpdate(); - - } catch (SQLException e) { - throw new RuntimeException(e); - } - - var driver = chrome.getWebDriver(); - - driver.get("http://proxyNginx/wiki/Frog"); - Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("get-article")); - - driver.get("http://proxyNginx/wiki/Forg"); - Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("get-article-redir")); - - System.out.println(driver.getTitle()); - driver.get("http://proxyNginx/wiki-search?query=Forg"); - Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("disambig")); - System.out.println(driver.getTitle()); - - var resultsForMarginalia = get(encyclopediaContainer.getHost(), - encyclopediaContainer.getMappedPort(ENCYCLOPEDIA.port), - "/encyclopedia/Marginalia", WikiArticles.class); - Assertions.assertTrue(resultsForMarginalia.getEntries().isEmpty()); - - var resultsForFrog = get(encyclopediaContainer.getHost(), - encyclopediaContainer.getMappedPort(ENCYCLOPEDIA.port), - "/encyclopedia/Frog", WikiArticles.class); - Assertions.assertFalse(resultsForFrog.getEntries().isEmpty()); - - var resultsForFoRg = get(encyclopediaContainer.getHost(), - encyclopediaContainer.getMappedPort(ENCYCLOPEDIA.port), - "/encyclopedia/Forg", WikiArticles.class); - Assertions.assertFalse(resultsForFoRg.getEntries().isEmpty()); - - - } - - - private T get(String host, Integer mappedPort, String path, Class clazz) throws MalformedURLException { - var req = new Request.Builder().get().url(new URL("http", host, mappedPort, path)).build(); - var call = httpClient.newCall(req); - try (var rsp = call.execute()) { - return gson.fromJson(rsp.body().charStream(), clazz); - } catch 
(IOException e) { - throw new RuntimeException(e); - } - } -} diff --git a/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/MemexE2ETest.java b/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/MemexE2ETest.java deleted file mode 100644 index 7410b3b3..00000000 --- a/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/MemexE2ETest.java +++ /dev/null @@ -1,95 +0,0 @@ -package nu.marginalia.wmsa.edge; - - -import com.google.gson.Gson; -import com.google.gson.GsonBuilder; -import okhttp3.OkHttpClient; -import org.junit.jupiter.api.Tag; -import org.junit.jupiter.api.Test; -import org.mariadb.jdbc.Driver; -import org.openqa.selenium.OutputType; -import org.openqa.selenium.chrome.ChromeOptions; -import org.slf4j.LoggerFactory; -import org.testcontainers.containers.*; -import org.testcontainers.containers.output.Slf4jLogConsumer; -import org.testcontainers.junit.jupiter.Container; -import org.testcontainers.junit.jupiter.Testcontainers; -import org.testcontainers.utility.MountableFile; - -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; -import java.time.LocalDateTime; -import java.util.concurrent.TimeUnit; - -import static nu.marginalia.wmsa.configuration.ServiceDescriptor.AUTH; -import static nu.marginalia.wmsa.configuration.ServiceDescriptor.MEMEX; - -@Tag("e2e") -@Testcontainers -public class MemexE2ETest extends E2ETestBase { - @Container - public MariaDBContainer mariaDB = getMariaDBContainer(); - - @Container - public GenericContainer auth = forService(AUTH, mariaDB); - - @Container - public GenericContainer memexContainer = forService(MEMEX, mariaDB, "memex.sh") - .withClasspathResourceMapping("/memex", "/memex", BindMode.READ_ONLY); - - @Container - public NginxContainer proxyNginx = new NginxContainer<>("nginx:stable") - .dependsOn(auth) - .dependsOn(memexContainer) - .withLogConsumer(new Slf4jLogConsumer(LoggerFactory.getLogger("nginx"))) - .withCopyFileToContainer(MountableFile.forClasspathResource("nginx/memex.conf"), 
"/etc/nginx/conf.d/default.conf") - .withNetwork(network) - .withNetworkAliases("proxyNginx"); - - @Container - public BrowserWebDriverContainer chrome = new BrowserWebDriverContainer<>() - .withNetwork(network) - .withCapabilities(new ChromeOptions()); - - private Gson gson = new GsonBuilder().create(); - private OkHttpClient httpClient = new OkHttpClient.Builder() - .connectTimeout(100, TimeUnit.MILLISECONDS) - .readTimeout(6000, TimeUnit.SECONDS) - .retryOnConnectionFailure(true) - .followRedirects(true) - .build(); - - @Test - public void run() throws IOException, InterruptedException { - Thread.sleep(10_000); - new Driver(); - - var driver = chrome.getWebDriver(); - - driver.get("http://proxyNginx/"); - Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("frontpage")); - - driver.get("http://proxyNginx/log/"); - Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("log")); - - driver.get("http://proxyNginx/log/a.gmi"); - Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("log-a.gmi")); - - driver.get("http://proxyNginx/log/b.gmi"); - Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("log-b.gmi")); - } - - private static Path screenshotFilename(String operation) throws IOException { - var path = Path.of(System.getProperty("user.dir")).resolve("build/test/e2e/"); - Files.createDirectories(path); - - String name = String.format("test-%s-%s.png", operation, LocalDateTime.now()); - path = path.resolve(name); - - System.out.println("Screenshot in " + path); - return path; - } - - -} diff --git a/marginalia_nu/src/e2e/resources/crawl-mock.sh b/marginalia_nu/src/e2e/resources/crawl-mock.sh deleted file mode 100644 index 4270929e..00000000 --- a/marginalia_nu/src/e2e/resources/crawl-mock.sh +++ /dev/null @@ -1,19 +0,0 @@ -#!/bin/bash - -mkdir -p /var/lib/wmsa/conf/ -mkdir -p /var/lib/wmsa/data/ - -echo "search.marginalia.nu" > /var/lib/wmsa/conf/user-agent 
- -cat crawl/crawl.plan -cat << EOF - #### ##### ## # # # - # # # # # # # # # - # # # # # # # # - # ##### ###### # ## # # - # # # # # # ## ## # - #### # # # # # # ###### -EOF -java -jar WMSA.jar crawl crawl/crawl.plan - -echo "ALL DONE" \ No newline at end of file diff --git a/marginalia_nu/src/e2e/resources/crawl.sh b/marginalia_nu/src/e2e/resources/crawl.sh deleted file mode 100644 index 16d43fab..00000000 --- a/marginalia_nu/src/e2e/resources/crawl.sh +++ /dev/null @@ -1,79 +0,0 @@ -#!/bin/bash - -mkdir -p /var/lib/wmsa/conf/ -mkdir -p /var/lib/wmsa/data/ - -echo "search.marginalia.nu" > /var/lib/wmsa/conf/user-agent - -cat > /var/lib/wmsa/conf/db.properties < /var/lib/wmsa/conf/hosts < ${HOME}/suggestions.txt < ${HOME}/conf/disks.properties < ${HOME}/conf/db.properties < ${HOME}/conf/ranking-settings.yaml < ${HOME}/conf/hosts < /var/lib/wmsa/conf/db.properties < /var/lib/wmsa/conf/hosts < ${HOME}/conf/db.properties < ${HOME}/conf/hosts < roar = new ArrayList<>(); - List acbs = new ArrayList<>(); - - List roarLow = new ArrayList<>(); - List roarHigh = new ArrayList<>(); - - List acbsLow = new ArrayList<>(); - List acbsHigh = new ArrayList<>(); - - @Setup(Level.Trial) - public void setUp() { - var rand = new Random(); - - for (int i = 0; i < 100; i++) { - int card = 1 + rand.nextInt(10); - - var rb = new RoaringBitmap(); - var cbs = new AndCardIntSet(); - - for (int j = 0; j < card; j++) { - int val = rand.nextInt(1_000_000); - rb.add(val); - cbs.add(val); - } - acbsLow.add(cbs); - roarLow.add(rb); - } - - for (int i = 0; i < 10; i++) { - int card = 1 + rand.nextInt(10000, 20000); - - var rb = new RoaringBitmap(); - - for (int j = 0; j < card; j++) { - int val = rand.nextInt(1_000_000); - rb.add(val); - } - acbsHigh.add(AndCardIntSet.of(rb)); - roarHigh.add(rb); - } - - - - for (int i = 0; i < 100000; i++) { - var rb = new RoaringBitmap(); - var cbs = new AndCardIntSet(); - - int val = rand.nextInt(1_000_000); - rb.add(val); - cbs.add(val); - - acbs.add(cbs); - 
roar.add(rb); - } - - for (int i = 0; i < 10000; i++) { - int card = 1 + rand.nextInt(10); - - var rb = new RoaringBitmap(); - var cbs = new AndCardIntSet(); - - for (int j = 0; j < card; j++) { - int val = rand.nextInt(1_000_000); - rb.add(val); - cbs.add(val); - } - acbs.add(cbs); - roar.add(rb); - } - for (int i = 0; i < 1000; i++) { - int card = 1 + rand.nextInt(100); - - var rb = new RoaringBitmap(); - var cbs = new AndCardIntSet(); - - for (int j = 0; j < card; j++) { - int val = rand.nextInt(1_000_000); - rb.add(val); - cbs.add(val); - } - acbs.add(cbs); - roar.add(rb); - } - for (int i = 0; i < 100; i++) { - int card = 1 + rand.nextInt(1000); - - var rb = new RoaringBitmap(); - var cbs = new AndCardIntSet(); - - for (int j = 0; j < card; j++) { - int val = rand.nextInt(1_000_000); - rb.add(val); - cbs.add(val); - } - acbs.add(cbs); - roar.add(rb); - } - for (int i = 0; i < 100; i++) { - int card = 1 + rand.nextInt(10000); - - var rb = new RoaringBitmap(); - var cbs = new AndCardIntSet(); - - for (int j = 0; j < card; j++) { - int val = rand.nextInt(1_000_000); - rb.add(val); - cbs.add(val); - } - acbs.add(cbs); - roar.add(rb); - } - - for (int i = 0; i < 2; i++) { - int card = 1 + rand.nextInt(100000); - - var rb = new RoaringBitmap(); - var cbs = new AndCardIntSet(); - - for (int j = 0; j < card; j++) { - int val = rand.nextInt(1_000_000); - rb.add(val); - cbs.add(val); - } - acbs.add(cbs); - roar.add(rb); - } - Collections.shuffle(acbs); - Collections.shuffle(roar); - } - } - -// -// @Benchmark -// @BenchmarkMode(Mode.Throughput) -// @Fork(value = 5, warmups = 5) -// public Object roaringCard(State state) { -// long val = 0; -// -// for (int i = 0; i < state.roar.size(); i++) { -// for (int j = i+1; j < state.roar.size(); j++) { -// val += RoaringBitmap.andCardinality(state.roar.get(i), state.roar.get(j)); -// } -// } -// -// return val; -// } -// @Benchmark -// @BenchmarkMode(Mode.Throughput) -// @Fork(value = 2, warmups = 2) -// public Object 
roaringCardNorm(State state) { -// long val = 0; -// -// for (int i = 0; i < state.roar.size()/1000; i++) { -// for (int j = i+1; j < state.roar.size(); j++) { -// -// var a = state.roar.get(i); -// var b = state.roar.get(j); -// val += RoaringBitmap.andCardinality(a, b) / (Math.sqrt(a.getCardinality()*b.getCardinality())); -// } -// } -// -// return val; -// } -// @Benchmark -// @BenchmarkMode(Mode.Throughput) -// @Fork(value = 5, warmups = 5) -// public Object cbsCard(State state) { -// long val = 0; -// -// for (int i = 0; i < state.roar.size(); i++) { -// for (int j = i+1; j < state.roar.size(); j++) { -// val += AndCardIntSet.andCardinality(state.acbs.get(i), state.acbs.get(j)); -// } -// } -// -// return val; -// } -// -// @Benchmark -// @BenchmarkMode(Mode.Throughput) -// @Fork(value = 1, warmups = 1) -// public Object cbsCardNorm(State state) { -// double val = 0; -// -// for (int i = 0; i < state.roar.size()/1000; i++) { -// for (int j = i+1; j < state.roar.size(); j++) { -// var a = state.acbs.get(i); -// var b = state.acbs.get(j); -// val += AndCardIntSet.andCardinality(a, b) / (Math.sqrt(a.cardinality()*b.cardinality())); -// } -// } -// -// return val; -// } - - @Benchmark - @BenchmarkMode(Mode.Throughput) - @Fork(value = 1, warmups = 1) - public Object cbsLowLow(State state) { - double val = 0; - - for (int i = 0; i < state.acbsLow.size(); i++) { - for (int j = 0; j < state.acbsLow.size(); j++) { - var a = state.acbsLow.get(i); - var b = state.acbsLow.get(j); - val += AndCardIntSet.andCardinality(a, b) / (Math.sqrt(a.getCardinality()*b.getCardinality())); - } - } - - return val; - } - - - @Benchmark - @BenchmarkMode(Mode.Throughput) - @Fork(value = 1, warmups = 1) - public Object cbsHighHigh(State state) { - double val = 0; - - for (int i = 0; i < state.acbsHigh.size(); i++) { - for (int j = 0; j < state.acbsHigh.size(); j++) { - var a = state.acbsHigh.get(i); - var b = state.acbsHigh.get(j); - val += AndCardIntSet.andCardinality(a, b) / 
(Math.sqrt(a.getCardinality()*b.getCardinality())); - } - } - - return val; - } - - @Benchmark - @BenchmarkMode(Mode.Throughput) - @Fork(value = 1, warmups = 1) - public Object cbsHighLow(State state) { - double val = 0; - - for (int i = 0; i < state.acbsHigh.size(); i++) { - for (int j = 0; j < state.acbsLow.size(); j++) { - var a = state.acbsHigh.get(i); - var b = state.acbsLow.get(j); - val += AndCardIntSet.andCardinality(a, b) / (Math.sqrt(a.getCardinality()*b.getCardinality())); - } - } - - return val; - } - - @Benchmark - @BenchmarkMode(Mode.Throughput) - @Fork(value = 1, warmups = 1) - public Object roarLowLow(State state) { - double val = 0; - - for (int i = 0; i < state.roarLow.size(); i++) { - for (int j = 0; j < state.roarLow.size(); j++) { - var a = state.roarLow.get(i); - var b = state.roarLow.get(j); - val += RoaringBitmap.andCardinality(a, b) / (Math.sqrt(a.getCardinality()*b.getCardinality())); - } - } - - return val; - } - - - @Benchmark - @BenchmarkMode(Mode.Throughput) - @Fork(value = 1, warmups = 1) - public Object roarHighLow(State state) { - double val = 0; - - for (int i = 0; i < state.roarHigh.size(); i++) { - for (int j = 0; j < state.roarLow.size(); j++) { - var a = state.roarHigh.get(i); - var b = state.roarLow.get(j); - val += RoaringBitmap.andCardinality(a, b) / (Math.sqrt(a.getCardinality()*b.getCardinality())); - } - } - - return val; - } - - @Benchmark - @BenchmarkMode(Mode.Throughput) - @Fork(value = 1, warmups = 1) - public Object roarHighHigh(State state) { - double val = 0; - - for (int i = 0; i < state.roarHigh.size(); i++) { - for (int j = 0; j < state.roarHigh.size(); j++) { - var a = state.roarHigh.get(i); - var b = state.roarHigh.get(j); - val += RoaringBitmap.andCardinality(a, b) / (Math.sqrt(a.getCardinality()*b.getCardinality())); - } - } - - return val; - } -} \ No newline at end of file diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/SeekDictionary.java 
b/marginalia_nu/src/main/java/nu/marginalia/util/SeekDictionary.java deleted file mode 100644 index a49544e4..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/util/SeekDictionary.java +++ /dev/null @@ -1,73 +0,0 @@ -package nu.marginalia.util; - -import gnu.trove.list.array.TIntArrayList; - -import java.util.ArrayList; -import java.util.function.ToIntFunction; - -public abstract class SeekDictionary { - private final ArrayList banks = new ArrayList<>(); - private final TIntArrayList offsets = new TIntArrayList(); - - public static SeekDictionary of(ToIntFunction length) { - return new SeekDictionary() { - @Override - public int length(T obj) { - return length.applyAsInt(obj); - } - }; - } - public T last() { - return banks.get(banks.size()-1); - } - public int lastStart() { - return offsets.get(offsets.size()-1); - } - - public abstract int length(T obj); - public int end() { - if (banks.isEmpty()) return 0; - - return (offsets.getQuick(offsets.size()-1) + length(last())); - } - - public void add(T obj) { - - if (banks.isEmpty()) { - banks.add(obj); - offsets.add(0); - } - else { - offsets.add(end()); - banks.add(obj); - } - } - - public T bankForOffset(int offset) { - return banks.get(idxForOffset(offset)); - } - - public int idxForOffset(int offset) { - - int high = offsets.size() - 1; - int low = 0; - - while ( low <= high ) { - int mid = ( low + high ) >>> 1; - int midVal = offsets.getQuick(mid); - - if ( midVal < offset ) { - low = mid + 1; - } - else if ( midVal > offset ) { - high = mid - 1; - } - else { - return mid; - } - } - return low-1; - - } - -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeDogEar.java b/marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeDogEar.java deleted file mode 100644 index 77a97e87..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeDogEar.java +++ /dev/null @@ -1,35 +0,0 @@ -package nu.marginalia.util.btree; - -import nu.marginalia.util.array.LongArray; -import 
nu.marginalia.util.btree.model.BTreeContext; -import nu.marginalia.util.btree.model.BTreeHeader; - -/* - * End-of-page mark that's used as a sentinel to verify that - * the BTreeWriter's caller actually writes as much as they say - * they want to. (Failing to do so will corrupt the tree) - * - */ -public class BTreeDogEar { - - private LongArray sentinelSlice; - - public BTreeDogEar(BTreeContext ctx, BTreeHeader header, LongArray base) { - if (header.numEntries() > 3) { - sentinelSlice = base.range( - (long) header.numEntries() * ctx.entrySize() - 3, - (long) header.numEntries() * ctx.entrySize()); - sentinelSlice.set(0, 4L); - sentinelSlice.set(1, 5L); - sentinelSlice.set(2, 1L); - } - } - - public boolean verify() { - if (sentinelSlice == null) - return true; - - return 4 != sentinelSlice.get(0) || 5 != sentinelSlice.get(1) || 1 != sentinelSlice.get(2); - } - -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeReaderIf.java b/marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeReaderIf.java deleted file mode 100644 index c4b40386..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeReaderIf.java +++ /dev/null @@ -1,21 +0,0 @@ -package nu.marginalia.util.btree; - -import nu.marginalia.util.array.buffer.LongQueryBuffer; -import nu.marginalia.util.btree.model.BTreeHeader; - -public interface BTreeReaderIf { - BTreeHeader getHeader(); - - int numEntries(); - - void retainEntries(LongQueryBuffer buffer); - - void rejectEntries(LongQueryBuffer buffer); - - long findEntry(long keyRaw); - - void readData(long[] data, int n, long pos); - - long[] queryData(long[] urls, int offset); - -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/btree/WriteCallback.java b/marginalia_nu/src/main/java/nu/marginalia/util/btree/WriteCallback.java deleted file mode 100644 index 6c51cdde..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/util/btree/WriteCallback.java +++ /dev/null @@ -1,9 +0,0 @@ -package 
nu.marginalia.util.btree; - -import nu.marginalia.util.array.LongArray; - -import java.io.IOException; - -public interface WriteCallback { - void write(LongArray slice) throws IOException; -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/dict/DictionaryData.java b/marginalia_nu/src/main/java/nu/marginalia/util/dict/DictionaryData.java deleted file mode 100644 index 492417a0..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/util/dict/DictionaryData.java +++ /dev/null @@ -1,100 +0,0 @@ -package nu.marginalia.util.dict; - -import java.nio.ByteBuffer; -import java.nio.LongBuffer; -import java.util.ArrayList; - -public class DictionaryData { - - private final int DICTIONARY_BANK_SIZE; - - private final ArrayList banks = new ArrayList<>(100); - - public DictionaryData(int bankSize) { - DICTIONARY_BANK_SIZE = bankSize; - - banks.add(new DictionaryDataBank(0, bankSize)); - } - - public int add(long key) { - var activeBank = banks.get(banks.size()-1); - int rb = activeBank.add(key); - - if (rb == -1) { - int end = activeBank.getEnd(); - var newBank = new DictionaryDataBank(end, DICTIONARY_BANK_SIZE); - rb = newBank.add(key); - - banks.add(newBank); - } - - return rb; - } - - - public long getKey(int offset) { - return banks.get(offset/DICTIONARY_BANK_SIZE).getKey(offset); - } - public boolean keyEquals(int offset, long otherKey) { - return banks.get(offset/DICTIONARY_BANK_SIZE).keyEquals(offset, otherKey); - } - - private static class DictionaryDataBank { - - private final int start_idx; - - // Humongous long-lived arrays seem to sometimes yield considerable memory overhead and - // can make the GC behave poorly. 
Using off-heap memory seems preferred when their - // lifetime is "forever" - - private final LongBuffer keys; - - private int size; - private final int capacity; - - - public DictionaryDataBank(int start_idx, int sz) { - this.start_idx = start_idx; - this.capacity = sz; - - keys = ByteBuffer.allocateDirect(8*capacity).asLongBuffer(); - size = 0; - } - - public int getStart() { - return start_idx; - } - - public int getEnd() { - return start_idx + size; - } - - public long getKey(int idx) { - if (idx < start_idx || idx - start_idx >= size) { - throw new IndexOutOfBoundsException(idx); - } - return keys.get(idx - start_idx); - } - - public boolean keyEquals(int idx, long other) { - if (idx < start_idx || idx - start_idx >= size) { - throw new IndexOutOfBoundsException(idx); - } - - return keys.get(idx - start_idx) == other; - } - - public int add(long newKey) { - if (size >= capacity) - return -1; - - keys.put(size, newKey); - - return start_idx + size++; - } - - public int getSize() { - return size; - } - } -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/language/conf/LanguageModels.java b/marginalia_nu/src/main/java/nu/marginalia/util/language/conf/LanguageModels.java deleted file mode 100644 index 65cb21cb..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/util/language/conf/LanguageModels.java +++ /dev/null @@ -1,16 +0,0 @@ -package nu.marginalia.util.language.conf; - -import lombok.AllArgsConstructor; - -import java.nio.file.Path; - -@AllArgsConstructor -public class LanguageModels { - public final Path ngramBloomFilter; - public final Path termFrequencies; - - public final Path openNLPSentenceDetectionData; - public final Path posRules; - public final Path posDict; - public final Path openNLPTokenData; -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/AsciiFlattener.java b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/AsciiFlattener.java deleted file mode 100644 index 
6abcbdb5..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/AsciiFlattener.java +++ /dev/null @@ -1,58 +0,0 @@ -package nu.marginalia.util.language.processing; - -import java.util.regex.Pattern; - -public class AsciiFlattener { - - private static final Pattern nonAscii = Pattern.compile("[^a-zA-Z0-9_.'+@#:\\-]+"); - - private static boolean isPlainAscii(String s) { - for (int i = 0; i < s.length(); i++) { - char c = s.charAt(i); - if ((c & 0x80) != 0) { - return false; - } - } - return true; - } - public static String flattenUnicode(String s) { - - if (isPlainAscii(s)) { - return s; - } - - var cdata = s.toCharArray(); - var newCdata = new char[cdata.length]; - for (int i = 0; i < cdata.length; i++) { - if ("àáâãäåæ".indexOf(cdata[i]) >= 0) { - newCdata[i] = 'a'; - } - else if ("ç".indexOf(cdata[i]) >= 0) { - newCdata[i] = 'g'; - } - else if ("òóôõöø".indexOf(cdata[i]) >= 0) { - newCdata[i] = 'o'; - } - else if ("ùúûü".indexOf(cdata[i]) >= 0) { - newCdata[i] = 'u'; - } - else if ("ýÿÞþ".indexOf(cdata[i]) >= 0) { - newCdata[i] = 'y'; - } - else if ("ìíîï".indexOf(cdata[i]) >= 0) { - newCdata[i] = 'i'; - } - else if ("èéêë".indexOf(cdata[i]) >= 0) { - newCdata[i] = 'e'; - } - else if ("ß".indexOf(cdata[i]) >= 0) { - newCdata[i] = 's'; - } - else { - newCdata[i] = cdata[i]; - } - } - return nonAscii.matcher(new String(newCdata)).replaceAll(""); - } - -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/model/WordRef.java b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/model/WordRef.java deleted file mode 100644 index 9ed6f1ae..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/model/WordRef.java +++ /dev/null @@ -1,46 +0,0 @@ -package nu.marginalia.util.language.processing.model; - -import lombok.AllArgsConstructor; - -import java.util.Objects; -import java.util.Optional; - -@AllArgsConstructor -public class WordRef { - public final int sentenceIndex; - 
public final int wordIndex; - - public String getWord(DocumentLanguageData dld) { - return dld.sentences[sentenceIndex].words[wordIndex]; - } - - public String getWordStemmed(DocumentLanguageData dld) { - return dld.sentences[sentenceIndex].stemmedWords[wordIndex]; - } - - public Optional next(DocumentLanguageData dld) { - if (wordIndex + 1 < dld.sentences[sentenceIndex].length()) { - return Optional.of(new WordRef(sentenceIndex, wordIndex+1)); - } - return Optional.empty(); - } - public Optional prev() { - if (wordIndex - 1 >= 0) { - return Optional.of(new WordRef(sentenceIndex, wordIndex-1)); - } - return Optional.empty(); - } - - @Override - public int hashCode() { - return Objects.hash(sentenceIndex, wordIndex); - } - - @Override - public boolean equals(Object o) { - if (this == o) return true; - if (o == null || getClass() != o.getClass()) return false; - WordRef wordRef = (WordRef) o; - return sentenceIndex == wordRef.sentenceIndex && wordIndex == wordRef.wordIndex; - } -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/model/tag/WordTag.java b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/model/tag/WordTag.java deleted file mode 100644 index bbe974a6..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/model/tag/WordTag.java +++ /dev/null @@ -1,8 +0,0 @@ -package nu.marginalia.util.language.processing.model.tag; - -public class WordTag { - public static int UNSET = 0; - public static int STOP_WORD = 1; - public static int NAME = 2; - public static int NOT_NAME = 3; -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/tool/WikipediaInternalLinkExtractorMain.java b/marginalia_nu/src/main/java/nu/marginalia/util/tool/WikipediaInternalLinkExtractorMain.java deleted file mode 100644 index f4f9b3dc..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/util/tool/WikipediaInternalLinkExtractorMain.java +++ /dev/null @@ -1,43 +0,0 @@ -package nu.marginalia.util.tool; - 
-import nu.marginalia.wmsa.edge.integration.wikipedia.WikipediaReader; -import nu.marginalia.wmsa.edge.model.EdgeDomain; -import org.jsoup.Jsoup; - -import java.util.HashSet; -import java.util.Set; - -public class WikipediaInternalLinkExtractorMain { - public static void main(String... args) throws InterruptedException { - new WikipediaReader(args[0], new EdgeDomain("en.wikipedia.org"), wikipediaArticle -> { - - - var doc = Jsoup.parse(wikipediaArticle.body); - String path = wikipediaArticle.url.path.substring("/wiki/".length()); - - if (isIncluded(path)) { - Set seen = new HashSet<>(100); - - for (var atag : doc.getElementsByTag("a")) { - String href = atag.attr("href"); - - if (href.contains("#")) { - href = href.substring(0, href.indexOf('#')); - } - - if (isIncluded(href) && href.length() > 2 && seen.add(href)) { - System.out.println(path + "\t" + href); - } - } - } - - }).join(); - } - - private static boolean isIncluded(String href) { - return !href.contains(":") - && !href.contains("/") - && !href.contains("%") - && !href.startsWith("#"); - } -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/api/ApiMain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/api/ApiMain.java deleted file mode 100644 index 5690b807..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/api/ApiMain.java +++ /dev/null @@ -1,27 +0,0 @@ -package nu.marginalia.wmsa.api; - -import com.google.inject.Guice; -import com.google.inject.Inject; -import com.google.inject.Injector; -import nu.marginalia.wmsa.configuration.MainClass; -import nu.marginalia.wmsa.configuration.ServiceDescriptor; -import nu.marginalia.wmsa.configuration.module.ConfigurationModule; -import nu.marginalia.wmsa.configuration.module.DatabaseModule; -import nu.marginalia.wmsa.configuration.server.Initialization; - -public class ApiMain extends MainClass { - - @Inject - public ApiMain(ApiService service) { - } - - public static void main(String... 
args) { - init(ServiceDescriptor.API, args); - - Injector injector = Guice.createInjector( - new DatabaseModule(), - new ConfigurationModule()); - injector.getInstance(ApiMain.class); - injector.getInstance(Initialization.class).setReady(); - } -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/api/model/ApiSearchResult.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/api/model/ApiSearchResult.java deleted file mode 100644 index 4c301ef1..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/api/model/ApiSearchResult.java +++ /dev/null @@ -1,55 +0,0 @@ -package nu.marginalia.wmsa.api.model; - -import lombok.AllArgsConstructor; -import lombok.Getter; -import nu.marginalia.wmsa.edge.index.model.EdgePageWordMetadata; -import nu.marginalia.wmsa.edge.model.search.EdgeSearchResultKeywordScore; -import nu.marginalia.wmsa.edge.model.search.EdgeUrlDetails; - -import java.util.ArrayList; -import java.util.List; -import java.util.Set; -import java.util.stream.Collectors; - -@AllArgsConstructor @Getter -public class ApiSearchResult { - public String url; - public String title; - public String description; - public double quality; - - public List> details = new ArrayList<>(); - - public ApiSearchResult(EdgeUrlDetails url) { - this.url = url.url.toString(); - this.title = url.getTitle(); - this.description = url.getDescription(); - - this.quality = sanitizeNaN(url.getTermScore(), -100); - - if (url.resultItem != null) { - var bySet = url.resultItem.scores.stream().collect(Collectors.groupingBy(EdgeSearchResultKeywordScore::set)); - - outer: - for (var entries : bySet.values()) { - List lst = new ArrayList<>(); - for (var entry : entries) { - var metadata = new EdgePageWordMetadata(entry.encodedWordMetadata()); - if (metadata.isEmpty()) - continue outer; - - Set flags = metadata.flagSet().stream().map(Object::toString).collect(Collectors.toSet()); - lst.add(new ApiSearchResultQueryDetails(entry.keyword(), metadata.tfIdf(), metadata.count(), flags)); - } - 
details.add(lst); - } - } - } - - private double sanitizeNaN(double value, double alternative) { - if (!Double.isFinite(value)) { - return alternative; - } - return value; - } -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/MainClass.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/MainClass.java deleted file mode 100644 index f52d3b87..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/MainClass.java +++ /dev/null @@ -1,56 +0,0 @@ -package nu.marginalia.wmsa.configuration; - -import io.prometheus.client.hotspot.DefaultExports; -import io.reactivex.rxjava3.exceptions.UndeliverableException; -import io.reactivex.rxjava3.plugins.RxJavaPlugins; -import lombok.SneakyThrows; -import nu.marginalia.wmsa.client.exception.NetworkException; -import org.mariadb.jdbc.Driver; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.net.SocketTimeoutException; -import java.net.UnknownHostException; -import java.util.Arrays; - -public abstract class MainClass { - private final Logger logger = LoggerFactory.getLogger(getClass()); - - public MainClass() { - - RxJavaPlugins.setErrorHandler(ex -> { - if (ex instanceof UndeliverableException) { - ex = ex.getCause(); - } - - if (ex instanceof SocketTimeoutException) { - logger.warn("SocketTimeoutException"); - } - else if (ex instanceof UnknownHostException) { - logger.warn("UnknownHostException"); - } - else if (ex instanceof NetworkException) { - logger.warn("NetworkException", ex); - } - else { - logger.error("Uncaught exception", ex); - } - }); - - } - - @SneakyThrows - protected static void init(ServiceDescriptor service, String... 
args) { - System.setProperty("log4j2.isThreadContextMapInheritable", "true"); - System.setProperty("isThreadContextMapInheritable", "true"); - System.setProperty("service-name", service.name); - - org.mariadb.jdbc.Driver driver = new Driver(); - - if (Arrays.asList(args).contains("go-no-go")) { - System.setProperty("go-no-go", "true"); - } - DefaultExports.initialize(); - } - -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/ServiceDescriptor.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/ServiceDescriptor.java deleted file mode 100644 index 6acbaea6..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/ServiceDescriptor.java +++ /dev/null @@ -1,107 +0,0 @@ -package nu.marginalia.wmsa.configuration; - -import nu.marginalia.wmsa.api.ApiMain; -import nu.marginalia.wmsa.auth.AuthMain; -import nu.marginalia.wmsa.configuration.command.*; -import nu.marginalia.wmsa.edge.assistant.EdgeAssistantMain; -import nu.marginalia.wmsa.edge.dating.DatingMain; -import nu.marginalia.wmsa.edge.explorer.ExplorerMain; -import nu.marginalia.wmsa.edge.index.EdgeIndexMain; -import nu.marginalia.wmsa.edge.search.EdgeSearchMain; -import nu.marginalia.wmsa.encyclopedia.EncyclopediaMain; -import nu.marginalia.wmsa.memex.MemexMain; -import nu.marginalia.wmsa.podcasts.PodcastScraperMain; -import nu.marginalia.wmsa.renderer.RendererMain; -import nu.marginalia.wmsa.resource_store.ResourceStoreMain; -import org.apache.logging.log4j.core.lookup.MainMapLookup; - -import java.util.Map; -import java.util.stream.Collectors; -import java.util.stream.Stream; - -public enum ServiceDescriptor { - RESOURCE_STORE("resource-store", 5000, ResourceStoreMain.class), - RENDERER("renderer", 5002, RendererMain.class), - AUTH("auth", 5003, AuthMain.class), - API("api", 5004, ApiMain.class), - - PODCST_SCRAPER("podcast-scraper", 5013, PodcastScraperMain.class), - - EDGE_INDEX("edge-index", 5021, EdgeIndexMain.class), - EDGE_SEARCH("edge-search", 
5023, EdgeSearchMain.class), - EDGE_ASSISTANT("edge-assistant", 5025, EdgeAssistantMain.class), - - MEMEX("memex", 5030, MemexMain.class), - - ENCYCLOPEDIA("encyclopedia", 5040, EncyclopediaMain.class), - - DATING("dating", 5070, DatingMain.class), - EXPLORER("explorer", 5071, ExplorerMain.class), - - TEST_1("test-1", 0, null), - TEST_2("test-2", 0, null); - - private static HostsFile hostsFile; - public synchronized String getHost() { - if (hostsFile == null) { - hostsFile = WmsaHome.getHostsFile(); - } - return hostsFile.getHost(this); - } - - public static ServiceDescriptor byName(String name) { - for (var v : values()) { - if (v.name.equals(name)) { - return v; - } - } - throw new IllegalArgumentException("Invalid ServiceDescriptor " + name); - } - public final String name; - public final Class mainClass; - public final int port; - - ServiceDescriptor(String name, int port, Class mainClass) { - this.name = name; - this.port = port; - this.mainClass = mainClass; - } - - public String toString() { - return name; - } - - public String describeService() { - return String.format("%s %s", name, mainClass.getName()); - } - - public static void main(String... args) { - MainMapLookup.setMainArguments(args); - - Map functions = Stream.of( - new ListCommand(), - new StartCommand(), - new ConvertCommand(), - new CrawlCommand(), - new LoadCommand(), - new ReindexCommand(), - new VersionCommand(), - new IndexDataDumpCommand() - ).collect(Collectors.toMap(c -> c.name, c -> c)); - - if(args.length > 0) { - functions.getOrDefault(args[0], new Command("") { - @Override - public void execute(String... 
args) { - System.err.println("Unknown command"); - System.exit(1); - } - }).execute(args); - } - else { - System.err.println("Usage: " + String.join("|", functions.keySet())); - System.exit(1); - } - } - -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/command/Command.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/command/Command.java deleted file mode 100644 index 5267045f..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/command/Command.java +++ /dev/null @@ -1,31 +0,0 @@ -package nu.marginalia.wmsa.configuration.command; - -import nu.marginalia.wmsa.configuration.ServiceDescriptor; - -import java.util.Arrays; -import java.util.Objects; - -public abstract class Command { - public final String name; - - protected Command(String name) { - this.name = name; - } - - public abstract void execute(String... args); - - static ServiceDescriptor getKind(String arg) { - - try { - return Arrays.stream(ServiceDescriptor.values()) - .filter(sd -> Objects.equals(arg, sd.name)) - .findFirst() - .orElseThrow(IllegalArgumentException::new) - ; - } catch (IllegalArgumentException ex) { - System.err.println("Unknown service '" + arg + "'"); - System.exit(1); - } - return null; - } -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/command/ConvertCommand.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/command/ConvertCommand.java deleted file mode 100644 index 7cecbc25..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/command/ConvertCommand.java +++ /dev/null @@ -1,24 +0,0 @@ -package nu.marginalia.wmsa.configuration.command; - -import lombok.SneakyThrows; -import nu.marginalia.wmsa.edge.converting.ConverterMain; - -import java.util.Arrays; - -public class ConvertCommand extends Command { - public ConvertCommand() { - super("convert"); - } - - @Override - @SneakyThrows - public void execute(String... 
args) { - if (args.length < 2) { - System.err.println("Usage: convert plan.yaml"); - System.exit(255); - } - - String[] args2 = Arrays.copyOfRange(args, 1, args.length); - ConverterMain.main(args2); - } -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/command/CrawlCommand.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/command/CrawlCommand.java deleted file mode 100644 index 07c291bb..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/command/CrawlCommand.java +++ /dev/null @@ -1,24 +0,0 @@ -package nu.marginalia.wmsa.configuration.command; - -import lombok.SneakyThrows; -import nu.marginalia.wmsa.edge.crawling.CrawlerMain; - -import java.util.Arrays; - -public class CrawlCommand extends Command { - public CrawlCommand() { - super("crawl"); - } - - @Override - @SneakyThrows - public void execute(String... args) { - if (args.length < 2) { - System.err.println("Usage: crawl plan.yaml"); - System.exit(255); - } - - String[] args2 = Arrays.copyOfRange(args, 1, args.length); - CrawlerMain.main(args2); - } -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/command/IndexDataDumpCommand.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/command/IndexDataDumpCommand.java deleted file mode 100644 index 75ea02c7..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/command/IndexDataDumpCommand.java +++ /dev/null @@ -1,24 +0,0 @@ -package nu.marginalia.wmsa.configuration.command; - -import lombok.SneakyThrows; -import nu.marginalia.wmsa.edge.tools.IndexJournalDumpTool; - -import java.util.Arrays; - -public class IndexDataDumpCommand extends Command { - public IndexDataDumpCommand() { - super("index-dump"); - } - - @SneakyThrows - @Override - public void execute(String... 
args) { - if (args.length < 1) { - System.err.println("Usage: index-dump [sub-command] index.dat"); - System.exit(255); - } - - String[] args2 = Arrays.copyOfRange(args, 1, args.length); - IndexJournalDumpTool.main(args2); - } -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/command/ListCommand.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/command/ListCommand.java deleted file mode 100644 index 0bd2c3eb..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/command/ListCommand.java +++ /dev/null @@ -1,23 +0,0 @@ -package nu.marginalia.wmsa.configuration.command; - -import lombok.SneakyThrows; -import nu.marginalia.wmsa.configuration.ServiceDescriptor; - -import java.util.Arrays; -import java.util.Objects; - -public class ListCommand extends Command { - public ListCommand() { - super("list"); - } - - @Override - @SneakyThrows - public void execute(String... args) { - Arrays.stream(ServiceDescriptor.values()) - .filter(sd -> Objects.nonNull(sd.mainClass)) - .map(ServiceDescriptor::describeService) - .forEach(System.out::println); - } - -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/command/LoadCommand.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/command/LoadCommand.java deleted file mode 100644 index 9b1e7120..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/command/LoadCommand.java +++ /dev/null @@ -1,24 +0,0 @@ -package nu.marginalia.wmsa.configuration.command; - -import lombok.SneakyThrows; -import nu.marginalia.wmsa.edge.converting.LoaderMain; - -import java.util.Arrays; - -public class LoadCommand extends Command { - public LoadCommand() { - super("load"); - } - - @Override - @SneakyThrows - public void execute(String... 
args) { - if (args.length < 2) { - System.err.println("Usage: load plan.yaml"); - System.exit(255); - } - - String[] args2 = Arrays.copyOfRange(args, 1, args.length); - LoaderMain.main(args2); - } -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/command/ReindexCommand.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/command/ReindexCommand.java deleted file mode 100644 index 18b3025e..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/command/ReindexCommand.java +++ /dev/null @@ -1,24 +0,0 @@ -package nu.marginalia.wmsa.configuration.command; - -import lombok.SneakyThrows; -import nu.marginalia.wmsa.edge.converting.ReindexTriggerMain; - -import java.util.Arrays; - -public class ReindexCommand extends Command { - public ReindexCommand() { - super("reindex"); - } - - @Override - @SneakyThrows - public void execute(String... args) { - if (args.length < 2) { - System.err.println("Usage: reindex host"); - System.exit(255); - } - - String[] args2 = Arrays.copyOfRange(args, 1, args.length); - ReindexTriggerMain.main(args2); - } -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/command/StartCommand.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/command/StartCommand.java deleted file mode 100644 index cd98a375..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/command/StartCommand.java +++ /dev/null @@ -1,30 +0,0 @@ -package nu.marginalia.wmsa.configuration.command; - -import lombok.SneakyThrows; -import nu.marginalia.wmsa.configuration.ServiceDescriptor; - -import java.util.Arrays; - -public class StartCommand extends Command { - public StartCommand() { - super("start"); - } - - @Override - @SneakyThrows - public void execute(String... 
args) { - if (args.length < 2) { - System.err.println("Usage: start service-descriptor"); - System.err.println(); - System.err.println("Available services:"); - System.err.println(); - for (var d : ServiceDescriptor.values()) { - System.err.println("\t"+d.name); - } - System.exit(255); - } - var mainMethod = getKind(args[1]).mainClass.getMethod("main", String[].class); - String[] args2 = Arrays.copyOfRange(args, 2, args.length); - mainMethod.invoke(null, (Object) args2); - } -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/command/VersionCommand.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/command/VersionCommand.java deleted file mode 100644 index 179a0300..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/command/VersionCommand.java +++ /dev/null @@ -1,20 +0,0 @@ -package nu.marginalia.wmsa.configuration.command; - -import lombok.SneakyThrows; - -public class VersionCommand extends Command { - public VersionCommand() { - super("version"); - } - - @Override - @SneakyThrows - public void execute(String... 
args) { - try (var str = ClassLoader.getSystemResourceAsStream("_version.txt")) { - if (null == str) { - System.err.println("Bad jar, missing _version.txt"); - return; - } - } - } -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/module/ConfigurationModule.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/module/ConfigurationModule.java deleted file mode 100644 index 2727d31a..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/module/ConfigurationModule.java +++ /dev/null @@ -1,27 +0,0 @@ -package nu.marginalia.wmsa.configuration.module; - -import com.google.inject.AbstractModule; -import com.google.inject.Provides; -import com.google.inject.name.Named; -import nu.marginalia.wmsa.configuration.ServiceDescriptor; - -import java.util.Objects; - -import static com.google.inject.name.Names.named; - -public class ConfigurationModule extends AbstractModule { - private static final String SERVICE_NAME = System.getProperty("service-name"); - - public void configure() { - bind(String.class).annotatedWith(named("service-name")).toInstance(Objects.requireNonNull(SERVICE_NAME)); - bind(String.class).annotatedWith(named("service-host")).toInstance(System.getProperty("service-host", "127.0.0.1")); - bind(Integer.class).annotatedWith(named("service-port")).toInstance(ServiceDescriptor.byName(System.getProperty("service-name")).port); - } - - @Provides - @Named("metrics-server-port") - public Integer provideMetricsServerPort(@Named("service-port") Integer servicePort) { - return servicePort + 1000; - } - -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/server/Context.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/server/Context.java deleted file mode 100644 index f7578b45..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/server/Context.java +++ /dev/null @@ -1,139 +0,0 @@ -package nu.marginalia.wmsa.configuration.server; - -import 
io.reactivex.rxjava3.schedulers.Schedulers; -import org.apache.logging.log4j.ThreadContext; -import spark.Request; - -import java.util.*; -import java.util.concurrent.TimeUnit; - -public class Context { - public static final String CONTEXT_HEADER = "X-Context"; - public static final String SESSION_HEADER = "Cookie"; - public static final String PUBLIC_HEADER = "X-Public"; - private static final Random random; - - private static volatile byte[] seed = new byte[12]; - - private static byte[] generateSalt() { - byte[] oldHash = seed; - - int hash1 = Long.hashCode(random.nextLong()); - int hash2 = Objects.hash(System.currentTimeMillis()); - int hash3 = Arrays.hashCode(oldHash); - - return new byte[]{ - (byte) (hash1 & 0xFF), - (byte) (hash1 >>> 8 & 0xFF), - (byte) (hash1 >>> 16 & 0xFF), - (byte) (hash1 >>> 24 & 0xFF), - (byte) (hash2 & 0xFF), - (byte) (hash2 >>> 8 & 0xFF), - (byte) (hash2 >>> 16 & 0xFF), - (byte) (hash2 >>> 24 & 0xFF), - (byte) (hash3 & 0xFF), - (byte) (hash3 >>> 8 & 0xFF), - (byte) (hash3 >>> 16 & 0xFF), - (byte) (hash3 >>> 24 & 0xFF) - }; - } - - static { - random = new Random(); - for (int i = 0; i < 1_000_000; i++) { - random.nextLong(); - } - random.nextBytes(seed); - - updateSeed(); - } - - private static void updateSeed() { - seed = generateSalt(); - - Schedulers.computation().scheduleDirect(Context::updateSeed, - 60*5000 + (int)(1000*60*10*Math.random()), - TimeUnit.MILLISECONDS); - } - - private final String id; - private final String session; - private boolean treatAsPublic; - - private Context(String id, String session) { - this.id = id; - this.session = session; - } - - public Context treatAsPublic() { - this.treatAsPublic = true; - return this; - } - - public static Context internal() { - return new Context(UUID.randomUUID().toString(), null); - } - public static Context internal(String hwat) { - return new Context(hwat, null); - } - - public static Context fromRequest(Request request) { - - if (Boolean.getBoolean("unit-test")) { - return 
Context.internal(); - } - - final var ctxHeader = hashPublicIp(request.headers(CONTEXT_HEADER)); - final var sessHeader = request.headers(SESSION_HEADER); - - ThreadContext.put("context", ctxHeader+"-"+sessHeader); - ThreadContext.put("outbound-request", "none"); - - return new Context(ctxHeader, sessHeader); - } - - private static String hashPublicIp(String header) { - - if (header != null && header.contains("-")) { - - byte[] hashData = Arrays.copyOf(seed, seed.length+4); - int hashi = Objects.hash(header.split("-", 2)[0]); - - for (int i = 0; i < 4; i++) { - hashData[seed.length] = (byte)(hashi & 0xFF); - hashData[seed.length+1] = (byte)(hashi>>>8 & 0xFF); - hashData[seed.length+2] = (byte)(hashi>>>16 & 0xFF); - hashData[seed.length+3] = (byte)(hashi>>>24 & 0xFF); - } - - return String.format("#%x", Arrays.hashCode(hashData)); - } - else { - return header; - } - } - - public okhttp3.Request.Builder paint(okhttp3.Request.Builder requestBuilder) { - requestBuilder.addHeader(CONTEXT_HEADER, id); - - if (session != null) { - requestBuilder.addHeader(SESSION_HEADER, session); - } - - if (treatAsPublic) { - requestBuilder.header(PUBLIC_HEADER, "1"); - } - - return requestBuilder; - } - - public Optional getIpHash() { - - if (id.startsWith("#")) { - return Optional.of(id); - } - - return Optional.empty(); - } - -} \ No newline at end of file diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/EdgeAssistantMain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/EdgeAssistantMain.java deleted file mode 100644 index db8dcd28..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/EdgeAssistantMain.java +++ /dev/null @@ -1,33 +0,0 @@ -package nu.marginalia.wmsa.edge.assistant; - -import com.google.inject.Guice; -import com.google.inject.Inject; -import com.google.inject.Injector; -import nu.marginalia.wmsa.configuration.MainClass; -import nu.marginalia.wmsa.configuration.ServiceDescriptor; -import 
nu.marginalia.wmsa.configuration.module.ConfigurationModule; -import nu.marginalia.wmsa.configuration.module.DatabaseModule; -import nu.marginalia.wmsa.configuration.server.Initialization; - -public class EdgeAssistantMain extends MainClass { - private final EdgeAssistantService service; - - @Inject - public EdgeAssistantMain(EdgeAssistantService service) { - this.service = service; - } - - public static void main(String... args) { - init(ServiceDescriptor.EDGE_ASSISTANT, args); - - Injector injector = Guice.createInjector( - new EdgeAssistantModule(), - new ConfigurationModule(), - new DatabaseModule() - ); - - injector.getInstance(EdgeAssistantMain.class); - injector.getInstance(Initialization.class).setReady(); - - } -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/dict/TermFrequencyDict.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/dict/TermFrequencyDict.java deleted file mode 100644 index 4d87ec96..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/dict/TermFrequencyDict.java +++ /dev/null @@ -1,218 +0,0 @@ -package nu.marginalia.wmsa.edge.assistant.dict; - -import ca.rmen.porterstemmer.PorterStemmer; -import gnu.trove.map.hash.TLongIntHashMap; -import gnu.trove.set.hash.TLongHashSet; -import nu.marginalia.util.language.LanguageFilter; -import nu.marginalia.util.language.conf.LanguageModels; -import nu.marginalia.util.language.processing.sentence.SentenceExtractor; -import nu.marginalia.util.language.processing.model.DocumentLanguageData; -import nu.marginalia.wmsa.configuration.WmsaHome; -import nu.marginalia.wmsa.edge.converting.processor.logic.DomPruningFilter; -import nu.marginalia.wmsa.edge.crawling.CrawlPlanLoader; -import org.jsoup.Jsoup; -import org.jsoup.nodes.Document; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import javax.annotation.Nullable; -import javax.inject.Inject; -import javax.inject.Singleton; -import java.io.*; -import 
java.nio.charset.StandardCharsets; -import java.nio.file.Files; -import java.nio.file.Path; -import java.util.Arrays; -import java.util.concurrent.ForkJoinPool; -import java.util.concurrent.TimeUnit; -import java.util.concurrent.atomic.AtomicInteger; -import java.util.regex.Pattern; -import java.util.stream.Collectors; - -@Singleton -public class TermFrequencyDict { - - private final TLongIntHashMap wordRates = new TLongIntHashMap(1_000_000, 0.5f, 0, 0); - - private final Logger logger = LoggerFactory.getLogger(getClass()); - private static final Pattern separator = Pattern.compile("[_ ]+"); - private static final PorterStemmer ps = new PorterStemmer(); - - private static final long DOC_COUNT_KEY = ~0L; - private static long fileSize(Path p) throws IOException { - return Files.size(p); - } - - @Inject - public TermFrequencyDict(@Nullable LanguageModels models) { - if (models == null) { - return; - } - - if (models.termFrequencies != null) { - - try (var frequencyData = new DataInputStream(new BufferedInputStream(new FileInputStream(models.termFrequencies.toFile())))) { - - wordRates.ensureCapacity((int)(fileSize(models.termFrequencies)/16)); - - for (;;) { - wordRates.put(frequencyData.readLong(), (int) frequencyData.readLong()); - } - } catch (EOFException eof) { - // ok - } catch (IOException e) { - logger.error("IO Exception reading " + models.termFrequencies, e); - } - } - - logger.info("Read {} N-grams frequencies", wordRates.size()); - } - - - public int docCount() { - int cnt = wordRates.get(DOC_COUNT_KEY); - - if (cnt == 0) { - cnt = 11820118; // legacy - } - return cnt; - } - - public static void main(String... 
args) throws IOException, InterruptedException { - if (args.length != 2) { - System.err.println("Expected arguments: plan.yaml out-file"); - } - String outFile = args[1]; - - var plan = new CrawlPlanLoader().load(Path.of(args[0])); - - ThreadLocal se = ThreadLocal.withInitial(() -> new SentenceExtractor(WmsaHome.getLanguageModels())); - LanguageFilter lf = new LanguageFilter(); - - TLongIntHashMap counts = new TLongIntHashMap(100_000_000, 0.7f, -1, -1); - - ForkJoinPool fjp = new ForkJoinPool(24); - AtomicInteger docCount = new AtomicInteger(); - - for (var domain : plan.domainsIterable()) { // leaks file descriptor, is fine - - if (domain.doc == null) - continue; - - fjp.execute(() -> { - - TLongHashSet words = new TLongHashSet(10_000); - - for (var doc : domain.doc) { - - if (doc.documentBody == null) - continue; - docCount.incrementAndGet(); - - Document parsed = Jsoup.parse(doc.documentBody.decode()); - parsed.body().filter(new DomPruningFilter(0.5)); - - DocumentLanguageData dld = se.get().extractSentences(parsed); - - if (lf.dictionaryAgreement(dld) < 0.1) { - return; - } - - for (var sent : dld.sentences) { - for (var word : sent) { - words.add(longHash(word.stemmed().getBytes(StandardCharsets.UTF_8))); - } - } - - synchronized (counts) { - words.forEach(w -> { - counts.adjustOrPutValue(w, 1, 1); - return true; - }); - } - - words.clear(); - } - - System.out.println(domain.domain + "\t" + counts.size()); - }); - - - } - - fjp.shutdown(); - fjp.awaitTermination(10, TimeUnit.DAYS); - - try (var dos = new DataOutputStream(Files.newOutputStream(Path.of(outFile)))) { - synchronized (counts) { - counts.put(DOC_COUNT_KEY, docCount.get()); - - counts.forEachEntry((hash, cnt) -> { - try { - dos.writeLong(hash); - dos.writeLong(cnt); - } catch (IOException e) { - throw new RuntimeException(e); - } - return true; - }); - } - } - - System.out.println(docCount.get()); - } - - public static long getStringHash(String s) { - String[] strings = separator.split(s); - if 
(s.length() > 1) { - byte[][] parts = new byte[strings.length][]; - for (int i = 0; i < parts.length; i++) { - parts[i] = ps.stemWord(strings[i]).getBytes(); - } - return longHash(parts); - } - else { - return longHash(s.getBytes()); - } - } - public long getTermFreqHash(long hash) { - return wordRates.get(hash); - } - public long getTermFreq(String s) { - return wordRates.get(getStringHash(s)); - } - public long getTermFreqStemmed(String s) { - return wordRates.get(longHash(s.getBytes())); - } - - public static String getStemmedString(String s) { - String[] strings = separator.split(s); - if (s.length() > 1) { - return Arrays.stream(strings).map(ps::stemWord).collect(Collectors.joining("_")); - } - else { - return s; - } - - } - - public static long longHash(byte[]... bytesSets) { - if (bytesSets == null || bytesSets.length == 0) - return 0; - - // https://cp-algorithms.com/string/string-hashing.html - int p = 127; - long m = (1L<<61)-1; - long p_power = 1; - long hash_val = 0; - - for (byte[] bytes: bytesSets) { - for (byte element : bytes) { - hash_val = (hash_val + (element + 1) * p_power) % m; - p_power = (p_power * p) % m; - } - } - return hash_val; - } - -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/dict/WikiArticles.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/dict/WikiArticles.java deleted file mode 100644 index 29cf187e..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/dict/WikiArticles.java +++ /dev/null @@ -1,23 +0,0 @@ -package nu.marginalia.wmsa.edge.assistant.dict; - -import lombok.Getter; -import lombok.ToString; - -import java.util.List; - -@ToString @Getter -public class WikiArticles { - public List entries; - - public WikiArticles(String... 
args) { - entries = List.of(args); - } - public String getPage() { - if (entries.isEmpty()) { - return null; - } - else { - return entries.get(0); - } - } -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/dict/WikiCleaner.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/dict/WikiCleaner.java deleted file mode 100644 index cc70f441..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/dict/WikiCleaner.java +++ /dev/null @@ -1,386 +0,0 @@ -package nu.marginalia.wmsa.edge.assistant.dict; - -import lombok.SneakyThrows; -import net.sourceforge.jeuclid.MathMLParserSupport; -import net.sourceforge.jeuclid.context.Display; -import net.sourceforge.jeuclid.context.LayoutContextImpl; -import net.sourceforge.jeuclid.context.Parameter; -import net.sourceforge.jeuclid.font.FontFactory; -import org.apache.commons.lang3.tuple.Pair; -import org.jetbrains.annotations.NotNull; -import org.jsoup.Jsoup; -import org.jsoup.nodes.*; -import org.jsoup.select.Elements; -import org.jsoup.select.NodeFilter; - -import java.awt.*; -import java.io.ByteArrayOutputStream; -import java.io.IOException; -import java.util.List; -import java.util.*; -import java.util.stream.Collectors; - - -public class WikiCleaner { - - static { - try (var font = ClassLoader.getSystemResourceAsStream("fonts/LM-regular.ttf")) { - FontFactory.getInstance().registerFont(Font.TRUETYPE_FONT, font); - } catch (IOException | FontFormatException e) { - e.printStackTrace(); - } - try (var font = ClassLoader.getSystemResourceAsStream("fonts/STIXTwoMath-Regular.ttf")) { - FontFactory.getInstance().registerFont(Font.TRUETYPE_FONT, font); - } catch (IOException | FontFormatException e) { - e.printStackTrace(); - } - } - public String cleanWikiJunk(String url, String html) { - return cleanWikiJunk(url, Jsoup.parse(html)); - } - - public List extractLinkWords(String data) { - var doc = Jsoup.parse(data); - return getWikiPageLinkText(doc); - } - - public String 
cleanWikiJunk(String url, Document doc) { - - if (doc.getElementById("content") == null) { - return null; - } - List> disambig = getDisambiguationLinks(doc); - List> topLinks = getWikiPageLinks(doc); - - removeTag(doc, "script", "object", "embed", "audio", "style", "noscript", "link", "meta", "img"); - doc.getElementsByClass("mwe-math-element").forEach(this::convertMathTag); - removeByClass(doc, "infobox", "collapsible", "navbar", "printfooter", - "mw-editsection", "thumb", "sidebar", "navbox", "mw-jump-link", - "vertical-navbox"); - removeByClass(doc, "mw-indicators", "noprint", "sistersitebox"); - removeIds(doc, "coordinates", "mw-page-base", "mw-head-base", "site-notice", "contentSub", "contentSub2"); - - doc.getElementsByAttributeValue("role", "presentation").remove(); - - doc.getElementsByTag("a").forEach(atag -> { - var href = atag.attr("href"); - var parent = atag.parent(); - - if ("li".equals(parent.tagName())) { - atag.removeAttr("title"); - if (href.startsWith("http://")) { - atag.addClass("extern-link"); - atag.attr("rel", "nofollow"); - return; - } - } - else { - atag.replaceWith(new TextNode(atag.text())); - } - }); - - doc.getElementsByTag("cite").tagName("span"); - - removeIds(doc, "toc", "catlinks", "Notes", "mw-navigation", "mw-data-after-content", "jump-to-nav"); - removeByClass(doc, "mw-references-wrap", "references", "reference", "siteSub", "refbegin"); - - // doc.getElementById("mw-content-text").insertChildren(0, doc.getElementById("firstHeading")); - doc.getElementById("content").tagName("article"); - doc.getAllElements().forEach(elem -> { - if (elem.parent() != null - && "summary".equals(elem.parent().tagName())) - { - elem.parent().replaceWith(elem); - } - }); - - doc.getElementsByTag("span").forEach(elem -> { - if ("pre".equals(elem.parent().tagName())) { - if (elem.hasClass("linenos")) { - elem.replaceWith(new TextNode(String.format("%-4s", elem.text()))); - } - else { - elem.replaceWith(new TextNode(elem.text())); - } - } - else { - 
elem.replaceWith(new TextNode(" " + elem.text() + " ")); - } - }); - - doc.getElementsByTag("details").forEach(deets -> { - if (deets.children().size() == 1) { - deets.replaceWith(deets.children().first()); - } - else { - deets.tagName("div"); - } - }); - - removeEmptyTags(doc, "li"); - removeEmptyTags(doc, "ul"); - removeEmptyTags(doc, "div"); - - doc.getElementsByTag("p").forEach(elem -> { - if ("blockquote".equals(elem.parent().tagName())) { - elem.replaceWith(new TextNode(elem.text())); - } - }); - - removeEmptyTags(doc, "p"); - - doc.getElementsByTag("h4").forEach(elem -> { - var next = elem.nextElementSibling(); - if (next == null) { - elem.remove(); - return; - } - String nextTagName = next.tagName(); - if ("h4".equals(nextTagName) || "h3".equals(nextTagName) || "h2".equals(nextTagName)) { - elem.remove(); - } - }); - - - doc.getElementsByTag("h3").forEach(elem -> { - var next = elem.nextElementSibling(); - if (next == null) { - elem.remove(); - return; - } - String nextTagName = next.tagName(); - if ("h3".equals(nextTagName) || "h2".equals(nextTagName)) { - elem.remove(); - } - }); - - doc.getElementsByTag("h2").forEach(elem -> { - var next = elem.nextElementSibling(); - if (next == null) { - elem.remove(); - return; - } - if ("h2".equals(next.tagName())) { - elem.remove(); - } - }); - doc.getElementsByTag("footer").remove(); - doc.getElementsByTag("table").forEach(table -> { - table.attr("border", "1"); - }); - doc.getElementsByTag("table").forEach(table -> { - if ("right".equals(table.attr("align"))) { - table.remove(); - } - }); - - doc.getElementsByTag("head").append(""); - doc.getElementsByTag("head").append(""); - doc.getElementsByTag("head").append(""); - doc.getElementsByTag("head").append(""); - doc.getElementsByTag("head").append(""); - doc.getElementsByTag("head").append(""); - - if (!topLinks.isEmpty()) { - doc.getElementsByTag("article").append("

Index of References

"); - } - - if (!disambig.isEmpty()) { - doc.getElementsByTag("h1").first().nextElementSibling().prepend("
See Also" + - disambig.stream().map(href -> ""+href.getValue()+"").collect(Collectors.joining("
")) - + ""); - } - - doc.getElementsByTag("article").first().parent().prepend("
"); - doc.getElementsByTag("article").first().parent().append(""); - - doc.getElementsByTag("div").forEach(tag -> { - if (tag.text().startsWith("This article is issued from Wikipedia")) { - tag.remove(); // we have our own - } - }); - doc.getAllElements().forEach(elem -> { - var classes = elem.classNames().stream().filter(this::isWikiClass).toList(); - classes.forEach(elem::removeClass); - elem.removeAttr("lang"); - elem.removeAttr("dir"); - elem.removeAttr("id"); - elem.removeAttr("role"); - elem.removeAttr("style"); - elem.removeAttr("tabindex"); - elem.removeAttr("aria-haspopup"); - elem.removeAttr("data-section-id"); - elem.removeAttr("aria-expanded"); - elem.removeAttr("aria-pressed"); - elem.removeAttr("open"); - elem.removeAttr("data-level"); - }); - - marginifyHeaders(doc); - - - doc.filter(new NodeFilter() { - @Override - public FilterResult head(Node node, int depth) { - if (node instanceof Comment) { - return FilterResult.REMOVE; - } - return FilterResult.CONTINUE; - } - - @Override - public FilterResult tail(Node node, int depth) { - if (node instanceof Comment) { - return FilterResult.REMOVE; - } - return FilterResult.CONTINUE; - } - }); - return doc.html(); - } - - @SneakyThrows - private void convertMathTag(Element math) { - - try { - var formula = math.getElementsByTag("math"); - var converter = net.sourceforge.jeuclid.converter.Converter.getInstance(); - var sos = new ByteArrayOutputStream(); - var alt = Optional.of(formula.attr("alttext")).filter(s -> !s.isBlank()) - .orElseGet(() -> math.getElementsByTag("annotation").text()); - - var layoutContext = new LayoutContextImpl(LayoutContextImpl.getDefaultLayoutContext()); - - String parentTag = math.parent().tag().getName(); - boolean topLevel = "dd".equals(parentTag) || "div".equals(parentTag) - || (math.nextElementSibling() == null && math.previousElementSibling() == null); - - int mathSize = 16; - if (topLevel) - mathSize = 24; - if ("h1".equals(parentTag)) { - mathSize = 28; - } - if 
("h2".equals(parentTag)) { - mathSize = 24; - } - if ("h3".equals(parentTag)) { - mathSize = 22; - } - layoutContext.setParameter(Parameter.MATHSIZE, mathSize); - - layoutContext.setParameter(Parameter.ANTIALIAS, true); - layoutContext.setParameter(Parameter.SCRIPTMINSIZE, 8); - layoutContext.setParameter(Parameter.FONTS_SERIF, "STIX Two Math"); - layoutContext.setParameter(Parameter.FONTS_SCRIPT, "STIX Two Math"); - layoutContext.setParameter(Parameter.DISPLAY, topLevel ? Display.BLOCK : Display.INLINE); - - converter.convert(MathMLParserSupport.parseString( - formula.html().replace(" ", " ")), sos, - "image/png", - layoutContext).toString(); - - math.tagName("img") - .text("") - .attr("src", "data:image/png;base64," + Base64.getEncoder().encodeToString(sos.toByteArray())) - .attr("alt", alt); - - } - catch (Exception ex) { - ex.printStackTrace(); - } - } - - private void removeEmptyTags(Document doc, String tag) { - doc.getElementsByTag(tag).forEach(elem -> { - if (elem.text().isBlank() && elem.getElementsByTag("img").isEmpty()) { - elem.replaceWith(new TextNode(" ")); - } - - }); - } - - @NotNull - private List> getWikiPageLinks(Document doc) { - List> topLinks = new ArrayList<>(); - doc.select("p a").forEach(atag -> { - String href = atag.attr("href"); - - if (!href.isBlank() - && !href.contains(":") - && !href.startsWith("#") - ) { - topLinks.add(Pair.of(href, atag.attr("title"))); - } - }); - return topLinks; - } - - - @NotNull - private List getWikiPageLinkText(Document doc) { - List topLinks = new ArrayList<>(); - - doc.select("p a,h1,h2,h3,h4,i,em,strong,b").forEach(e -> topLinks.add(e.text())); - - return topLinks; - } - - @NotNull - private List> getDisambiguationLinks(Document doc) { - List> disambig = new ArrayList<>(); - - for (var note: doc.getElementsByClass("hatnote")) { - for (var atag : note.getElementsByTag("a")) { - String href = atag.attr("href"); - if (atag.hasClass("mw-disambig") && !href.isBlank()) { - disambig.add(Pair.of(href, 
atag.attr("title"))); - } - } - } - doc.getElementsByClass("hatnote").remove(); - - return disambig; - } - - private void removeTag(Document doc, String... tags) { - for (String tag : tags) { - doc.getElementsByTag(tag).remove(); - } - } - private void removeByClass(Document doc, String... classes) { - for (String clas: classes) { - doc.getElementsByClass(clas).remove(); - } - } - private void removeIds(Document doc, String... ids) { - Arrays.stream(ids) - .map(doc::getElementById) - .filter(Objects::nonNull) - .forEach(Element::remove); - } - - private void marginifyHeaders(Document doc) { - Elements headers = doc.getElementsByTag("h4"); - if (headers.size() == 0) { - headers = doc.getElementsByTag("h3"); - } - headers.addClass("margin-note"); - } - - boolean isWikiClass(String clazz) { - if ("verb".equals(clazz)) { - return false; - } - if ("extern-link".equals(clazz)) { - return false; - } - if ("margin-note".equals(clazz)) { - return false; - } - return true; - } - -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/dict/WikiSearchResult.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/dict/WikiSearchResult.java deleted file mode 100644 index f3e0f7ac..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/dict/WikiSearchResult.java +++ /dev/null @@ -1,55 +0,0 @@ -package nu.marginalia.wmsa.edge.assistant.dict; - -import lombok.AllArgsConstructor; - -import javax.annotation.Nullable; -import java.net.URLEncoder; -import java.nio.charset.StandardCharsets; -import java.util.Optional; - -@AllArgsConstructor -public class WikiSearchResult { - private final String name; - @Nullable - private final String refName; - - public String getName() { - return name.replace('_', ' '); - } - @Nullable - public String getRefName() { - if (refName == null) - return null; - - return refName.replace('_', ' '); - } - - public String getUrl() { - return "https://encyclopedia.marginalia.nu/wiki/" + 
URLEncoder.encode(getRealName(), StandardCharsets.UTF_8); - } - - public String getRealName() { - return Optional.ofNullable(refName).orElse(name); - } - - public String getInternalName() { - return name; - } - - @Override - public int hashCode() { - return getRealName().hashCode(); - } - - public boolean equals(Object other) { - if (other == this) { - return true; - } - if (other instanceof WikiSearchResult) { - WikiSearchResult r = (WikiSearchResult) other; - return r.getRealName().equals(getRealName()); - } - return false; - } - -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LinkKeywordExtractorMain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LinkKeywordExtractorMain.java deleted file mode 100644 index f9557c97..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LinkKeywordExtractorMain.java +++ /dev/null @@ -1,186 +0,0 @@ -package nu.marginalia.wmsa.edge.converting; - -import gnu.trove.set.hash.TIntHashSet; -import nu.marginalia.wmsa.edge.converting.atags.AnchorTextExtractor; -import nu.marginalia.wmsa.edge.crawling.CrawlPlanLoader; -import nu.marginalia.wmsa.edge.crawling.CrawledDomainReader; -import nu.marginalia.wmsa.edge.crawling.CrawlerSpecificationLoader; -import nu.marginalia.wmsa.edge.crawling.WorkLog; -import nu.marginalia.wmsa.edge.crawling.model.CrawlerDocumentStatus; -import nu.marginalia.wmsa.edge.integration.stackoverflow.StackOverflowPostsReader; -import nu.marginalia.wmsa.edge.integration.wikipedia.WikipediaReader; -import nu.marginalia.wmsa.edge.model.EdgeCrawlPlan; -import nu.marginalia.wmsa.edge.model.EdgeDomain; -import nu.marginalia.wmsa.edge.model.EdgeUrl; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.BufferedOutputStream; -import java.io.FileOutputStream; -import java.io.IOException; -import java.io.OutputStream; -import java.nio.file.Files; -import java.nio.file.Path; -import java.util.ArrayList; -import java.util.HashSet; 
-import java.util.List; -import java.util.Objects; - -public class LinkKeywordExtractorMain { - private static final Logger logger = LoggerFactory.getLogger(LinkKeywordExtractorMain.class); - - public static void main(String... args) throws IOException, InterruptedException { - - if (args.length < 2) { - System.err.println("Arguments: [crawl|so|wiki] crawl-plan.yaml [data]"); - System.exit(0); - } - - String command = args[0]; - var plan = new CrawlPlanLoader().load(Path.of(args[1])); - - switch (command) { - case "crawl": getKeywordsFromCrawl(plan); break; - case "so": getKeywordsFromSo(plan, args[2]); break; - case "wiki": getKeywordsFromWiki(plan, args[2]); break; - default: System.err.println("Unrecognized command"); - } - - } - - private static void getKeywordsFromWiki(EdgeCrawlPlan plan, String arg) throws IOException, InterruptedException { - - - HashSet crawledDomains = new HashSet<>(); - TIntHashSet crawledUrls = new TIntHashSet(50_000_000); - - logger.info("Loading URLs"); - Files.lines(Path.of("/home/vlofgren/good-urls3.txt")) - .filter(url -> !url.contains("stackoverflow") && !url.contains("stackexchange")) - .mapToInt(String::hashCode) - .forEach(crawledUrls::add); - - logger.info("Loading input spec"); - - CrawlerSpecificationLoader.readInputSpec(plan.getJobSpec(), - spec -> { crawledDomains.add(spec.domain); }); - - try (var output = new UrlKeywordTsvWriter(Path.of("links.tsv"))) { - AnchorTextExtractor anchorTextExtractor = new AnchorTextExtractor(domain -> crawledDomains.contains(domain) - && !domain.contains("wiki") - && !domain.contains("isni") - && !domain.contains("wiktionary"), - url -> crawledUrls.contains(url.toString().hashCode()), - output::write); - - new WikipediaReader(arg, new EdgeDomain("invalid.example"), article -> { - anchorTextExtractor.processDocument(article.getUrl().toString(), article.body); - }).join(); - } - catch (IOException ex) { - ex.printStackTrace(); - } - - - - } - - private static void getKeywordsFromSo(EdgeCrawlPlan 
plan, String arg) throws IOException, InterruptedException { - TIntHashSet crawledUrls = new TIntHashSet(50_000_000); - - logger.info("Loading URLs"); - Files.lines(Path.of("/home/vlofgren/good-urls3.txt")) - .filter(url -> !url.contains("stackoverflow") && !url.contains("stackexchange")) - .mapToInt(String::hashCode) - .forEach(crawledUrls::add); - - logger.info("Loading input spec"); - - HashSet crawledDomains = new HashSet<>(); - CrawlerSpecificationLoader.readInputSpec(plan.getJobSpec(), - spec -> crawledDomains.add(spec.domain)); - - crawledDomains.remove("jsfiddle.net"); // like 30% of SO's links go here - crawledDomains.remove("jsbin.com"); - crawledDomains.remove("codepad.org"); - - - try (var output = new UrlKeywordTsvWriter(Path.of("links.tsv"))) { - AnchorTextExtractor anchorTextExtractor = new AnchorTextExtractor(crawledDomains::contains, - url -> crawledUrls.contains(url.toString().hashCode()), - output::write); - - new StackOverflowPostsReader(arg, new EdgeDomain("invalid.example"), post -> { - anchorTextExtractor.processDocument(post.getUrl().toString(), post.fullBody); - }).join(); - } - catch (IOException ex) { - ex.printStackTrace(); - } - } - - - public static void getKeywordsFromCrawl(EdgeCrawlPlan plan) throws IOException { - - logger.info("Loading input spec"); - - HashSet crawledDomains = new HashSet<>(); - CrawlerSpecificationLoader.readInputSpec(plan.getJobSpec(), - spec -> crawledDomains.add(spec.domain)); - - List fileNames = new ArrayList<>(); - - logger.info("Replaying crawl log"); - WorkLog.readLog(plan.crawl.getLogFile(), - entry -> fileNames.add(entry.path())); - - try (var output = new UrlKeywordTsvWriter(Path.of("links.tsv"))) { - AnchorTextExtractor anchorTextExtractor = new AnchorTextExtractor(crawledDomains::contains, - (url) -> true, - //url -> crawledUrls.contains(url.toString().hashCode()), - output::write); - - logger.info("Reading files"); - for (var fn : fileNames) { - CrawledDomainReader crawledDomainReader = new 
CrawledDomainReader(); - var crawledDomain = crawledDomainReader.read(plan.getCrawledFilePath(fn)); - if (crawledDomain.doc == null) continue; - - System.out.println("# " + crawledDomain.domain); - - for (var doc : crawledDomain.doc) { - if (Objects.equals(doc.crawlerStatus, CrawlerDocumentStatus.OK.name())) { - anchorTextExtractor.processDocument(doc.url, doc.documentBody.decode()); - } - } - } - } - - } - - private static class UrlKeywordTsvWriter implements AutoCloseable { - - private final OutputStream stream; - - UrlKeywordTsvWriter(Path outputFile) throws IOException { - this.stream = new BufferedOutputStream(new FileOutputStream(outputFile.toFile())); - } - - void write(EdgeUrl url, String keyword) { - try { - stream.write(url.toString().getBytes()); - stream.write('\t'); - stream.write(keyword.getBytes()); - stream.write('\n'); - } catch (IOException e) { - throw new RuntimeException(e); - } - } - - @Override - public void close() throws IOException { - stream.close(); - } - } - -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LinkKeywordLoaderMain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LinkKeywordLoaderMain.java deleted file mode 100644 index f00c54ee..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LinkKeywordLoaderMain.java +++ /dev/null @@ -1,110 +0,0 @@ -package nu.marginalia.wmsa.edge.converting; - -import nu.marginalia.wmsa.configuration.module.DatabaseModule; -import nu.marginalia.wmsa.edge.index.client.EdgeIndexClient; - -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; -import java.sql.SQLException; -import java.util.*; -import java.util.function.Consumer; - -public class LinkKeywordLoaderMain { - - public static void main(String... 
args) { - - Map urlToId = getUrls(); - try (EdgeIndexClient indexClient = new EdgeIndexClient(); - var lines = Files.lines(Path.of(args[0])) - ) { - lines - .map(UrlKeyword::parseLine) - .filter(Objects::nonNull) - .forEach(new Uploader(urlToId, indexClient)); - - } catch (IOException e) { - throw new RuntimeException(e); - } - } - - private record UrlKeyword(String url, String keyword) { - public static UrlKeyword parseLine(String line) { - int idx = line.indexOf('\t'); - if (idx > 0) { - return new UrlKeyword(line.substring(0, idx), line.substring(idx+1)); - } - return null; - } - } - - private static class Uploader implements Consumer { - private Map urlToId; - private final EdgeIndexClient indexClient; - - private Uploader(Map urlToId, - EdgeIndexClient indexClient) { - this.urlToId = urlToId; - this.indexClient = indexClient; - } - - String lastLine = null; - Set keywords = new HashSet<>(100); - - @Override - public void accept(UrlKeyword urlKeyword) { - if (urlKeyword == null) return; - - if (lastLine == null) { - lastLine = urlKeyword.url; - keywords.add(urlKeyword.keyword); - } - else if (urlKeyword.url.equals(lastLine)) { - keywords.add(urlKeyword.keyword); - } - else { - Long id = urlToId.get(lastLine); - - if (id != null) { - int urlId = (int)(id & 0xFFFF_FFFFL); - int domainId = (int)(id >>> 32L); - - System.out.println(lastLine + " -/- " + domainId + ":" + urlId + " : " + keywords); - -// indexClient.putWords(Context.internal(), new EdgeId<>(domainId), new EdgeId<>(urlId), -// new DocumentKeywords(IndexBlock.Link, keywords.toArray(String[]::new)), 0); - } - - lastLine = urlKeyword.url; - keywords.clear(); - keywords.add(urlKeyword.keyword); - } - } - } - - private static Map getUrls() { - - Map urls = new HashMap<>(100_000); - - try (var ds = new DatabaseModule().provideConnection(); - var conn = ds.getConnection(); - var stmt = conn.createStatement()) - { - stmt.setFetchSize(10000); - var rsp = stmt.executeQuery("SELECT URL, ID, DOMAIN_ID FROM 
EC_URL_VIEW WHERE TITLE IS NOT NULL"); - - while (rsp.next()) { - long val = rsp.getInt(3); - val = (val << 32L) | rsp.getInt(2); - - urls.put(rsp.getString(1), val); - } - - } - catch (SQLException ex) { - throw new RuntimeException(ex); - } - - return urls; - } -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ReindexTriggerMain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ReindexTriggerMain.java deleted file mode 100644 index 9d67ae36..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ReindexTriggerMain.java +++ /dev/null @@ -1,59 +0,0 @@ -package nu.marginalia.wmsa.edge.converting; - -import nu.marginalia.wmsa.configuration.ServiceDescriptor; -import nu.marginalia.wmsa.configuration.module.DatabaseModule; -import okhttp3.MediaType; -import okhttp3.OkHttpClient; -import okhttp3.Request; -import okhttp3.RequestBody; -import okio.BufferedSink; -import org.jetbrains.annotations.Nullable; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.IOException; -import java.net.URL; -import java.nio.charset.Charset; -import java.sql.SQLException; -import java.util.concurrent.TimeUnit; - - -public class ReindexTriggerMain { - private static final Logger logger = LoggerFactory.getLogger(ReindexTriggerMain.class); - - public static void main(String... 
args) throws IOException, SQLException { - var db = new DatabaseModule(); - var client = new OkHttpClient.Builder() - .connectTimeout(100, TimeUnit.MILLISECONDS) - .readTimeout(15, TimeUnit.MINUTES) - .retryOnConnectionFailure(true) - .followRedirects(true) - .build(); - - logger.info("Updating statistics"); - var updateStatistics = new UpdateDomainStatistics(db.provideConnection()); - updateStatistics.run(); - - var rb = new RequestBody() { - - @Nullable - @Override - public MediaType contentType() { - return MediaType.parse("text/plain"); - } - - @Override - public void writeTo(BufferedSink sink) throws IOException { - sink.writeString("NOOP", Charset.defaultCharset()); - } - }; - - logger.info("Repartitioning"); - client.newCall(new Request.Builder().post(rb).url(new URL("http", args[0], ServiceDescriptor.EDGE_INDEX.port, "/ops/repartition")).build()).execute(); - logger.info("Reindexing"); - client.newCall(new Request.Builder().post(rb).url(new URL("http", args[0], ServiceDescriptor.EDGE_INDEX.port, "/ops/reindex")).build()).execute(); - - } - - -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/atags/AnchorTextExtractor.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/atags/AnchorTextExtractor.java deleted file mode 100644 index e22f1e1b..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/atags/AnchorTextExtractor.java +++ /dev/null @@ -1,264 +0,0 @@ -package nu.marginalia.wmsa.edge.converting.atags; - -import com.google.common.hash.HashFunction; -import com.google.common.hash.Hashing; -import lombok.SneakyThrows; -import nu.marginalia.util.DenseBitMap; -import nu.marginalia.util.language.WordPatterns; -import nu.marginalia.wmsa.configuration.WmsaHome; -import nu.marginalia.wmsa.edge.assistant.dict.NGramBloomFilter; -import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict; -import nu.marginalia.wmsa.edge.converting.processor.logic.LinkParser; -import 
nu.marginalia.wmsa.edge.model.EdgeUrl; -import org.apache.logging.log4j.util.Strings; -import org.jsoup.Jsoup; -import org.jsoup.nodes.Document; -import org.jsoup.nodes.Element; - -import java.io.IOException; -import java.nio.charset.StandardCharsets; -import java.util.Arrays; -import java.util.Objects; -import java.util.Set; -import java.util.function.BiConsumer; -import java.util.function.Predicate; -import java.util.regex.Pattern; - -public class AnchorTextExtractor { - private final Predicate includeDomainPredicate; - private final Predicate includeUrlPredicate; - private final BiConsumer linkKeywordConsumer; - - private final LinkParser linkParser = new LinkParser(); - - private final HashFunction hashFunction = Hashing.murmur3_128(); - - // This bit map is used as a bloom filter to deduplicate url-keyword combinations - // false positives are expected, but that's an acceptable trade-off to not have to deal with - // de-duplicating billions of shuffled (url, word) tuples on limited hardware - private final DenseBitMap deduplicateHashBitset = new DenseBitMap(DenseBitMap.MAX_CAPACITY_2GB_16BN_ITEMS); - - private final NGramBloomFilter nGramBloomFilter; - private final TermFrequencyDict termFrequencyDict; - - public AnchorTextExtractor(Predicate includeDomainPredicate, - Predicate includeUrlPredicate, - BiConsumer linkKeywordConsumer) throws IOException { - this.includeDomainPredicate = includeDomainPredicate; - this.includeUrlPredicate = includeUrlPredicate; - this.linkKeywordConsumer = linkKeywordConsumer; - - nGramBloomFilter = new NGramBloomFilter(WmsaHome.getLanguageModels()); - termFrequencyDict = new TermFrequencyDict(WmsaHome.getLanguageModels()); - } - - @SneakyThrows - public void processDocument(String docUrl, String documentBody) { - final Document processed = Jsoup.parse(documentBody); - final EdgeUrl documentUrl = new EdgeUrl(docUrl); - - for (var link : processed.getElementsByTag("a")) { - if (link.hasAttr("href")) { - String href = 
link.attr("href"); - String text = getLinkText(link); - - processAnchor(documentUrl, href, text); - } - } - } - - private final Pattern anchorTextNoise = Pattern.compile("[ \t\n\"()“”]+"); - - private String getLinkText(Element link) { - String text = link.text(); - - if (link.text().isBlank()) { - for (var img: link.getElementsByTag("img")) { - if (img.hasAttr("alt")) { - text = img.attr("alt"); - break; - } - } - } - - return anchorTextNoise.matcher(text.toLowerCase()).replaceAll(" ").trim(); - } - - Set excludedTerminators = Set.of("a", "for", "of", "in", "with", "but", "as", "by", "on", "to", "at", "-"); - - private void processAnchor(EdgeUrl documentUrl, String href, String text) { - text = trimText(text); - - if (!isInterestingAnchorText(text)) { - return; - } - - var optLinkUrl = linkParser.parseLink(documentUrl, href); - if (optLinkUrl.isEmpty()) return; - - var linkUrl = optLinkUrl.get(); - - if (!isInterestingAnchorLink(linkUrl)) { - return; - } - - if (Objects.equals(domainHash(linkUrl), domainHash(documentUrl))) { - return; - } - - String[] wordParts = anchorTextNoise.split(text.toLowerCase()); - - if (wordParts.length > 1) { - String word = Strings.join(Arrays.asList(wordParts), '_'); - - addKeywordIfExistsInTermFreqDictionary(linkUrl, word); - - if (word.contains(".")) { - addKeywordIfExistsInTermFreqDictionary(linkUrl, removePeriods(word)); - } - - if (wordParts.length > 2) { - for (int i = 1; i < wordParts.length; i++) { - if (excludedTerminators.contains(wordParts[i])) continue; - if (excludedTerminators.contains(wordParts[i-1])) continue; - - word = wordParts[i-1] + "_" + wordParts[i]; - addKeywordIfExistsInTermFreqDictionary(linkUrl, word); - - if (word.contains(".")) { - addKeywordIfExistsInTermFreqDictionary(linkUrl, removePeriods(word)); - } - } - } - - if (wordParts.length > 3) { - for (int i = 2; i < wordParts.length; i++) { - if (excludedTerminators.contains(wordParts[i])) continue; - if (excludedTerminators.contains(wordParts[i-2])) 
continue; - - word = wordParts[i-2] + "_" + wordParts[i-1] + "_" + wordParts[i]; - - addKeywordIfExistsInTermFreqDictionary(linkUrl, word); - - if (word.contains(".")) { - word = removePeriods(word); - addKeywordIfExistsInTermFreqDictionary(linkUrl, removePeriods(word)); - } - } - } - - } - - for (String word: wordParts) { - if (!WordPatterns.isStopWord(word) - && WordPatterns.filter(word) - && isNewKeywordForLink(word, linkUrl.toString()) - ) { - linkKeywordConsumer.accept(linkUrl, word); - } - } - - for (String word: wordParts) { - if (word.length() > 2 && word.endsWith("'s")) { - word = word.substring(0, word.length()-2); - } - - if (!WordPatterns.isStopWord(word) - && WordPatterns.filter(word) - && isNewKeywordForLink(word, linkUrl.toString()) - ) { - linkKeywordConsumer.accept(linkUrl, word); - } - } - } - - private void addKeywordIfExistsInTermFreqDictionary(EdgeUrl linkUrl, String word) { - if (termFrequencyDict.getTermFreq(word) > 0 || nGramBloomFilter.isKnownNGram(word)) { - if (isNewKeywordForLink(word, linkUrl.toString())) { - linkKeywordConsumer.accept(linkUrl, word); - } - } - } - - Pattern p = Pattern.compile("\\."); - private String removePeriods(String s) { - return p.matcher(s).replaceAll(""); - } - - private String domainHash(EdgeUrl url) { - var domain = url.domain; - if ("www".equals(domain.subDomain)) { - return domain.domain; - } - return domain.toString(); - } - - private String trimText(String text) { - int start = text.length()-1; - int end = 0; - - for (int i = text.length(); i > 0; i--) { - if (Character.isLetterOrDigit(text.charAt(i-1))) { - end = i; - break; - } - } - - for (int i = 0; i < end; i++) { - if (Character.isLetterOrDigit(text.charAt(i))) { - start = i; - break; - } - } - - if (start >= 0 && start < end) { - return text.substring(start, end); - } - - return ""; - } - - // This pattern doesn't need to perfectly capture all anchor texts that are URLs, if it gets 95% that's fine - private final Predicate looksLikeAnURL = 
Pattern.compile("(\\p{Alpha}+://)?[\\p{Alnum}.]+(/[^/]+)+").asMatchPredicate(); - - private boolean isInterestingAnchorText(String text) { - if (text.isBlank()) return false; - if (text.length() > 32) return false; - - // Google loves questions, and so does SEO spammers - if (text.endsWith("?")) return false; - - if (text.startsWith("http:") || text.startsWith("https:")) return false; - - if (looksLikeAnURL.test(text)) return false; - - return switch (text) { - case "this", "here", "click", "click here", "download", "source" -> false; - default -> true; - }; - } - - private boolean isInterestingAnchorLink(EdgeUrl linkUrl) { - if (!(linkUrl.proto.endsWith("http") || linkUrl.proto.equals("https"))) { - return false; - } - - if (!includeUrlPredicate.test(linkUrl)) { - return false; - } - - return includeDomainPredicate.test(linkUrl.domain.toString()); - } - - private synchronized boolean isNewKeywordForLink(String href, String text) { - long hash = 0; - - hash ^= hashFunction.hashString(href, StandardCharsets.UTF_8).padToLong(); - hash ^= hashFunction.hashString(text, StandardCharsets.UTF_8).padToLong(); - - // Remove sign bit because we don't want a negative index in deduplicateHashBitset - hash &= 0x7FFF_FFFF_FFFF_FFFFL; - - return !deduplicateHashBitset.set(hash % deduplicateHashBitset.cardinality); - } -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/compiler/RedirectCompiler.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/compiler/RedirectCompiler.java deleted file mode 100644 index 2a1e42f7..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/compiler/RedirectCompiler.java +++ /dev/null @@ -1,19 +0,0 @@ -package nu.marginalia.wmsa.edge.converting.compiler; - -import nu.marginalia.wmsa.edge.converting.interpreter.Instruction; -import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DomainLink; -import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadDomain; 
-import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadDomainLink; -import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadDomainRedirect; -import nu.marginalia.wmsa.edge.model.EdgeDomain; - -import java.util.List; - -public class RedirectCompiler { - - public void compile(List ret, EdgeDomain from, EdgeDomain to) { - ret.add(new LoadDomain(to)); - ret.add(new LoadDomainLink(new DomainLink(from, to))); - ret.add(new LoadDomainRedirect(new DomainLink(from, to))); - } -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/interpreter/Interpreter.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/interpreter/Interpreter.java deleted file mode 100644 index 5e9c3e4d..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/interpreter/Interpreter.java +++ /dev/null @@ -1,25 +0,0 @@ -package nu.marginalia.wmsa.edge.converting.interpreter; - -import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DocumentKeywords; -import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DomainLink; -import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadProcessedDocument; -import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadProcessedDocumentWithError; -import nu.marginalia.wmsa.edge.index.model.EdgePageDocumentsMetadata; -import nu.marginalia.wmsa.edge.model.EdgeDomain; -import nu.marginalia.wmsa.edge.model.EdgeUrl; -import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState; - -public interface Interpreter { - void loadUrl(EdgeUrl[] url); - void loadDomain(EdgeDomain[] domain); - void loadRssFeed(EdgeUrl[] rssFeed); - void loadDomainLink(DomainLink[] links); - - void loadProcessedDomain(EdgeDomain domain, EdgeDomainIndexingState state, String ip); - void loadProcessedDocument(LoadProcessedDocument loadProcessedDocument); - void loadProcessedDocumentWithError(LoadProcessedDocumentWithError loadProcessedDocumentWithError); - - void 
loadKeywords(EdgeUrl url, EdgePageDocumentsMetadata metadata, DocumentKeywords words); - - void loadDomainRedirect(DomainLink link); -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/interpreter/instruction/DomainLink.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/interpreter/instruction/DomainLink.java deleted file mode 100644 index 338e345c..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/interpreter/instruction/DomainLink.java +++ /dev/null @@ -1,6 +0,0 @@ -package nu.marginalia.wmsa.edge.converting.interpreter.instruction; - -import nu.marginalia.wmsa.edge.model.EdgeDomain; - -public record DomainLink(EdgeDomain from, EdgeDomain to) { -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java deleted file mode 100644 index 71ac3945..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java +++ /dev/null @@ -1,425 +0,0 @@ -package nu.marginalia.wmsa.edge.converting.processor; - -import com.google.common.hash.HashCode; -import com.google.inject.Inject; -import com.google.inject.name.Named; -import nu.marginalia.util.gregex.GuardedRegex; -import nu.marginalia.util.gregex.GuardedRegexFactory; -import nu.marginalia.util.language.LanguageFilter; -import nu.marginalia.util.language.processing.DocumentKeywordExtractor; -import nu.marginalia.util.language.processing.sentence.SentenceExtractor; -import nu.marginalia.util.language.processing.model.DocumentLanguageData; -import nu.marginalia.util.language.processing.model.KeywordMetadata; -import nu.marginalia.wmsa.edge.converting.model.DisqualifiedException; -import nu.marginalia.wmsa.edge.converting.model.DisqualifiedException.DisqualificationReason; -import nu.marginalia.wmsa.edge.converting.model.ProcessedDocument; -import 
nu.marginalia.wmsa.edge.converting.model.ProcessedDocumentDetails; -import nu.marginalia.wmsa.edge.converting.processor.logic.*; -import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDate; -import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateSniffer; -import nu.marginalia.wmsa.edge.crawling.model.CrawledDocument; -import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain; -import nu.marginalia.wmsa.edge.crawling.model.CrawlerDocumentStatus; -import nu.marginalia.wmsa.edge.index.model.EdgePageDocumentFlags; -import nu.marginalia.wmsa.edge.index.model.EdgePageDocumentsMetadata; -import nu.marginalia.wmsa.edge.model.EdgeDomain; -import nu.marginalia.wmsa.edge.model.EdgeUrl; -import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard; -import nu.marginalia.wmsa.edge.model.crawl.EdgePageWords; -import nu.marginalia.wmsa.edge.model.crawl.EdgeUrlState; -import org.jsoup.Jsoup; -import org.jsoup.nodes.Document; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.net.URISyntaxException; -import java.nio.file.Path; -import java.util.*; - -import static nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard.UNKNOWN; - -public class DocumentProcessor { - - private final Logger logger = LoggerFactory.getLogger(getClass()); - - private final int minDocumentLength; - private final double minDocumentQuality; - - private static final Set acceptedContentTypes = Set.of("application/xhtml+xml", "application/xhtml", "text/html"); - - private final SentenceExtractor sentenceExtractor; - private final FeatureExtractor featureExtractor; - private final TitleExtractor titleExtractor; - private final DocumentKeywordExtractor keywordExtractor; - private final SummaryExtractor summaryExtractor; - private final PubDateSniffer pubDateSniffer; - - private static final DocumentValuator documentValuator = new DocumentValuator(); - private static final LanguageFilter languageFilter = new LanguageFilter(); - private static final LinkParser 
linkParser = new LinkParser(); - private static final FeedExtractor feedExtractor = new FeedExtractor(linkParser); - - @Inject - public DocumentProcessor(@Named("min-document-length") Integer minDocumentLength, - @Named("min-document-quality") Double minDocumentQuality, - SentenceExtractor sentenceExtractor, - FeatureExtractor featureExtractor, - TitleExtractor titleExtractor, - DocumentKeywordExtractor keywordExtractor, - SummaryExtractor summaryExtractor, - PubDateSniffer pubDateSniffer) - { - this.minDocumentLength = minDocumentLength; - this.minDocumentQuality = minDocumentQuality; - this.sentenceExtractor = sentenceExtractor; - this.featureExtractor = featureExtractor; - this.titleExtractor = titleExtractor; - this.keywordExtractor = keywordExtractor; - this.summaryExtractor = summaryExtractor; - this.pubDateSniffer = pubDateSniffer; - } - - public ProcessedDocument makeDisqualifiedStub(CrawledDocument crawledDocument) { - ProcessedDocument ret = new ProcessedDocument(); - - try { - ret.state = EdgeUrlState.DISQUALIFIED; - ret.url = getDocumentUrl(crawledDocument); - } - catch (Exception ex) {} - - return ret; - } - - public ProcessedDocument process(CrawledDocument crawledDocument, CrawledDomain crawledDomain) { - ProcessedDocument ret = new ProcessedDocument(); - - try { - processDocument(crawledDocument, crawledDomain, ret); - } - catch (DisqualifiedException ex) { - ret.state = EdgeUrlState.DISQUALIFIED; - ret.stateReason = ex.reason.toString(); - logger.debug("Disqualified {}: {}", ret.url, ex.reason); - } - catch (Exception ex) { - ret.state = EdgeUrlState.DISQUALIFIED; - ret.stateReason = DisqualificationReason.PROCESSING_EXCEPTION.toString(); - logger.info("Failed to convert " + crawledDocument.url, ex); - ex.printStackTrace(); - } - - return ret; - } - - private void processDocument(CrawledDocument crawledDocument, CrawledDomain crawledDomain, ProcessedDocument ret) throws URISyntaxException, DisqualifiedException { - - var crawlerStatus = 
CrawlerDocumentStatus.valueOf(crawledDocument.crawlerStatus); - if (crawlerStatus != CrawlerDocumentStatus.OK) { - throw new DisqualifiedException(crawlerStatus); - } - - if (AcceptableAds.hasAcceptableAdsHeader(crawledDocument)) { - throw new DisqualifiedException(DisqualificationReason.ACCEPTABLE_ADS); - } - - if (!isAcceptedContentType(crawledDocument)) { - throw new DisqualifiedException(DisqualificationReason.CONTENT_TYPE); - } - - - ret.url = getDocumentUrl(crawledDocument); - ret.state = crawlerStatusToUrlState(crawledDocument.crawlerStatus, crawledDocument.httpStatus); - - var detailsWithWords = createDetails(crawledDomain, crawledDocument); - - ret.details = detailsWithWords.details(); - ret.words = detailsWithWords.words(); - } - - - private EdgeUrl getDocumentUrl(CrawledDocument crawledDocument) - throws URISyntaxException - { - if (crawledDocument.canonicalUrl != null) { - try { - return new EdgeUrl(crawledDocument.canonicalUrl); - } - catch (URISyntaxException ex) { /* fallthrough */ } - } - - return new EdgeUrl(crawledDocument.url); - } - - public static boolean isAcceptedContentType(CrawledDocument crawledDocument) { - if (crawledDocument.contentType == null) { - return false; - } - - var ct = crawledDocument.contentType; - - if (acceptedContentTypes.contains(ct)) - return true; - - if (ct.contains(";")) { - return acceptedContentTypes.contains(ct.substring(0, ct.indexOf(';'))); - } - return false; - } - - private EdgeUrlState crawlerStatusToUrlState(String crawlerStatus, int httpStatus) { - return switch (CrawlerDocumentStatus.valueOf(crawlerStatus)) { - case OK -> httpStatus < 300 ? 
EdgeUrlState.OK : EdgeUrlState.DEAD; - case REDIRECT -> EdgeUrlState.REDIRECT; - default -> EdgeUrlState.DEAD; - }; - } - - private DetailsWithWords createDetails(CrawledDomain crawledDomain, CrawledDocument crawledDocument) - throws DisqualifiedException, URISyntaxException { - - String documentBody = crawledDocument.documentBody.decode(); - - if (languageFilter.isBlockedUnicodeRange(documentBody)) { - throw new DisqualifiedException(DisqualificationReason.LANGUAGE); - } - - Document doc = Jsoup.parse(documentBody); - - if (AcceptableAds.hasAcceptableAdsTag(doc)) { - // I've never encountered a website where this hasn't been a severe indicator - // of spam - - throw new DisqualifiedException(DisqualificationReason.ACCEPTABLE_ADS); - } - - if (doc.select("meta[name=robots]").attr("content").contains("noindex")) { - throw new DisqualifiedException(DisqualificationReason.FORBIDDEN); - } - - final EdgeUrl url = new EdgeUrl(crawledDocument.url); - - Document prunedDoc = doc.clone(); - - prunedDoc.getElementsByTag("svg").remove(); - prunedDoc.body().filter(new DomPruningFilter(0.5)); - - var dld = sentenceExtractor.extractSentences(prunedDoc); - - checkDocumentLanguage(dld); - - var ret = new ProcessedDocumentDetails(); - - ret.length = getLength(doc); - ret.standard = getHtmlStandard(doc); - ret.title = titleExtractor.getTitleAbbreviated(doc, dld, crawledDocument.url); - - ret.quality = documentValuator.getQuality(crawledDocument, ret.standard, doc, dld); - ret.hashCode = HashCode.fromString(crawledDocument.documentBodyHash).asLong(); - - KeywordMetadata keywordMetadata = new KeywordMetadata(); - - PubDate pubDate; - EdgePageWords words; - if (shouldDoSimpleProcessing(url, dld, ret)) { - /* Some documents we'll index, but only superficially. This is a compromise - to allow them to be discoverable, without having them show up without specific - queries. This also saves a lot of processing power. 
- */ - ret.features = Set.of(HtmlFeature.UNKNOWN); - words = keywordExtractor.extractKeywordsMinimal(dld, keywordMetadata); - ret.description = ""; - - pubDate = pubDateSniffer.getPubDate(crawledDocument.headers, url, doc, ret.standard, false); - - ret.metadata = new EdgePageDocumentsMetadata(url.depth(), pubDate.yearByte(), 0, (int) -ret.quality, EnumSet.of(EdgePageDocumentFlags.Simple)); - - } - else { - ret.features = featureExtractor.getFeatures(crawledDomain, doc, dld); - words = keywordExtractor.extractKeywords(dld, keywordMetadata); - ret.description = getDescription(doc); - - pubDate = pubDateSniffer.getPubDate(crawledDocument.headers, url, doc, ret.standard, true); - - ret.metadata = new EdgePageDocumentsMetadata(url.depth(), pubDate.yearByte(), 0, (int) -ret.quality, EnumSet.noneOf(EdgePageDocumentFlags.class)); - } - - addMetaWords(ret, url, pubDate, crawledDomain, words); - - getLinks(url, ret, doc, words); - - if (pubDate.hasYear()) { - ret.pubYear = pubDate.year(); - } - - return new DetailsWithWords(ret, words); - } - - private static final GuardedRegex mastodonFeedRegex = GuardedRegexFactory.startsWith("/@", "^/@[^/]+/?$"); - - private boolean shouldDoSimpleProcessing(EdgeUrl url, DocumentLanguageData dld, ProcessedDocumentDetails ret) { - if (ret.quality < minDocumentQuality) { - return true; - } - if (dld.totalNumWords() < minDocumentLength) { - return true; - } - // These pages shouldn't be publicly accessible - if ("phpinfo()".equals(ret.title)) { - return true; - } - - // Urls that look like /@foo are typically Mastodon or other twitter-like feeds, - // we don't want to index them because they change so rapidly; subdirectories are - // fine though - // - if (mastodonFeedRegex.test(url.path)) { - return true; - } - - // Annoying wordpress crap - if (url.path.startsWith("/tag/") && url.path.endsWith("/")) { - return true; - } - return false; - } - - private void addMetaWords(ProcessedDocumentDetails ret, EdgeUrl url, PubDate pubDate, 
CrawledDomain domain, EdgePageWords words) { - List tagWords = new ArrayList<>(); - - var edgeDomain = url.domain; - tagWords.add("format:"+ret.standard.toString().toLowerCase()); - - tagWords.add("site:" + edgeDomain.toString().toLowerCase()); - if (!Objects.equals(edgeDomain.toString(), edgeDomain.domain)) { - tagWords.add("site:" + edgeDomain.domain.toLowerCase()); - } - - tagWords.add("tld:" + edgeDomain.getTld()); - - tagWords.add("proto:"+url.proto.toLowerCase()); - tagWords.add("js:" + Boolean.toString(ret.features.contains(HtmlFeature.JS)).toLowerCase()); - - if (domain.ip != null) { - tagWords.add("ip:" + domain.ip.toLowerCase()); // lower case because IPv6 is hexadecimal - } - - ret.features.stream().map(HtmlFeature::getKeyword).forEach(tagWords::add); - - if (pubDate.year() > 1900) { - tagWords.add("year:" + pubDate.year()); - } - if (pubDate.dateIso8601() != null) { - tagWords.add("pub:" + pubDate.dateIso8601()); - } - - words.addAllSyntheticTerms(tagWords); - } - - private void getLinks(EdgeUrl baseUrl, ProcessedDocumentDetails ret, Document doc, EdgePageWords words) { - - final LinkProcessor lp = new LinkProcessor(ret, baseUrl); - - baseUrl = linkParser.getBaseLink(doc, baseUrl); - - EdgeDomain domain = baseUrl.domain; - - for (var atag : doc.getElementsByTag("a")) { - var linkOpt = linkParser.parseLinkPermissive(baseUrl, atag); - if (linkParser.shouldIndexLink(atag)) { - linkOpt.ifPresent(lp::accept); - } - else { - linkOpt - .filter(url -> linkParser.hasBinarySuffix(url.path.toLowerCase())) - .ifPresent(lp::acceptNonIndexable); - } - } - for (var frame : doc.getElementsByTag("frame")) { - linkParser.parseFrame(baseUrl, frame).ifPresent(lp::accept); - } - for (var frame : doc.getElementsByTag("iframe")) { - linkParser.parseFrame(baseUrl, frame).ifPresent(lp::accept); - } - for (var link : doc.select("link[rel=alternate]")) { - feedExtractor - .getFeedFromAlternateTag(baseUrl, link) - .ifPresent(lp::acceptFeed); - } - - createLinkKeywords(words, lp); 
- createFileLinkKeywords(words, lp, domain); - } - - private void createLinkKeywords(EdgePageWords words, LinkProcessor lp) { - final Set linkTerms = new HashSet<>(); - - for (var fd : lp.getForeignDomains()) { - linkTerms.add("links:"+fd.toString().toLowerCase()); - linkTerms.add("links:"+fd.getDomain().toLowerCase()); - } - words.addAllSyntheticTerms(linkTerms); - } - - private void createFileLinkKeywords(EdgePageWords words, LinkProcessor lp, EdgeDomain domain) { - Set fileKeywords = new HashSet<>(100); - for (var link : lp.getNonIndexableUrls()) { - - if (!domain.hasSameTopDomain(link.domain)) { - continue; - } - - synthesizeFilenameKeyword(fileKeywords, link); - - } - - words.addAllSyntheticTerms(fileKeywords); - } - - private void synthesizeFilenameKeyword(Set fileKeywords, EdgeUrl link) { - - Path pFilename = Path.of(link.path.toLowerCase()).getFileName(); - - if (pFilename == null) return; - - String filename = pFilename.toString(); - if (filename.length() > 32 - || filename.endsWith(".xml") - || filename.endsWith(".jpg") - || filename.endsWith(".png") - || filename.endsWith(".pdf") - || filename.endsWith(".gif")) - return; - - fileKeywords.add(filename.replace(' ', '_')); - } - - private void checkDocumentLanguage(DocumentLanguageData dld) throws DisqualifiedException { - double languageAgreement = languageFilter.dictionaryAgreement(dld); - if (languageAgreement < 0.1) { - throw new DisqualifiedException(DisqualificationReason.LANGUAGE); - } - } - - private EdgeHtmlStandard getHtmlStandard(Document doc) { - EdgeHtmlStandard htmlStandard = HtmlStandardExtractor.parseDocType(doc.documentType()); - - if (UNKNOWN.equals(htmlStandard)) { - return HtmlStandardExtractor.sniffHtmlStandard(doc); - } - return htmlStandard; - } - - private String getDescription(Document doc) { - return summaryExtractor.extractSummary(doc); - } - - private int getLength(Document doc) { - return doc.text().length(); - } - - private record DetailsWithWords(ProcessedDocumentDetails 
details, - EdgePageWords words) {} - -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/PubDateEffortLevel.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/PubDateEffortLevel.java deleted file mode 100644 index b146c0d0..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/PubDateEffortLevel.java +++ /dev/null @@ -1,6 +0,0 @@ -package nu.marginalia.wmsa.edge.converting.processor.logic.pubdate; - -public enum PubDateEffortLevel { - LOW, - HIGH -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicGuessFromHtmlStandard.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicGuessFromHtmlStandard.java deleted file mode 100644 index e3d0e556..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicGuessFromHtmlStandard.java +++ /dev/null @@ -1,23 +0,0 @@ -package nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.heuristic; - -import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDate; -import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateEffortLevel; -import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateHeuristic; -import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateParser; -import nu.marginalia.wmsa.edge.model.EdgeUrl; -import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard; -import org.jsoup.nodes.Document; - -import java.util.Optional; - -public class PubDateHeuristicGuessFromHtmlStandard implements PubDateHeuristic { - - @Override - public Optional apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, EdgeHtmlStandard htmlStandard) { - if (htmlStandard == EdgeHtmlStandard.UNKNOWN) - return 
Optional.empty(); - - return Optional.of(new PubDate(null, PubDateParser.guessYear(htmlStandard))); - } - -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlerTestMain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlerTestMain.java deleted file mode 100644 index b26f501a..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlerTestMain.java +++ /dev/null @@ -1,116 +0,0 @@ -package nu.marginalia.wmsa.edge.crawling; - -import io.github.bucket4j.Bandwidth; -import io.github.bucket4j.Bucket; -import io.github.bucket4j.Refill; -import spark.Request; -import spark.Response; -import spark.Spark; - -import java.time.Duration; -import java.util.ArrayList; -import java.util.List; - -public class CrawlerTestMain { - - static Bucket rateLimiter60RPM; - static List successfullyFetched = new ArrayList<>(); - - public static void main(String... args) { - var refill = Refill.greedy(1, Duration.ofSeconds(1)); - - var bw = Bandwidth.classic(10, refill); - rateLimiter60RPM = Bucket.builder().addLimit(bw).build(); - - Spark.port(8080); - Spark.before(CrawlerTestMain::before); - Spark.after(CrawlerTestMain::after); - Spark.get("/rate-limit/", CrawlerTestMain::index); - Spark.get("/rate-limit/:n", CrawlerTestMain::n); - - Spark.before("/rate-limit/:n", CrawlerTestMain::rateLimitRequest); - Spark.before("/intermittent-error/:n", CrawlerTestMain::simulateRandomTimeouts); - - Spark.get("/intermittent-error/", CrawlerTestMain::index); - Spark.get("/intermittent-error/:n", CrawlerTestMain::n); - - } - - private static void rateLimitRequest(Request request, Response response) { - if (!rateLimiter60RPM.tryConsume(1)) { - Spark.halt(429); - } - } - - private static void simulateRandomTimeouts(Request request, Response response) { - if (Math.random() < 0.25) { - System.out.println("Simulating error"); - Spark.halt(503); - } - } - - public static void before(Request request, Response response) { - 
System.out.println(request.pathInfo()); - successfullyFetched.add(request.pathInfo()); - } - public static void after(Request request, Response response) { - if (response.status() < 300) { - successfullyFetched.add(request.pathInfo()); - } - } - - private static Object n(Request request, Response response) { - - int num = Integer.parseInt(request.params("n")); - return """ - - - Index - -

Index

- """ + - String.format("Next, Next 2", num+1, num+2) - - + - """ - -

- Goddess, sing me the anger, of Achilles, Peleus’ son, that fatal anger that brought countless - sorrows on the Greeks, and sent many valiant souls of warriors down to Hades, leaving their - bodies as spoil for dogs and carrion birds: for thus was the will of Zeus brought to fulfilment. - - Sing of it from the moment when Agamemnon, Atreus’ son, that king of men, parted in wrath from noble Achilles. - Which of the gods set these two to quarrel? Apollo, the son of Leto and Zeus, angered by the king, brought an - evil plague on the army, so that the men were dying, for the son of Atreus had dishonoured Chryses the priest. - He it was who came to the swift Achaean ships, to free his daughter, bringing a wealth of ransom, carrying a - golden staff adorned with the ribbons of far-striking Apollo, and called out to the Achaeans, above all to the - two leaders of armies, those sons of Atreus: ‘Atreides, and all you bronze-greaved Achaeans, may the gods who - live on Olympus grant you to sack Priam’s city, and sail back home in safety; but take this ransom, and free - my darling child; show reverence for Zeus’s son, far-striking Apollo.’ - """; - } - - private static Object index(Request request, Response response) { - return """ - - - Index - -

Index

- Next -

- Goddess, sing me the anger, of Achilles, Peleus’ son, that fatal anger that brought countless - sorrows on the Greeks, and sent many valiant souls of warriors down to Hades, leaving their - bodies as spoil for dogs and carrion birds: for thus was the will of Zeus brought to fulfilment. - - Sing of it from the moment when Agamemnon, Atreus’ son, that king of men, parted in wrath from noble Achilles. - Which of the gods set these two to quarrel? Apollo, the son of Leto and Zeus, angered by the king, brought an - evil plague on the army, so that the men were dying, for the son of Atreus had dishonoured Chryses the priest. - He it was who came to the swift Achaean ships, to free his daughter, bringing a wealth of ransom, carrying a - golden staff adorned with the ribbons of far-striking Apollo, and called out to the Achaeans, above all to the - two leaders of armies, those sons of Atreus: ‘Atreides, and all you bronze-greaved Achaeans, may the gods who - live on Olympus grant you to sack Priam’s city, and sail back home in safety; but take this ransom, and free - my darling child; show reverence for Zeus’s son, far-striking Apollo.’ - """; - } -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/dbcommon/EdgeDataStoreDao.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/dbcommon/EdgeDataStoreDao.java deleted file mode 100644 index 47053837..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/dbcommon/EdgeDataStoreDao.java +++ /dev/null @@ -1,31 +0,0 @@ -package nu.marginalia.wmsa.edge.dbcommon; - -import com.google.inject.ImplementedBy; -import nu.marginalia.wmsa.edge.model.EdgeDomain; -import nu.marginalia.wmsa.edge.model.EdgeUrl; -import nu.marginalia.wmsa.edge.model.id.EdgeId; -import nu.marginalia.wmsa.edge.model.id.EdgeIdCollection; -import nu.marginalia.wmsa.edge.model.search.EdgeUrlDetails; -import nu.marginalia.wmsa.edge.search.model.BrowseResult; - -import java.util.List; -import java.util.Optional; - 
-@ImplementedBy(EdgeDataStoreDaoImpl.class) -public interface EdgeDataStoreDao { - EdgeId getDomainId(EdgeDomain domain); - - List getDomainNeighborsAdjacentCosine(EdgeId domainId, EdgeDomainBlacklist blacklist, int count); - - List getDomainNeighborsAdjacent(EdgeId domainId, EdgeDomainBlacklist backlist, int count); - - List getRandomDomains(int count, EdgeDomainBlacklist backlist, int set); - - List getBrowseResultFromUrlIds(EdgeIdCollection urlId); - - List getUrlDetailsMulti(EdgeIdCollection ids); - - Optional getDomain(EdgeId id); - - -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexControl.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexControl.java deleted file mode 100644 index 87f65926..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexControl.java +++ /dev/null @@ -1,30 +0,0 @@ -package nu.marginalia.wmsa.edge.index; - - -import com.google.inject.Inject; -import nu.marginalia.wmsa.edge.index.svc.EdgeIndexSearchSetsService; - -import java.io.IOException; - - -public class EdgeIndexControl { - - private final IndexServicesFactory servicesFactory; - private final EdgeIndexSearchSetsService searchSetsService; - - @Inject - public EdgeIndexControl(IndexServicesFactory servicesFactory, EdgeIndexSearchSetsService searchSetsService) { - this.servicesFactory = servicesFactory; - this.searchSetsService = searchSetsService; - } - - public void regenerateIndex() throws IOException { - servicesFactory.convertIndex(searchSetsService.getDomainRankings()); - - System.gc(); - } - - public void switchIndexFiles() throws Exception { - servicesFactory.switchFilesJob().call(); - } -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexMain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexMain.java deleted file mode 100644 index 65dde030..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexMain.java +++ 
/dev/null @@ -1,34 +0,0 @@ -package nu.marginalia.wmsa.edge.index; - -import com.google.inject.Guice; -import com.google.inject.Inject; -import com.google.inject.Injector; -import nu.marginalia.wmsa.configuration.MainClass; -import nu.marginalia.wmsa.configuration.ServiceDescriptor; -import nu.marginalia.wmsa.configuration.module.ConfigurationModule; -import nu.marginalia.wmsa.configuration.module.DatabaseModule; -import nu.marginalia.wmsa.configuration.server.Initialization; - -public class EdgeIndexMain extends MainClass { - private final EdgeIndexService service; - - @Inject - public EdgeIndexMain(EdgeIndexService service) { - this.service = service; - } - - public static void main(String... args) { - init(ServiceDescriptor.EDGE_INDEX, args); - - Injector injector = Guice.createInjector( - new EdgeIndexTablesModule(), - new EdgeIndexModule(), - new DatabaseModule(), - new ConfigurationModule() - ); - - injector.getInstance(EdgeIndexMain.class); - injector.getInstance(Initialization.class).setReady(); - - } -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexModule.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexModule.java deleted file mode 100644 index 99a1e3f4..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexModule.java +++ /dev/null @@ -1,23 +0,0 @@ -package nu.marginalia.wmsa.edge.index; - -import com.google.inject.AbstractModule; -import com.google.inject.Provides; -import nu.marginalia.wmsa.configuration.WmsaHome; -import nu.marginalia.wmsa.edge.index.config.RankingSettings; - -import java.nio.file.Path; - -public class EdgeIndexModule extends AbstractModule { - - - - public void configure() { - } - - @Provides - public RankingSettings rankingSettings() { - Path dir = WmsaHome.getHomePath().resolve("conf/ranking-settings.yaml"); - return RankingSettings.from(dir); - } - -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexService.java 
b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexService.java deleted file mode 100644 index 6cb5ba36..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexService.java +++ /dev/null @@ -1,84 +0,0 @@ -package nu.marginalia.wmsa.edge.index; - -import com.google.gson.Gson; -import com.google.inject.Inject; -import com.google.inject.name.Named; -import io.reactivex.rxjava3.schedulers.Schedulers; -import nu.marginalia.wmsa.client.GsonFactory; -import nu.marginalia.wmsa.configuration.server.Initialization; -import nu.marginalia.wmsa.configuration.server.MetricsServer; -import nu.marginalia.wmsa.configuration.server.Service; -import nu.marginalia.wmsa.edge.index.postings.SearchIndexControl; -import nu.marginalia.wmsa.edge.index.svc.EdgeIndexDomainQueryService; -import nu.marginalia.wmsa.edge.index.svc.EdgeIndexLexiconService; -import nu.marginalia.wmsa.edge.index.svc.EdgeIndexOpsService; -import nu.marginalia.wmsa.edge.index.svc.EdgeIndexQueryService; -import org.jetbrains.annotations.NotNull; -import spark.Request; -import spark.Response; -import spark.Spark; - -import java.util.concurrent.TimeUnit; - -import static spark.Spark.get; - -public class EdgeIndexService extends Service { - - @NotNull - private final Initialization init; - private final SearchIndexControl indexes; - - - @Inject - public EdgeIndexService(@Named("service-host") String ip, - @Named("service-port") Integer port, - Initialization init, - MetricsServer metricsServer, - SearchIndexControl indexes, - - EdgeIndexOpsService opsService, - EdgeIndexLexiconService lexiconService, - EdgeIndexQueryService indexQueryService, - EdgeIndexDomainQueryService domainQueryService - ) - { - super(ip, port, init, metricsServer); - - final Gson gson = GsonFactory.get(); - - this.init = init; - this.indexes = indexes; - - Spark.post("/words/", lexiconService::putWords); - - Spark.post("/search/", indexQueryService::search, gson::toJson); - Spark.post("/search-domain/", 
domainQueryService::searchDomain, gson::toJson); - - Spark.get("/dictionary/*", lexiconService::getWordId, gson::toJson); - - Spark.post("/ops/repartition", opsService::repartitionEndpoint); - Spark.post("/ops/reindex", opsService::reindexEndpoint); - - get("/is-blocked", this::isBlocked, gson::toJson); - - Schedulers.newThread().scheduleDirect(this::initialize, 1, TimeUnit.MICROSECONDS); - } - - private Object isBlocked(Request request, Response response) { - return indexes.isBusy() || !initialized; - } - - volatile boolean initialized = false; - public void initialize() { - if (!initialized) { - init.waitReady(); - initialized = true; - indexes.initialize(init); - } - - } - - -} - - diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexTablesModule.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexTablesModule.java deleted file mode 100644 index 93014e4c..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexTablesModule.java +++ /dev/null @@ -1,29 +0,0 @@ -package nu.marginalia.wmsa.edge.index; - -import com.google.inject.AbstractModule; -import com.google.inject.name.Names; -import nu.marginalia.wmsa.configuration.WmsaHome; - -import java.nio.file.Path; - -public class EdgeIndexTablesModule extends AbstractModule { - - public void configure() { - bind(Path.class).annotatedWith(Names.named("partition-root-slow")).toInstance(WmsaHome.getDisk("index-write")); - bind(Path.class).annotatedWith(Names.named("partition-root-fast")).toInstance(WmsaHome.getDisk("index-read")); - - bind(Path.class).annotatedWith(Names.named("partition-root-slow-tmp")).toInstance(WmsaHome.getDisk("tmp-slow")); - bind(Path.class).annotatedWith(Names.named("tmp-file-dir")).toInstance(WmsaHome.getDisk("tmp-fast")); - - bind(String.class).annotatedWith(Names.named("edge-writer-page-index-file")).toInstance("page-index.dat"); - 
bind(String.class).annotatedWith(Names.named("edge-writer-dictionary-file")).toInstance("dictionary.dat"); - - bind(String.class).annotatedWith(Names.named("edge-index-write-words-file")).toInstance("words.dat.wip"); - bind(String.class).annotatedWith(Names.named("edge-index-write-urls-file")).toInstance("urls.dat.wip"); - - bind(String.class).annotatedWith(Names.named("edge-index-read-words-file")).toInstance("words.dat"); - bind(String.class).annotatedWith(Names.named("edge-index-read-urls-file")).toInstance("urls.dat"); - - } - -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/client/EdgeIndexClient.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/client/EdgeIndexClient.java deleted file mode 100644 index 32094fd9..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/client/EdgeIndexClient.java +++ /dev/null @@ -1,87 +0,0 @@ -package nu.marginalia.wmsa.edge.index.client; - -import com.google.inject.Singleton; -import io.prometheus.client.Summary; -import io.reactivex.rxjava3.core.Observable; -import io.reactivex.rxjava3.schedulers.Schedulers; -import nu.marginalia.wmsa.client.AbstractDynamicClient; -import nu.marginalia.wmsa.configuration.ServiceDescriptor; -import nu.marginalia.wmsa.configuration.server.Context; -import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DocumentKeywords; -import nu.marginalia.wmsa.edge.index.model.EdgePageDocumentsMetadata; -import nu.marginalia.wmsa.edge.model.EdgeDomain; -import nu.marginalia.wmsa.edge.model.EdgeUrl; -import nu.marginalia.wmsa.edge.model.id.EdgeId; -import nu.marginalia.wmsa.edge.model.search.EdgeSearchResultItem; -import nu.marginalia.wmsa.edge.model.search.EdgeSearchResultSet; -import nu.marginalia.wmsa.edge.model.search.EdgeSearchSpecification; -import nu.marginalia.wmsa.edge.model.search.domain.EdgeDomainSearchResults; -import nu.marginalia.wmsa.edge.model.search.domain.EdgeDomainSearchSpecification; -import 
nu.wmsa.wmsa.edge.index.proto.IndexPutKeywordsReq; - -import javax.annotation.CheckReturnValue; -import java.util.List; -import java.util.concurrent.TimeUnit; - -@Singleton -public class EdgeIndexClient extends AbstractDynamicClient implements EdgeIndexWriterClient { - - private static final Summary wmsa_search_index_api_time = Summary.build().name("wmsa_search_index_api_time").help("-").register(); - - public EdgeIndexClient() { - super(ServiceDescriptor.EDGE_INDEX); - setTimeout(30); - } - - @Override - public void putWords(Context ctx, EdgeId domain, EdgeId url, - EdgePageDocumentsMetadata metadata, - DocumentKeywords wordSet, int writer - ) - { - - var keywordBuilder = - IndexPutKeywordsReq.newBuilder() - .setDomain(domain.id()) - .setUrl(url.id()) - .setMetadata(metadata.encode()) - .setIndex(writer); - - var wordSetBuilder = IndexPutKeywordsReq.WordSet.newBuilder(); - wordSetBuilder.addAllWords(List.of(wordSet.keywords())); - for (var meta : wordSet.metadata()) { - wordSetBuilder.addMeta(meta); - } - keywordBuilder.addWordSet(wordSetBuilder.build()); - - var req = keywordBuilder.build(); - - this.post(ctx, "/words/", req).blockingSubscribe(); - } - - - @CheckReturnValue - public List query(Context ctx, EdgeSearchSpecification specs) { - return wmsa_search_index_api_time.time( - () -> this.postGet(ctx, "/search/", specs, EdgeSearchResultSet.class).blockingFirst().getResults() - ); - } - - @CheckReturnValue - public List queryDomains(Context ctx, List specs) { - return Observable.fromIterable(specs) - .concatMap(s -> postGet(ctx, "/search-domain/", s, EdgeDomainSearchResults.class) - .subscribeOn(Schedulers.io()) - .timeout(1, TimeUnit.SECONDS) - .onErrorComplete()) - .toList() - .blockingGet(); - } - - - @CheckReturnValue - public Observable isBlocked(Context ctx) { - return super.get(ctx, "/is-blocked", Boolean.class); - } - -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/client/EdgeIndexLocalService.java 
b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/client/EdgeIndexLocalService.java deleted file mode 100644 index 585c9a14..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/client/EdgeIndexLocalService.java +++ /dev/null @@ -1,88 +0,0 @@ -package nu.marginalia.wmsa.edge.index.client; - -import com.google.inject.Inject; -import com.google.inject.Singleton; -import com.google.inject.name.Named; -import nu.marginalia.util.dict.OffHeapDictionaryHashMap; -import nu.marginalia.util.dict.DictionaryMap; -import nu.marginalia.wmsa.configuration.server.Context; -import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DocumentKeywords; -import nu.marginalia.wmsa.edge.converting.interpreter.instruction.KeywordListChunker; -import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexicon; -import nu.marginalia.wmsa.edge.index.lexicon.journal.KeywordLexiconJournal; -import nu.marginalia.wmsa.edge.index.model.EdgePageDocumentsMetadata; -import nu.marginalia.wmsa.edge.index.postings.journal.model.SearchIndexJournalEntry; -import nu.marginalia.wmsa.edge.index.postings.journal.model.SearchIndexJournalEntryHeader; -import nu.marginalia.wmsa.edge.index.postings.journal.writer.SearchIndexJournalWriterImpl; -import nu.marginalia.wmsa.edge.model.EdgeDomain; -import nu.marginalia.wmsa.edge.model.EdgeUrl; -import nu.marginalia.wmsa.edge.model.id.EdgeId; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.IOException; -import java.nio.file.Path; -import java.util.Arrays; - -@Singleton -public class EdgeIndexLocalService implements EdgeIndexWriterClient { - - private final KeywordLexicon lexicon; - private final SearchIndexJournalWriterImpl indexWriter; - private static final Logger logger = LoggerFactory.getLogger(EdgeIndexLocalService.class); - - @Inject - public EdgeIndexLocalService(@Named("local-index-path") Path path) throws IOException { - - var lexiconJournal = new 
KeywordLexiconJournal(path.resolve("dictionary.dat").toFile()); - lexicon = new KeywordLexicon(lexiconJournal, DictionaryMap.create()); - indexWriter = new SearchIndexJournalWriterImpl(lexicon, path.resolve("index.dat").toFile()); - } - - public void putWords(Context ctx, EdgeId domain, EdgeId url, - EdgePageDocumentsMetadata metadata, - DocumentKeywords wordSet, int writer) { - if (wordSet.keywords().length == 0) - return; - - if (domain.id() <= 0 || url.id() <= 0) { - logger.warn("Bad ID: {}:{}", domain, url); - return; - } - - for (var chunk : KeywordListChunker.chopList(wordSet, SearchIndexJournalEntry.MAX_LENGTH)) { - - var entry = new SearchIndexJournalEntry(getOrInsertWordIds(chunk.keywords(), chunk.metadata())); - var header = new SearchIndexJournalEntryHeader(domain, url, metadata.encode()); - - indexWriter.put(header, entry); - } - - } - - private long[] getOrInsertWordIds(String[] words, long[] meta) { - long[] ids = new long[words.length*2]; - int putIdx = 0; - - for (int i = 0; i < words.length; i++) { - String word = words[i]; - - long id = lexicon.getOrInsert(word); - if (id != OffHeapDictionaryHashMap.NO_VALUE) { - ids[putIdx++] = id; - ids[putIdx++] = meta[i]; - } - } - - if (putIdx != words.length*2) { - ids = Arrays.copyOf(ids, putIdx); - } - return ids; - } - - @Override - public void close() throws Exception { - indexWriter.close(); - lexicon.close(); - } -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/client/EdgeIndexWriterClient.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/client/EdgeIndexWriterClient.java deleted file mode 100644 index ff405e7a..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/client/EdgeIndexWriterClient.java +++ /dev/null @@ -1,14 +0,0 @@ -package nu.marginalia.wmsa.edge.index.client; - -import nu.marginalia.wmsa.configuration.server.Context; -import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DocumentKeywords; -import 
nu.marginalia.wmsa.edge.index.model.EdgePageDocumentsMetadata; -import nu.marginalia.wmsa.edge.model.EdgeDomain; -import nu.marginalia.wmsa.edge.model.EdgeUrl; -import nu.marginalia.wmsa.edge.model.id.EdgeId; - -public interface EdgeIndexWriterClient extends AutoCloseable { - - void putWords(Context ctx, EdgeId domain, EdgeId url, EdgePageDocumentsMetadata metadata, - DocumentKeywords wordSets, int writer); -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/config/RankingSettingsEntry.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/config/RankingSettingsEntry.java deleted file mode 100644 index f6aa501f..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/config/RankingSettingsEntry.java +++ /dev/null @@ -1,8 +0,0 @@ -package nu.marginalia.wmsa.edge.index.config; - -import java.util.List; - -public class RankingSettingsEntry { - public List domains; - public int max; -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/IndexMetadataService.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/IndexMetadataService.java deleted file mode 100644 index 6a359d0b..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/IndexMetadataService.java +++ /dev/null @@ -1,22 +0,0 @@ -package nu.marginalia.wmsa.edge.index.postings; - -public class IndexMetadataService { - private final SearchIndexControl indexes; - - public IndexMetadataService(SearchIndexControl indexes) { - this.indexes = indexes; - } - - public long getDocumentMetadata(long urlId) { - return indexes.getIndex().getDocumentMetadata(urlId); - } - - public int getDomainId(long urlId) { - return indexes.getIndex().getDomainId(urlId); - } - - public long[] getTermMetadata(int termId, long[] docIdsAll) { - return indexes.getIndex().getTermMetadata(termId, docIdsAll); - } - -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/SearchIndexControl.java 
b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/SearchIndexControl.java deleted file mode 100644 index 42e7e32f..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/SearchIndexControl.java +++ /dev/null @@ -1,77 +0,0 @@ -package nu.marginalia.wmsa.edge.index.postings; - -import com.google.inject.Inject; -import com.google.inject.Singleton; -import nu.marginalia.wmsa.configuration.server.Initialization; -import nu.marginalia.wmsa.edge.index.IndexServicesFactory; -import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexiconReadOnlyView; -import nu.marginalia.wmsa.edge.index.postings.journal.writer.SearchIndexJournalWriterImpl; -import nu.marginalia.wmsa.edge.index.svc.EdgeIndexSearchSetsService; -import nu.marginalia.wmsa.edge.index.svc.EdgeOpsLockService; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import javax.annotation.Nullable; - -@Singleton -public class SearchIndexControl { - private final Logger logger = LoggerFactory.getLogger(getClass()); - - private final IndexServicesFactory servicesFactory; - private final SearchIndexJournalWriterImpl primaryIndexWriter; - private final SearchIndexJournalWriterImpl secondaryIndexWriter; - private volatile KeywordLexiconReadOnlyView keywordLexiconReadOnlyView; - - private final SearchIndex index; - private final EdgeOpsLockService opsLockService; - - @Inject - public SearchIndexControl(IndexServicesFactory servicesFactory, - EdgeOpsLockService opsLockService, - EdgeIndexSearchSetsService searchSetsService) { - this.servicesFactory = servicesFactory; - - this.primaryIndexWriter = servicesFactory.getIndexWriter(0); - this.secondaryIndexWriter = servicesFactory.getIndexWriter(1); - - index = servicesFactory.createIndexBucket(searchSetsService); - this.opsLockService = opsLockService; - } - - public boolean reindex() throws Exception { - return opsLockService.run(index::switchIndex).isPresent(); - } - - public boolean isBusy() { - return 
opsLockService.isLocked(); - } - - @Nullable - public KeywordLexiconReadOnlyView getLexiconReader() { - return keywordLexiconReadOnlyView; - } - - public void initialize(Initialization init) { - - logger.info("Waiting for init"); - init.waitReady(); - - if (!opsLockService.run(index::init)) throw new IllegalStateException("Failed to initialize " + getClass().getSimpleName()); - keywordLexiconReadOnlyView = servicesFactory.getDictionaryReader(); - } - - public SearchIndexJournalWriterImpl getIndexWriter(int idx) { - if (idx == 0) { - return primaryIndexWriter; - } - else { - return secondaryIndexWriter; - } - } - - public SearchIndex getIndex() { - return index; - } - - -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/journal/model/SearchIndexJournalEntryHeader.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/journal/model/SearchIndexJournalEntryHeader.java deleted file mode 100644 index b5c4d554..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/journal/model/SearchIndexJournalEntryHeader.java +++ /dev/null @@ -1,22 +0,0 @@ -package nu.marginalia.wmsa.edge.index.postings.journal.model; - -import nu.marginalia.wmsa.edge.model.EdgeDomain; -import nu.marginalia.wmsa.edge.model.EdgeUrl; -import nu.marginalia.wmsa.edge.model.id.EdgeId; - -public record SearchIndexJournalEntryHeader(int entrySize, long documentId, long documentMeta) { - - public static final int HEADER_SIZE_LONGS = 3; - - public SearchIndexJournalEntryHeader( EdgeId domainId, EdgeId urlId, long documentMeta) { - this(-1, combineIds(domainId, urlId), documentMeta); - } - - private static long combineIds(EdgeId domainId, EdgeId urlId) { - long did = domainId.id(); - long uid = urlId.id(); - - return (did << 32L) | uid; - } - -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/journal/model/SearchIndexJournalFileHeader.java 
b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/journal/model/SearchIndexJournalFileHeader.java deleted file mode 100644 index 4b65505c..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/journal/model/SearchIndexJournalFileHeader.java +++ /dev/null @@ -1,4 +0,0 @@ -package nu.marginalia.wmsa.edge.index.postings.journal.model; - -public record SearchIndexJournalFileHeader(long fileSize, long wordCount) { -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/journal/model/SearchIndexJournalStatistics.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/journal/model/SearchIndexJournalStatistics.java deleted file mode 100644 index 8b827174..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/journal/model/SearchIndexJournalStatistics.java +++ /dev/null @@ -1,3 +0,0 @@ -package nu.marginalia.wmsa.edge.index.postings.journal.model; - -public record SearchIndexJournalStatistics(int highestWord, int documentCardinality) { } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/journal/reader/SearchIndexJournalCleaner.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/journal/reader/SearchIndexJournalCleaner.java deleted file mode 100644 index 8e685a2a..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/journal/reader/SearchIndexJournalCleaner.java +++ /dev/null @@ -1,71 +0,0 @@ -package nu.marginalia.wmsa.edge.index.postings.journal.reader; - -import nu.marginalia.util.array.LongArray; - -import java.io.IOException; -import java.nio.ByteBuffer; -import java.nio.LongBuffer; -import java.nio.file.Path; -import java.util.function.Predicate; - -public class SearchIndexJournalCleaner { - private final SearchIndexJournalReader reader; - - public SearchIndexJournalCleaner(SearchIndexJournalReader reader) { - this.reader = reader; - } - - private long 
dryRunForNewSize(Predicate entryPredicate) { - long pos = SearchIndexJournalReader.FILE_HEADER_SIZE_LONGS; - - var pt = new ProgressTracker(); - - for (var entry : reader) { - if (entryPredicate.test(entry)) { - pos += entry.totalEntrySizeLongs(); - pt.update(pos); - } - } - - return pos; - } - - public void clean(Path outFile, Predicate entryPredicate) throws IOException { - - System.out.println("Dry run"); - long size = dryRunForNewSize(entryPredicate); - - System.out.println("Copying"); - LongArray outputArray = LongArray.mmapForWriting(outFile, size); - - long pos = SearchIndexJournalReader.FILE_HEADER_SIZE_LONGS; - var pt = new ProgressTracker(); - - LongBuffer adequateBuffer = ByteBuffer.allocateDirect(100*1024*1024).asLongBuffer(); - - for (var entry : reader) { - if (entryPredicate.test(entry)) { - pos += entry.copyTo(pos, adequateBuffer, outputArray); - pt.update(pos); - } - } - - outputArray.set(0, pos*8); - outputArray.set(1, reader.fileHeader().wordCount()); - - outputArray.force(); - } -} - -class ProgressTracker { - long stepSize = 100*1024*1024; - long pos = 0; - - public void update(long pos) { - if (this.pos / stepSize != pos / stepSize) { - System.out.printf("%d Mb\n", (800*pos)/stepSize); - } - this.pos = pos; - } - -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/journal/reader/SearchIndexJournalReadEntry.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/journal/reader/SearchIndexJournalReadEntry.java deleted file mode 100644 index 97cd9e98..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/journal/reader/SearchIndexJournalReadEntry.java +++ /dev/null @@ -1,98 +0,0 @@ -package nu.marginalia.wmsa.edge.index.postings.journal.reader; - -import nu.marginalia.util.array.LongArray; -import nu.marginalia.wmsa.edge.index.postings.journal.model.SearchIndexJournalEntry; -import nu.marginalia.wmsa.edge.index.postings.journal.model.SearchIndexJournalEntryHeader; - 
-import java.nio.ByteBuffer; -import java.nio.LongBuffer; - -public class SearchIndexJournalReadEntry { - private final long offset; - public final SearchIndexJournalEntryHeader header; - private final LongArray map; - private final long committedSize; - - SearchIndexJournalReadEntry(long offset, LongArray map, long committedSize) { - this.map = map; - this.committedSize = committedSize; - - final long sizeBlock = this.map.get(offset); - final long docId = this.map.get(offset + 1); - final long meta = this.map.get(offset + 2); - - this.offset = offset; - this.header = new SearchIndexJournalEntryHeader( - (int) (sizeBlock >>> 32L), - docId, - meta); - } - - public boolean hasNext() { - return nextId() < committedSize; - } - - public long docId() { - return header.documentId(); - } - - public long docMeta() { - return header.documentMeta(); - } - - public int domainId() { - return (int) (docId() >>> 32L); - } - - public int urlId() { - return (int) (docId() & 0xFFFF_FFFFL); - } - - public int wordCount() { - return header.entrySize() / SearchIndexJournalEntry.ENTRY_SIZE; - } - - public SearchIndexJournalEntry readEntry() { - long[] dest = new long[header.entrySize()]; - - long offsetStart = offset + SearchIndexJournalEntryHeader.HEADER_SIZE_LONGS; - long offsetEnd = offsetStart + header.entrySize(); - - map.get(offsetStart, offsetEnd, dest); - - return new SearchIndexJournalEntry(header.entrySize(), dest); - } - - public SearchIndexJournalEntry readEntryUsingBuffer(long[] dest) { - if (dest.length >= header.entrySize()) { - long offsetStart = offset + SearchIndexJournalEntryHeader.HEADER_SIZE_LONGS; - long offsetEnd = offsetStart + header.entrySize(); - - map.get(offsetStart, offsetEnd, dest); - return new SearchIndexJournalEntry(header.entrySize(), dest); - } else { - return readEntry(); - } - } - - public long nextId() { - return offset + totalEntrySizeLongs(); - } - - public SearchIndexJournalReadEntry next() { - return new SearchIndexJournalReadEntry(nextId(), 
map, committedSize); - } - - public long copyTo(long pos, LongBuffer adequateBuffer, LongArray destArray) { - long size = totalEntrySizeLongs(); - - map.get(offset, offset + size, adequateBuffer, 0); - destArray.set(pos, pos + size, adequateBuffer, 0); - - return size; - } - - public long totalEntrySizeLongs() { - return header.entrySize() + SearchIndexJournalEntryHeader.HEADER_SIZE_LONGS; - } -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/journal/reader/SearchIndexJournalReader.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/journal/reader/SearchIndexJournalReader.java deleted file mode 100644 index a8751f85..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/journal/reader/SearchIndexJournalReader.java +++ /dev/null @@ -1,52 +0,0 @@ -package nu.marginalia.wmsa.edge.index.postings.journal.reader; - -import nu.marginalia.wmsa.edge.index.postings.journal.model.SearchIndexJournalEntry; -import nu.marginalia.wmsa.edge.index.postings.journal.model.SearchIndexJournalFileHeader; -import nu.marginalia.wmsa.edge.index.postings.journal.model.SearchIndexJournalStatistics; -import org.jetbrains.annotations.NotNull; - -import java.util.Iterator; -import java.util.function.IntConsumer; - -public interface SearchIndexJournalReader extends Iterable { - long FILE_HEADER_SIZE_LONGS = 2; - long FILE_HEADER_SIZE_BYTES = 8 * FILE_HEADER_SIZE_LONGS; - - default long[] createAdequateTempBuffer() { - return new long[SearchIndexJournalEntry.MAX_LENGTH * SearchIndexJournalEntry.ENTRY_SIZE]; - } - - SearchIndexJournalFileHeader fileHeader(); - - SearchIndexJournalStatistics getStatistics(); - - void forEachWordId(IntConsumer consumer); - - void forEachUrlIdWordId(BiIntConsumer consumer); - - void forEachDocIdWordId(LongIntConsumer consumer); - - void forEachDocIdRecord(LongObjectConsumer consumer); - - void forEachUrlId(IntConsumer consumer); - - @NotNull - @Override - Iterator iterator(); - - 
interface BiIntConsumer { - void accept(int left, int right); - } - - interface LongIntConsumer { - void accept(long left, int right); - } - - interface LongObjectConsumer { - void accept(long left, T right); - } - - interface IntObjectConsumer { - void accept(int left, T right); - } -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/journal/reader/SearchIndexJournalReaderSingleFile.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/journal/reader/SearchIndexJournalReaderSingleFile.java deleted file mode 100644 index 228d9a07..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/journal/reader/SearchIndexJournalReaderSingleFile.java +++ /dev/null @@ -1,180 +0,0 @@ -package nu.marginalia.wmsa.edge.index.postings.journal.reader; - -import com.upserve.uppend.blobs.NativeIO; -import nu.marginalia.util.array.LongArray; -import nu.marginalia.wmsa.edge.index.postings.journal.model.SearchIndexJournalEntry; -import nu.marginalia.wmsa.edge.index.postings.journal.model.SearchIndexJournalFileHeader; -import nu.marginalia.wmsa.edge.index.postings.journal.model.SearchIndexJournalStatistics; -import org.jetbrains.annotations.NotNull; -import org.roaringbitmap.longlong.Roaring64Bitmap; - -import java.io.IOException; -import java.util.Iterator; -import java.util.function.IntConsumer; -import java.util.function.Predicate; - -public class SearchIndexJournalReaderSingleFile implements SearchIndexJournalReader { - - public final SearchIndexJournalFileHeader fileHeader; - - private final LongArray map; - private final long committedSize; - - final Predicate entryPredicate; - final Predicate recordPredicate; - - public SearchIndexJournalReaderSingleFile(LongArray map) throws IOException { - fileHeader = new SearchIndexJournalFileHeader(map.get(0), map.get(1)); - committedSize = map.get(0) / 8 - FILE_HEADER_SIZE_LONGS; - - map.advice(NativeIO.Advice.Sequential); - - this.map = 
map.shifted(FILE_HEADER_SIZE_LONGS); - this.recordPredicate = null; - this.entryPredicate = null; - } - - public SearchIndexJournalReaderSingleFile(LongArray map, Predicate entryPredicate, Predicate recordPredicate) throws IOException { - fileHeader = new SearchIndexJournalFileHeader(map.get(0), map.get(1)); - committedSize = map.get(0) / 8 - FILE_HEADER_SIZE_LONGS; - - map.advice(NativeIO.Advice.Sequential); - - this.map = map.shifted(FILE_HEADER_SIZE_LONGS); - - this.recordPredicate = recordPredicate; - this.entryPredicate = entryPredicate; - } - - public SearchIndexJournalFileHeader fileHeader() { - return fileHeader; - } - - public boolean filter(SearchIndexJournalReadEntry entry) { - return entryPredicate == null || entryPredicate.test(entry); - } - - public boolean filter(SearchIndexJournalReadEntry entry, SearchIndexJournalEntry.Record record) { - return (entryPredicate == null || entryPredicate.test(entry)) - && (recordPredicate == null || recordPredicate.test(record)); - } - - @Override - public SearchIndexJournalStatistics getStatistics() { - int highestWord = 0; - final long[] tmpWordsBuffer = createAdequateTempBuffer(); - - // Docs cardinality is a candidate for a HyperLogLog - Roaring64Bitmap docsBitmap = new Roaring64Bitmap(); - - for (var entry : this) { - var entryData = entry.readEntryUsingBuffer(tmpWordsBuffer); - - if (filter(entry)) { - docsBitmap.addLong(entry.docId() & 0x0000_0000_FFFF_FFFFL); - - for (var item : entryData) { - if (filter(entry, item)) { - highestWord = Integer.max(item.wordId(), highestWord); - } - } - } - } - - return new SearchIndexJournalStatistics(highestWord, docsBitmap.getIntCardinality()); - } - - @Override - public void forEachWordId(IntConsumer consumer) { - final long[] tmpWordsBuffer = createAdequateTempBuffer(); - for (var entry : this) { - var data = entry.readEntryUsingBuffer(tmpWordsBuffer); - for (var post : data) { - if (filter(entry, post)) { - consumer.accept(post.wordId()); - } - } - } - } - - @Override - 
public void forEachUrlIdWordId(BiIntConsumer consumer) { - final long[] tmpWordsBuffer = createAdequateTempBuffer(); - for (var entry : this) { - var data = entry.readEntryUsingBuffer(tmpWordsBuffer); - - for (var post : data) { - if (filter(entry, post)) { - consumer.accept(entry.urlId(), post.wordId()); - } - } - } - } - - @Override - public void forEachDocIdWordId(LongIntConsumer consumer) { - final long[] tmpWordsBuffer = createAdequateTempBuffer(); - for (var entry : this) { - var data = entry.readEntryUsingBuffer(tmpWordsBuffer); - - for (var post : data) { - if (filter(entry, post)) { - consumer.accept(entry.docId(), post.wordId()); - } - } - } - } - - @Override - public void forEachDocIdRecord(LongObjectConsumer consumer) { - final long[] tmpWordsBuffer = createAdequateTempBuffer(); - for (var entry : this) { - var data = entry.readEntryUsingBuffer(tmpWordsBuffer); - - for (var post : data) { - if (filter(entry, post)) { - consumer.accept(entry.docId(), post); - } - } - } - } - @Override - public void forEachUrlId(IntConsumer consumer) { - for (var entry : this) { - if (filter(entry)) { - consumer.accept(entry.urlId()); - } - } - } - - @NotNull - @Override - public Iterator iterator() { - return new JournalEntryIterator(); - } - - private class JournalEntryIterator implements Iterator { - private SearchIndexJournalReadEntry entry; - - @Override - public boolean hasNext() { - if (entry == null) { - return committedSize > 0; - } - - return entry.hasNext(); - } - - @Override - public SearchIndexJournalReadEntry next() { - if (entry == null) { - entry = new SearchIndexJournalReadEntry(0, map, committedSize); - } - else { - entry = entry.next(); - } - return entry; - } - } - -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/journal/writer/SearchIndexJournalWriter.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/journal/writer/SearchIndexJournalWriter.java deleted file mode 100644 index 7d765006..00000000 
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/journal/writer/SearchIndexJournalWriter.java +++ /dev/null @@ -1,13 +0,0 @@ -package nu.marginalia.wmsa.edge.index.postings.journal.writer; - -import nu.marginalia.wmsa.edge.index.postings.journal.model.SearchIndexJournalEntry; -import nu.marginalia.wmsa.edge.index.postings.journal.model.SearchIndexJournalEntryHeader; - -public interface SearchIndexJournalWriter { - void put(SearchIndexJournalEntryHeader header, SearchIndexJournalEntry entry); - - void forceWrite(); - - void flushWords(); - -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/journal/writer/SearchIndexJournalWriterImpl.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/journal/writer/SearchIndexJournalWriterImpl.java deleted file mode 100644 index b57a1ea1..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/journal/writer/SearchIndexJournalWriterImpl.java +++ /dev/null @@ -1,134 +0,0 @@ -package nu.marginalia.wmsa.edge.index.postings.journal.writer; - -import io.reactivex.rxjava3.disposables.Disposable; -import io.reactivex.rxjava3.schedulers.Schedulers; -import lombok.SneakyThrows; -import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexicon; -import nu.marginalia.wmsa.edge.index.postings.journal.model.SearchIndexJournalEntry; -import nu.marginalia.wmsa.edge.index.postings.journal.model.SearchIndexJournalEntryHeader; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.EOFException; -import java.io.File; -import java.io.IOException; -import java.io.RandomAccessFile; -import java.nio.ByteBuffer; -import java.nio.channels.FileChannel; -import java.util.concurrent.LinkedBlockingQueue; -import java.util.concurrent.TimeUnit; - -public class SearchIndexJournalWriterImpl implements SearchIndexJournalWriter { - private final KeywordLexicon lexicon; - - private final Logger logger = LoggerFactory.getLogger(getClass()); - private 
final Disposable writerTask; - private RandomAccessFile raf; - private FileChannel channel; - - public static final int MAX_BLOCK_SIZE = SearchIndexJournalEntry.MAX_LENGTH*128*8*4 + 8 * SearchIndexJournalEntryHeader.HEADER_SIZE_LONGS; - private final ByteBuffer byteBuffer; - private long pos; - - @SneakyThrows - public SearchIndexJournalWriterImpl(KeywordLexicon lexicon, File indexFile) { - this.lexicon = lexicon; - initializeIndexFile(indexFile); - - byteBuffer = ByteBuffer.allocate(MAX_BLOCK_SIZE); - - new Thread(this::journalWriterThread, "Journal Writer").start(); - - writerTask = Schedulers.io().schedulePeriodicallyDirect(this::forceWrite, 1, 1, TimeUnit.SECONDS); - Runtime.getRuntime().addShutdownHook(new Thread(this::forceWrite)); - } - - private void initializeIndexFile(File indexFile) throws IOException { - raf = new RandomAccessFile(indexFile, "rw"); - channel = raf.getChannel(); - - try { - pos = raf.readLong(); - raf.seek(pos); - logger.info("Resuming index file of size {}", pos); - } - catch (EOFException ex) { - logger.info("Clean index file"); - writePositionMarker(); - writePositionMarker(); - } - } - - private record WriteJob(SearchIndexJournalEntryHeader header, SearchIndexJournalEntry entryData) {} - private final LinkedBlockingQueue writeQueue = new LinkedBlockingQueue<>(512); - - @Override - @SneakyThrows - public void put(SearchIndexJournalEntryHeader header, SearchIndexJournalEntry entryData) { - writeQueue.put(new WriteJob(header, entryData)); - } - - @SneakyThrows - public void journalWriterThread() { - - while (true) { - var job = writeQueue.take(); - - writeEntry(job.header, job.entryData); - } - } - private synchronized void writeEntry(SearchIndexJournalEntryHeader header, SearchIndexJournalEntry entryData) { - - try { - byteBuffer.clear(); - - byteBuffer.putInt(entryData.size()); - byteBuffer.putInt(0); // unused - byteBuffer.putLong(header.documentId()); - byteBuffer.putLong(header.documentMeta()); - - entryData.write(byteBuffer); - - 
byteBuffer.limit(byteBuffer.position()); - byteBuffer.rewind(); - - while (byteBuffer.position() < byteBuffer.limit()) - channel.write(byteBuffer); - - writePositionMarker(); - } catch (IOException e) { - throw new RuntimeException(e); - } - } - - @Override - public synchronized void forceWrite() { - try { - channel.force(false); - } - catch (IOException ex) { - logger.error("IO Exception", ex); - } - } - - - @Override - @SneakyThrows - public void flushWords() { - lexicon.commitToDisk(); - } - - private void writePositionMarker() throws IOException { - pos = channel.size(); - raf.seek(0); - raf.writeLong(pos); - raf.writeLong(lexicon.size()); - raf.seek(pos); - } - - public synchronized void close() throws IOException { - writerTask.dispose(); - channel.close(); - raf.close(); - } -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/reverse/query/ReverseIndexEntrySourceBehavior.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/reverse/query/ReverseIndexEntrySourceBehavior.java deleted file mode 100644 index fc779403..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/reverse/query/ReverseIndexEntrySourceBehavior.java +++ /dev/null @@ -1,6 +0,0 @@ -package nu.marginalia.wmsa.edge.index.postings.reverse.query; - -public enum ReverseIndexEntrySourceBehavior { - DO_PREFER, - DO_NOT_PREFER -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/query/IndexQueryIf.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/query/IndexQueryIf.java deleted file mode 100644 index 0fd325ea..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/query/IndexQueryIf.java +++ /dev/null @@ -1,12 +0,0 @@ -package nu.marginalia.wmsa.edge.index.query; - -import java.util.stream.LongStream; - -public interface IndexQueryIf { - IndexQueryIf also(int wordId); - IndexQueryIf alsoCached(int wordId); - - IndexQueryIf not(int wordId); - - LongStream stream(); -} 
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/old/OldReversePageRankV2.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/old/OldReversePageRankV2.java deleted file mode 100644 index 59fcda0d..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/old/OldReversePageRankV2.java +++ /dev/null @@ -1,261 +0,0 @@ -package nu.marginalia.wmsa.edge.index.ranking.old; - - -import com.zaxxer.hikari.HikariDataSource; -import gnu.trove.list.TIntList; -import gnu.trove.list.array.TIntArrayList; -import gnu.trove.map.hash.TIntDoubleHashMap; -import gnu.trove.map.hash.TIntObjectHashMap; -import lombok.AllArgsConstructor; -import lombok.Data; -import org.jetbrains.annotations.NotNull; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; -import java.sql.SQLException; -import java.util.*; - -public class OldReversePageRankV2 { - - private final TIntObjectHashMap domains = new TIntObjectHashMap<>(); - private final TIntObjectHashMap linkData = new TIntObjectHashMap<>(); - private final TIntObjectHashMap reverseLinkData = new TIntObjectHashMap<>(); - private final Logger logger = LoggerFactory.getLogger(getClass()); - - public final Set originDomains = new HashSet<>(); - public final Set originDomainIds = new HashSet<>(); - - public static void main(String... args) throws IOException { - new OldReversePageRankV2( -// "wiki.xxiivv.com", -// "stpeter.im", -// "datagubbe.se", "midnight.pub", -// "www.gameboomers.com", -// "www.wild-seven.org", "iocane-powder.net", "www.doujinshi.org", "ohmydarling.org", -// "lobste.rs", -// "dataswamp.org", "www.ohtori.nu", -// "lukesmith.xyz", "internetgirlfriend.club", -// "tilde.town", "tilde.team", -// "felix.plesoianu.ro", -// "www.neustadt.fr", - "memex.marginalia.nu" - ); - } - - public OldReversePageRankV2(String... 
seedDomains) throws IOException { - loadDataFromFile(); - - long start = System.currentTimeMillis(); - for (int i = 0; i < 100; i++) { - if (domains.contains(i)) { - int[] ids = pageRank(10).toArray(); - System.out.printf("%d %d\n", i, ids.length); - } -// Arrays.stream(ids).mapToObj(domains::get).map(data -> -// String.format("%3d %2.2f %s", Optional.ofNullable(reverseLinkData.get(data.id)).map(TIntArrayList::size).orElse(0), data.quality, data.name) -// ).forEach(System.out::println); - } - long end = System.currentTimeMillis(); - System.out.printf("%2.2f", (end - start)/1000.0); - } - - public OldReversePageRankV2(HikariDataSource dataSource) { - originDomains.add("memex.marginalia.nu"); - - try (var conn = dataSource.getConnection()) { - try (var stmt = conn.prepareStatement("SELECT ID,INDEXED,STATE FROM EC_DOMAIN WHERE INDEXED>1 AND IS_ALIVE")) { - stmt.setFetchSize(10000); - var rsp = stmt.executeQuery(); - while (rsp.next()) { - domains.put(rsp.getInt(1), new DomainData("", 0.0, rsp.getInt(1), rsp.getInt(2), rsp.getInt(3))); - } - } - try (var stmt = conn.prepareStatement("SELECT SOURCE_DOMAIN_ID, DEST_DOMAIN_ID FROM EC_DOMAIN_LINK")) { - stmt.setFetchSize(10000); - - var rsp = stmt.executeQuery(); - - while (rsp.next()) { - int src = rsp.getInt(1); - int dst = rsp.getInt(2); - if (domains.contains(src) && domains.contains(dst) && domains.get(src).quality >= -5) { - if (!linkData.contains(src)) { - linkData.put(src, new TIntArrayList()); - } - linkData.get(src).add(dst); - } - } - } - - try (var stmt = conn.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME=?")) { - stmt.setFetchSize(10000); - - for (var seed : this.originDomains) { - stmt.setString(1, seed); - var rsp = stmt.executeQuery(); - if (rsp.next()) { - originDomainIds.add(rsp.getInt(1)); - } - } - } - - } catch (SQLException throwables) { - logger.error("SQL error", throwables); - } - - } - - public int size() { - return domains.size(); - } - - public TIntList pageRank(int resultCount) { 
- RankVector rank = new RankVector(1.d / domains.size()); - - for (int i = 0; i < 100; i++) { - RankVector newRank = createNewRankVector(rank); - - double oldNorm = rank.norm(); - double newNorm = newRank.norm(); - double dNorm = oldNorm - newNorm ; - originDomainIds.forEach(id -> newRank.increment(id, dNorm/oldNorm)); -// newRank.increment(14880, dNorm/rank.norm()); - rank = newRank; - } - - for (var id : originDomainIds) { - rank.increment(id, -1); - } - - return rank.getRanking(resultCount); - } - - @NotNull - private RankVector createNewRankVector(RankVector rank) { - - final TIntArrayList empty = new TIntArrayList(); - - double rankNorm = rank.norm(); - RankVector newRank = new RankVector(0); - - for (DomainData domain : domains.values(new DomainData[domains.size()])) { - - var links = Optional.ofNullable(linkData.get(domain.id)).orElse(empty); - if (links.size() > 0) { - double newRankValue = 0; - for (int linkedDomain : links.toArray()) { - newRankValue += rank.get(linkedDomain) / links.size(); - } - - newRank.set(domain.id, 0.85*newRankValue/rankNorm); - } - } - return newRank; - } - - private void loadDataFromFile() throws IOException { - - try (var str = Files.lines(Path.of("/home/vlofgren/Work/data-domains.txt"))) { - str.map(DomainData::new) - .filter(domain -> domain.indexed>1) - .filter(domain -> domain.state>=1) - .peek(domain -> { - if (originDomains.contains(domain.name)) { - originDomainIds.add(domain.id); - } - }) - .forEach(data -> domains.put(data.id, data)); - } - - try (var str = Files.lines(Path.of("/home/vlofgren/Work/data-links.txt"))) { - str.map(s->s.split("\\s+")).forEach(bits -> { - - int src = Integer.parseInt(bits[0]); - int dst = Integer.parseInt(bits[1]); - - if (domains.contains(src) && domains.contains(dst) && domains.get(src).quality >= -5) { - if (!linkData.contains(src)) { - linkData.put(src, new TIntArrayList()); - } - linkData.get(src).add(dst); - } - - - if (!reverseLinkData.contains(dst)) { - reverseLinkData.put(dst, new 
TIntArrayList()); - } - reverseLinkData.get(dst).add(src); - }); - } - } - - private class RankVector { - private final TIntDoubleHashMap rank; - private final double defaultValue; - public RankVector(double defaultValue) { - rank = new TIntDoubleHashMap(domains.size(), 0.75f, -1, defaultValue); - this.defaultValue = defaultValue; - } - - public void set(int id, double value) { - rank.put(id, value); - } - - - public void increment(int id, double value) { - rank.adjustOrPutValue(id, value, value); - } - - public double get(int id) { - return rank.get(id); - } - - public double norm() { - if (rank.isEmpty()) { - return defaultValue * domains.size(); - } - return Arrays.stream(rank.values()).map(Math::abs).sum(); - } - - public double norm(RankVector other) { - return Arrays.stream(rank.keys()).mapToDouble(k -> Math.abs(rank.get(k) - other.get(k))).sum(); - } - - public TIntList getRanking(int numResults) { - TIntArrayList list = new TIntArrayList(numResults); - - Comparator comparator = Comparator.comparing(e -> rank.get(e.id)); - - domains.valueCollection().stream() - .sorted(comparator.reversed()) - .map(DomainData::getId) - .limit(numResults) - .forEach(list::add); - - return list; - } - - } - @Data @AllArgsConstructor - static class DomainData { - - public DomainData(String str) { - String[] parts = str.split("\\s+"); - - id = Integer.parseInt(parts[0]); - quality = Double.parseDouble(parts[1]); - name = parts[2]; - indexed = Integer.parseInt(parts[3]); - state = Integer.parseInt(parts[4]); - } - public final String name; - public final double quality; - public final int id; - public final int indexed; - public final int state; - } - -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/old/StandardPageRank.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/old/StandardPageRank.java deleted file mode 100644 index cd58f7be..00000000 --- 
a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/old/StandardPageRank.java +++ /dev/null @@ -1,266 +0,0 @@ -package nu.marginalia.wmsa.edge.index.ranking.old; - - -import com.zaxxer.hikari.HikariDataSource; -import gnu.trove.list.TIntList; -import gnu.trove.list.array.TIntArrayList; -import gnu.trove.map.hash.TIntDoubleHashMap; -import gnu.trove.map.hash.TIntObjectHashMap; -import gnu.trove.set.hash.TIntHashSet; -import lombok.AllArgsConstructor; -import lombok.Data; -import org.jetbrains.annotations.NotNull; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; -import java.sql.SQLException; -import java.util.*; -import java.util.function.IntToDoubleFunction; - -public class StandardPageRank { - - private final TIntObjectHashMap domains = new TIntObjectHashMap<>(); - private final TIntObjectHashMap linkData = new TIntObjectHashMap<>(); - private final TIntObjectHashMap reverseLinkData = new TIntObjectHashMap<>(); - private final Logger logger = LoggerFactory.getLogger(getClass()); - - public final Set originDomains = new HashSet<>(); - public final Set originDomainIds = new HashSet<>(); - - public StandardPageRank(IntToDoubleFunction weight, String... seedDomains) throws IOException { - originDomains.addAll(Arrays.asList(seedDomains)); - loadDataFromFile(); - - int[] ids = pageRank(weight, 1000).toArray(); - Arrays.stream(ids).mapToObj(domains::get).map(data -> - String.format("%3d %2.2f %s", Optional.ofNullable(reverseLinkData.get(data.id)).map(TIntArrayList::size).orElse(0), data.quality, data.name) - ).forEach(System.out::println); - } - - public String domainNameFromId(int id) { - return domains.get(id).name; - } - - public StandardPageRank(HikariDataSource dataSource, String... 
origins) { - originDomains.addAll(Arrays.asList(origins)); - - try (var conn = dataSource.getConnection()) { - try (var stmt = conn.prepareStatement("SELECT ID,INDEXED,STATE,DOMAIN_NAME FROM EC_DOMAIN WHERE INDEXED>1 AND IS_ALIVE AND QUALITY>=-10")) { - stmt.setFetchSize(10000); - var rsp = stmt.executeQuery(); - while (rsp.next()) { - domains.put(rsp.getInt(1), new DomainData(rsp.getInt(1), rsp.getString(4), rsp.getInt(2), rsp.getInt(3), 0)); - } - } - try (var stmt = conn.prepareStatement("SELECT SOURCE_DOMAIN_ID, DEST_DOMAIN_ID FROM EC_DOMAIN_LINK")) { - stmt.setFetchSize(10000); - - var rsp = stmt.executeQuery(); - - while (rsp.next()) { - int src = rsp.getInt(1); - int dst = rsp.getInt(2); - - if (domains.contains(src) && domains.contains(dst) && domains.get(src).quality >= -5) { - if (!linkData.contains(src)) { - linkData.put(src, new TIntArrayList()); - } - linkData.get(src).add(dst); - - if (!reverseLinkData.contains(dst)) { - reverseLinkData.put(dst, new TIntArrayList()); - } - reverseLinkData.get(dst).add(src); - } - } - } - - try (var stmt = conn.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME=?")) { - for (var seed : this.originDomains) { - stmt.setString(1, seed); - var rsp = stmt.executeQuery(); - if (rsp.next()) { - originDomainIds.add(rsp.getInt(1)); - } - } - } - - } catch (SQLException throwables) { - logger.error("SQL error", throwables); - } - - } - - public int size() { - return domains.size(); - } - - public TIntList pageRank(IntToDoubleFunction weight, int resultCount) { - RankVector rank = new RankVector(1.d / domains.size()); - - int iter_max = 100; - for (int i = 0; i < iter_max; i++) { - RankVector newRank = createNewRankVector(rank); - - double oldNorm = rank.norm(); - double newNorm = newRank.norm(); - double dNorm = oldNorm - newNorm; - if (i < iter_max-1) { - originDomainIds.forEach(id -> newRank.increment(id, dNorm/originDomainIds.size())); - newRank.incrementAll(0.14*dNorm/rank.size()); - } - logger.debug("{} {} {}", 
dNorm, newNorm, rank.norm(newRank)); - rank = newRank; - } - - - return rank.getRanking(weight, resultCount); - } - - @NotNull - private RankVector createNewRankVector(RankVector rank) { - - final TIntArrayList empty = new TIntArrayList(); - - RankVector newRank = new RankVector(0); - - for (DomainData domain : domains.valueCollection()) { - - var links = Optional.ofNullable(reverseLinkData.get(domain.id)).orElse(empty); - double newRankValue = 0; - if (links.size() > 0) { - for (int linkedDomain : links.toArray()) { - newRankValue += rank.get(linkedDomain) / linkData.get(linkedDomain).size(); - } - } - - newRank.set(domain.id, 0.85 * newRankValue); - } - return newRank; - } - - private void loadDataFromFile() throws IOException { - - try (var str = Files.lines(Path.of("/home/vlofgren/Work/data-domains.txt"))) { - str.map(DomainData::new) - .filter(domain -> domain.indexed>1) - .filter(domain -> domain.quality>=0.1) - .peek(domain -> { - if (originDomains.contains(domain.name)) { - originDomainIds.add(domain.id); - } - }) - .forEach(data -> domains.put(data.id, data)); - } - - try (var str = Files.lines(Path.of("/home/vlofgren/Work/data-links.txt"))) { - str.map(s->s.split("\\s+")).forEach(bits -> { - - int src = Integer.parseInt(bits[0]); - int dst = Integer.parseInt(bits[1]); - - if (domains.contains(src) && domains.contains(dst) && domains.get(src).quality >= -5) { - if (!linkData.contains(src)) { - linkData.put(src, new TIntArrayList()); - } - linkData.get(src).add(dst); - - if (!reverseLinkData.contains(dst)) { - reverseLinkData.put(dst, new TIntArrayList()); - } - reverseLinkData.get(dst).add(src); - } - }); - } - } - - private class RankVector { - private final TIntDoubleHashMap rank; - private final double defaultValue; - public RankVector(double defaultValue) { - rank = new TIntDoubleHashMap(domains.size(), 0.75f, -1, defaultValue); - this.defaultValue = defaultValue; - } - - public void set(int id, double value) { - rank.put(id, value); - } - - public 
void increment(int id, double value) { - rank.adjustOrPutValue(id, value, value); - } - - public double get(int id) { - return rank.get(id); - } - - public double norm() { - if (rank.isEmpty()) { - return defaultValue * domains.size(); - } - return Arrays.stream(rank.values()).map(Math::abs).sum(); - } - - public double norm(RankVector other) { - return Arrays.stream(rank.keys()).mapToDouble(k -> Math.abs(rank.get(k) - other.get(k))).sum(); - } - - public TIntList getRanking(IntToDoubleFunction other, int numResults) { - TIntArrayList list = new TIntArrayList(numResults); - - Comparator comparator = Comparator.comparing(e -> Math.sqrt(other.applyAsDouble(e.id) * rank.get(e.id))); - - domains.valueCollection().stream() - .sorted(comparator.reversed()) - .map(DomainData::getId) - .limit(numResults) - .forEach(list::add); - - return list; - } - - public TIntList getRanking2(int numResults) { - TIntArrayList list = new TIntArrayList(numResults); - - Comparator comparator = Comparator.comparing(e -> rank.get(e.id)); - - domains.valueCollection().stream() - .sorted(comparator.reversed()) - .map(DomainData::getId) - .limit(numResults) - .forEach(list::add); - - return list; - } - - public void incrementAll(double v) { - rank.transformValues(oldv -> oldv + v); - } - - int size() { - return domains.size(); - } - } - @Data @AllArgsConstructor - static class DomainData { - - public DomainData(String str) { - String[] parts = str.split("\\s+"); - - id = Integer.parseInt(parts[0]); - name = parts[2]; - indexed = Integer.parseInt(parts[3]); - state = Integer.parseInt(parts[4]); - quality = Double.parseDouble(parts[5]); - } - public final int id; - public final String name; - public final int indexed; - public final int state; - public double quality; - } - -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexDomainQueryService.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexDomainQueryService.java deleted file mode 
100644 index 90ac84a4..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexDomainQueryService.java +++ /dev/null @@ -1,110 +0,0 @@ -package nu.marginalia.wmsa.edge.index.svc; - -import com.google.gson.Gson; -import com.google.inject.Inject; -import com.google.inject.Singleton; -import io.prometheus.client.Histogram; -import nu.marginalia.util.array.buffer.LongQueryBuffer; -import nu.marginalia.util.dict.OffHeapDictionaryHashMap; -import nu.marginalia.wmsa.client.GsonFactory; -import nu.marginalia.wmsa.edge.index.postings.SearchIndexControl; -import nu.marginalia.wmsa.edge.index.query.IndexResultDomainDeduplicator; -import nu.marginalia.wmsa.edge.index.query.IndexSearchBudget; -import nu.marginalia.wmsa.edge.model.EdgeUrl; -import nu.marginalia.wmsa.edge.model.id.EdgeIdList; -import nu.marginalia.wmsa.edge.model.search.domain.EdgeDomainSearchResults; -import nu.marginalia.wmsa.edge.model.search.domain.EdgeDomainSearchSpecification; -import org.apache.http.HttpStatus; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import spark.HaltException; -import spark.Request; -import spark.Response; -import spark.Spark; - -import java.util.OptionalInt; - -import static spark.Spark.halt; - -@Singleton -public class EdgeIndexDomainQueryService { - - private final Logger logger = LoggerFactory.getLogger(getClass()); - - private static final Histogram wmsa_edge_index_domain_query_time = Histogram.build().name("wmsa_edge_index_domain_query_time").linearBuckets(25/1000., 25/1000., 15).help("-").register(); - - private final Gson gson = GsonFactory.get(); - - private final SearchIndexControl indexes; - - @Inject - public EdgeIndexDomainQueryService(SearchIndexControl indexes) { - this.indexes = indexes; - } - - public Object searchDomain(Request request, Response response) { - if (indexes.getLexiconReader() == null) { - logger.warn("Dictionary reader not yet initialized"); - halt(HttpStatus.SC_SERVICE_UNAVAILABLE, "Come back in a few 
minutes"); - } - - String json = request.body(); - EdgeDomainSearchSpecification specsSet = gson.fromJson(json, EdgeDomainSearchSpecification.class); - - try { - return new EdgeDomainSearchResults("", new EdgeIdList<>()); - // fixme - // return wmsa_edge_index_domain_query_time.time(() -> queryDomain(specsSet)); - } - catch (HaltException ex) { - logger.warn("Halt", ex); - throw ex; - } - catch (Exception ex) { - logger.info("Error during domain search {}({}) (query: {})", ex.getClass().getSimpleName(), ex.getMessage(), json); - logger.info("Error", ex); - Spark.halt(500, "Error"); - return null; - } - } - - public EdgeDomainSearchResults queryDomain(EdgeDomainSearchSpecification specsSet) { - - final OptionalInt wordId = lookUpWord(specsSet.keyword); - final EdgeIdList urlIds = new EdgeIdList<>(); - - final IndexSearchBudget budget = new IndexSearchBudget(50); - - if (wordId.isEmpty()) { - return new EdgeDomainSearchResults(specsSet.keyword, urlIds); - } - - LongQueryBuffer buffer = new LongQueryBuffer(512); - - - final IndexResultDomainDeduplicator localFilter = new IndexResultDomainDeduplicator(1); - var query = indexes.getIndex().getDomainQuery(wordId.getAsInt(), localFilter); - - while (query.hasMore() && urlIds.size() < specsSet.maxResults) { - query.getMoreResults(buffer); - - for (int i = 0; i < buffer.end && urlIds.size() < specsSet.maxResults; i++) { - long result = buffer.data[i]; - if (localFilter.test(result)) { - urlIds.add((int) (result & 0xFFFF_FFFFL)); - } - } - } - - return new EdgeDomainSearchResults(specsSet.keyword, urlIds); - } - - private OptionalInt lookUpWord(String s) { - int ret = indexes.getLexiconReader().get(s); - if (ret == OffHeapDictionaryHashMap.NO_VALUE) { - return OptionalInt.empty(); - } - return OptionalInt.of(ret); - } - -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexLexiconService.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexLexiconService.java deleted 
file mode 100644 index 40f7cf64..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexLexiconService.java +++ /dev/null @@ -1,126 +0,0 @@ -package nu.marginalia.wmsa.edge.index.svc; - -import com.google.inject.Inject; -import com.google.inject.Singleton; -import com.google.protobuf.InvalidProtocolBufferException; -import nu.marginalia.util.dict.OffHeapDictionaryHashMap; -import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DocumentKeywords; -import nu.marginalia.wmsa.edge.converting.interpreter.instruction.KeywordListChunker; -import nu.marginalia.wmsa.edge.index.IndexServicesFactory; -import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexicon; -import nu.marginalia.wmsa.edge.index.model.EdgePageDocumentsMetadata; -import nu.marginalia.wmsa.edge.index.postings.SearchIndexControl; -import nu.marginalia.wmsa.edge.index.postings.journal.model.SearchIndexJournalEntry; -import nu.marginalia.wmsa.edge.index.postings.journal.model.SearchIndexJournalEntryHeader; -import nu.marginalia.wmsa.edge.index.postings.journal.writer.SearchIndexJournalWriterImpl; -import nu.marginalia.wmsa.edge.model.EdgeDomain; -import nu.marginalia.wmsa.edge.model.EdgeUrl; -import nu.marginalia.wmsa.edge.model.id.EdgeId; -import nu.wmsa.wmsa.edge.index.proto.IndexPutKeywordsReq; -import org.apache.http.HttpStatus; -import spark.Request; -import spark.Response; - -import java.util.Arrays; - -@Singleton -public class EdgeIndexLexiconService { - - private final SearchIndexControl indexes; - private final KeywordLexicon keywordLexicon; - - @Inject - public EdgeIndexLexiconService(SearchIndexControl indexes, IndexServicesFactory servicesFactory) { - this.indexes = indexes; - this.keywordLexicon = servicesFactory.getKeywordLexicon(); - } - - public EdgeIndexLexiconService(SearchIndexControl indexes, KeywordLexicon lexicon) { - this.indexes = indexes; - this.keywordLexicon = lexicon; - } - - public Object getWordId(Request request, Response response) { - 
final String word = request.splat()[0]; - - var lr = indexes.getLexiconReader(); - if (null == lr) { - response.status(HttpStatus.SC_FAILED_DEPENDENCY); - return ""; - } - - final int wordId = lr.get(word); - - if (OffHeapDictionaryHashMap.NO_VALUE == wordId) { - response.status(404); - return ""; - } - - return wordId; - } - - public long getOrInsertWord(String word) { - return keywordLexicon.getOrInsert(word); - } - - public Object putWords(Request request, Response response) throws InvalidProtocolBufferException { - var req = IndexPutKeywordsReq.parseFrom(request.bodyAsBytes()); - - EdgeId domainId = new EdgeId<>(req.getDomain()); - EdgeId urlId = new EdgeId<>(req.getUrl()); - int idx = req.getIndex(); - - for (int ws = 0; ws < req.getWordSetCount(); ws++) { - putWords(domainId, urlId, req.getWordSet(ws), idx); - } - - response.status(HttpStatus.SC_ACCEPTED); - return ""; - } - - public void putWords(int idx, SearchIndexJournalEntryHeader header, SearchIndexJournalEntry entry) { - SearchIndexJournalWriterImpl indexWriter = indexes.getIndexWriter(idx); - - indexWriter.put(header, entry); - } - - public void putWords(EdgeId domainId, EdgeId urlId, - IndexPutKeywordsReq.WordSet words, int idx - ) { - - SearchIndexJournalWriterImpl indexWriter = indexes.getIndexWriter(idx); - - var wordArray = words.getWordsList().toArray(String[]::new); - var metaArray = words.getMetaList().stream().mapToLong(Long::valueOf).toArray(); - - DocumentKeywords documentKeywords = new DocumentKeywords(wordArray, metaArray); - for (var chunk : KeywordListChunker.chopList(documentKeywords, SearchIndexJournalEntry.MAX_LENGTH)) { - var entry = new SearchIndexJournalEntry(getOrInsertWordIds(chunk.keywords(), chunk.metadata())); - var header = new SearchIndexJournalEntryHeader(domainId, urlId, EdgePageDocumentsMetadata.defaultValue()); - - indexWriter.put(header, entry); - } - } - - private long[] getOrInsertWordIds(String[] words, long[] meta) { - long[] ids = new long[words.length*2]; - int 
putIdx = 0; - - for (int i = 0; i < words.length; i++) { - String word = words[i]; - - long id = keywordLexicon.getOrInsert(word); - if (id != OffHeapDictionaryHashMap.NO_VALUE) { - ids[putIdx++] = id; - ids[putIdx++] = meta[i]; - } - } - - if (putIdx != words.length*2) { - ids = Arrays.copyOf(ids, putIdx); - } - return ids; - } - - -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexOpsService.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexOpsService.java deleted file mode 100644 index bf1fd459..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexOpsService.java +++ /dev/null @@ -1,42 +0,0 @@ -package nu.marginalia.wmsa.edge.index.svc; - -import com.google.inject.Inject; -import com.google.inject.Singleton; -import nu.marginalia.wmsa.edge.index.postings.SearchIndexControl; -import spark.Request; -import spark.Response; -import spark.Spark; - -@Singleton -public class EdgeIndexOpsService { - - private final SearchIndexControl indexes; - private final EdgeOpsLockService opsLockService; - private final EdgeIndexSearchSetsService searchSetService; - - @Inject - public EdgeIndexOpsService(SearchIndexControl indexes, - EdgeOpsLockService opsLockService, - EdgeIndexSearchSetsService searchSetService) { - this.indexes = indexes; - this.opsLockService = opsLockService; - this.searchSetService = searchSetService; - } - - public Object repartitionEndpoint(Request request, Response response) throws Exception { - - if (!opsLockService.run(searchSetService::recalculateAll)) { - Spark.halt(503, "Operations busy"); - } - return "OK"; - } - - public Object reindexEndpoint(Request request, Response response) throws Exception { - - if (!indexes.reindex()) { - Spark.halt(503, "Operations busy"); - } - return "OK"; - } - -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexQueryService.java 
b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexQueryService.java deleted file mode 100644 index 5988df3b..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexQueryService.java +++ /dev/null @@ -1,305 +0,0 @@ -package nu.marginalia.wmsa.edge.index.svc; - -import com.google.gson.Gson; -import com.google.inject.Inject; -import com.google.inject.Singleton; -import gnu.trove.list.TLongList; -import gnu.trove.list.array.TLongArrayList; -import gnu.trove.set.hash.TLongHashSet; -import io.prometheus.client.Counter; -import io.prometheus.client.Gauge; -import io.prometheus.client.Histogram; -import it.unimi.dsi.fastutil.ints.IntArrayList; -import it.unimi.dsi.fastutil.ints.IntList; -import nu.marginalia.util.array.buffer.LongQueryBuffer; -import nu.marginalia.util.dict.OffHeapDictionaryHashMap; -import nu.marginalia.wmsa.client.GsonFactory; -import nu.marginalia.wmsa.edge.index.postings.EdgeIndexQuerySearchTerms; -import nu.marginalia.wmsa.edge.index.postings.IndexResultValuator; -import nu.marginalia.wmsa.edge.index.postings.SearchIndexControl; -import nu.marginalia.wmsa.edge.index.query.IndexQuery; -import nu.marginalia.wmsa.edge.index.query.IndexQueryParams; -import nu.marginalia.wmsa.edge.index.query.IndexResultDomainDeduplicator; -import nu.marginalia.wmsa.edge.index.query.IndexSearchBudget; -import nu.marginalia.wmsa.edge.index.svc.searchset.SearchSet; -import nu.marginalia.wmsa.edge.index.svc.searchset.SmallSearchSet; -import nu.marginalia.wmsa.edge.model.search.EdgeSearchResultItem; -import nu.marginalia.wmsa.edge.model.search.EdgeSearchResultSet; -import nu.marginalia.wmsa.edge.model.search.EdgeSearchSpecification; -import nu.marginalia.wmsa.edge.model.search.EdgeSearchSubquery; -import org.apache.http.HttpStatus; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import spark.HaltException; -import spark.Request; -import spark.Response; -import spark.Spark; - -import java.util.ArrayList; -import 
java.util.List; -import java.util.OptionalInt; -import java.util.function.LongPredicate; - -import static java.util.Comparator.comparingDouble; -import static spark.Spark.halt; - -@Singleton -public class EdgeIndexQueryService { - - private final Logger logger = LoggerFactory.getLogger(getClass()); - - private static final Counter wmsa_edge_index_query_timeouts = Counter.build().name("wmsa_edge_index_query_timeouts").help("-").register(); - private static final Gauge wmsa_edge_index_query_cost = Gauge.build().name("wmsa_edge_index_query_cost").help("-").register(); - private static final Histogram wmsa_edge_index_query_time = Histogram.build().name("wmsa_edge_index_query_time").linearBuckets(25/1000., 25/1000., 15).help("-").register(); - - private final Gson gson = GsonFactory.get(); - - private final SearchIndexControl indexes; - private final EdgeIndexSearchSetsService searchSetsService; - - @Inject - public EdgeIndexQueryService(SearchIndexControl indexes, EdgeIndexSearchSetsService searchSetsService) { - this.indexes = indexes; - this.searchSetsService = searchSetsService; - } - - public Object search(Request request, Response response) { - if (indexes.getLexiconReader() == null) { - logger.warn("Dictionary reader not yet initialized"); - halt(HttpStatus.SC_SERVICE_UNAVAILABLE, "Come back in a few minutes"); - } - - String json = request.body(); - EdgeSearchSpecification specsSet = gson.fromJson(json, EdgeSearchSpecification.class); - - try { - return wmsa_edge_index_query_time.time(() -> query(specsSet)); - } - catch (HaltException ex) { - logger.warn("Halt", ex); - throw ex; - } - catch (Exception ex) { - logger.info("Error during search {}({}) (query: {})", ex.getClass().getSimpleName(), ex.getMessage(), json); - logger.info("Error", ex); - Spark.halt(500, "Error"); - return null; - } - } - - - public EdgeSearchResultSet query(EdgeSearchSpecification specsSet) { - SearchQuery searchQuery = new SearchQuery(specsSet); - - List results = searchQuery.execute(); 
- - wmsa_edge_index_query_cost.set(searchQuery.getDataCost()); - - if (!searchQuery.hasTimeLeft()) { - wmsa_edge_index_query_timeouts.inc(); - } - - return new EdgeSearchResultSet(results); - } - - private class SearchQuery { - private final int fetchSize; - private final IndexSearchBudget budget; - private final List subqueries; - private long dataCost = 0; - private final IndexQueryParams queryParams; - - private final int limitByDomain; - private final int limitTotal; - - TLongHashSet consideredUrlIds; - - public SearchQuery(EdgeSearchSpecification specsSet) { - var limits = specsSet.queryLimits; - - this.fetchSize = limits.fetchSize(); - this.budget = new IndexSearchBudget(limits.timeoutMs()); - this.subqueries = specsSet.subqueries; - this.limitByDomain = limits.resultsByDomain(); - this.limitTotal = limits.resultsTotal(); - - this.consideredUrlIds = new TLongHashSet(fetchSize * 4); - - queryParams = new IndexQueryParams( - specsSet.quality, - specsSet.year, - specsSet.size, - specsSet.rank, - getSearchSet(specsSet), - specsSet.queryStrategy); - } - - private List execute() { - final TLongList results = new TLongArrayList(fetchSize); - - for (var sq : subqueries) { - final EdgeIndexQuerySearchTerms searchTerms = getSearchTerms(sq); - - if (searchTerms.isEmpty()) { - continue; - } - - TLongArrayList resultsForSubquery = performSearch(searchTerms); - results.addAll(resultsForSubquery); - - if (!budget.hasTimeLeft()) { - logger.info("Query timed out {}, ({}), -{}", - sq.searchTermsInclude, sq.searchTermsAdvice, sq.searchTermsExclude); - break; - } - } - - final var evaluator = new IndexResultValuator(indexes, results, subqueries, queryParams); - - ArrayList items = new ArrayList<>(results.size()); - ArrayList refusedItems = new ArrayList<>(results.size()); - - // Sorting the result ids results in better paging characteristics - results.sort(); - - results.forEach(id -> { - var item = evaluator.evaluateResult(id); - - // Score value is zero when the best query 
variant consists of low-value terms that are just scattered - // throughout the document, with no indicators of importance associated with them. - if (item.getScoreValue() < 0) { - items.add(item); - } - else { - refusedItems.add(item); - } - - return true; - }); - - if (items.isEmpty()) { - items.addAll(refusedItems); - } - - return selectResults(items); - } - - - private TLongArrayList performSearch(EdgeIndexQuerySearchTerms terms) - { - final TLongArrayList results = new TLongArrayList(fetchSize); - final LongQueryBuffer buffer = new LongQueryBuffer(fetchSize); - - - IndexQuery query = getQuery(terms, queryParams, consideredUrlIds::add); - - while (query.hasMore() && results.size() < fetchSize && budget.hasTimeLeft()) { - buffer.reset(); - query.getMoreResults(buffer); - - for (int i = 0; i < buffer.size() && results.size() < fetchSize; i++) { - results.add(buffer.data[i]); - } - } - - dataCost += query.dataCost(); - - return results; - } - - private SearchSet getSearchSet(EdgeSearchSpecification specsSet) { - - if (specsSet.domains != null && !specsSet.domains.isEmpty()) { - return new SmallSearchSet(specsSet.domains); - } - - return searchSetsService.getSearchSetByName(specsSet.searchSetIdentifier); - } - - private List selectResults(List results) { - - var domainCountFilter = new IndexResultDomainDeduplicator(limitByDomain); - - results.sort(comparingDouble(EdgeSearchResultItem::getScore) - .thenComparingInt(EdgeSearchResultItem::getRanking) - .thenComparingInt(EdgeSearchResultItem::getUrlIdInt)); - - List resultsList = new ArrayList<>(results.size()); - - for (var item : results) { - if (domainCountFilter.test(item)) { - resultsList.add(item); - } - } - - if (resultsList.size() > limitTotal) { - // This can't be made a stream limit() operation because we need domainCountFilter - // to run over the entire list to provide accurate statistics - - resultsList.subList(limitTotal, resultsList.size()).clear(); - } - - for (var result : resultsList) { - 
result.resultsFromDomain = domainCountFilter.getCount(result); - } - - return resultsList; - } - - private IndexQuery getQuery(EdgeIndexQuerySearchTerms terms, IndexQueryParams params, LongPredicate includePred) { - return indexes.getIndex().getQuery(terms, params, includePred); - } - - public boolean hasTimeLeft() { - return budget.hasTimeLeft(); - } - - public long getDataCost() { - return dataCost; - } - - } - - private EdgeIndexQuerySearchTerms getSearchTerms(EdgeSearchSubquery request) { - final IntList excludes = new IntArrayList(); - final IntList includes = new IntArrayList(); - final IntList priority = new IntArrayList(); - - for (var include : request.searchTermsInclude) { - var word = lookUpWord(include); - if (word.isEmpty()) { - logger.debug("Unknown search term: " + include); - return new EdgeIndexQuerySearchTerms(); - } - includes.add(word.getAsInt()); - } - - for (var advice : request.searchTermsAdvice) { - var word = lookUpWord(advice); - if (word.isEmpty()) { - logger.debug("Unknown search term: " + advice); - return new EdgeIndexQuerySearchTerms(); - } - includes.add(word.getAsInt()); - } - - for (var exclude : request.searchTermsExclude) { - lookUpWord(exclude).ifPresent(excludes::add); - } - for (var exclude : request.searchTermsPriority) { - lookUpWord(exclude).ifPresent(priority::add); - } - - return new EdgeIndexQuerySearchTerms(includes, excludes, priority); - } - - - private OptionalInt lookUpWord(String s) { - int ret = indexes.getLexiconReader().get(s); - if (ret == OffHeapDictionaryHashMap.NO_VALUE) { - return OptionalInt.empty(); - } - return OptionalInt.of(ret); - } - -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeOpsLockService.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeOpsLockService.java deleted file mode 100644 index 99dbd5fb..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeOpsLockService.java +++ /dev/null @@ -1,42 +0,0 @@ -package 
nu.marginalia.wmsa.edge.index.svc; - -import javax.annotation.CheckReturnValue; -import javax.inject.Singleton; -import java.util.Optional; -import java.util.concurrent.Callable; -import java.util.concurrent.locks.ReentrantLock; - -@Singleton -public class EdgeOpsLockService { - public ReentrantLock opsLock = new ReentrantLock(); - - @CheckReturnValue - public Optional run(Callable c) throws Exception { - if (!opsLock.tryLock()) - return Optional.empty(); - try { - return Optional.of(c.call()); - } - finally { - opsLock.unlock(); - } - } - - - @CheckReturnValue - public boolean run(Runnable r) { - if (!opsLock.tryLock()) - return false; - try { - r.run(); - return true; - } - finally { - opsLock.unlock(); - } - } - - public boolean isLocked() { - return opsLock.isLocked(); - } -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/integration/arxiv/ArxivParser.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/integration/arxiv/ArxivParser.java deleted file mode 100644 index cc02196c..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/integration/arxiv/ArxivParser.java +++ /dev/null @@ -1,29 +0,0 @@ -package nu.marginalia.wmsa.edge.integration.arxiv; - -import com.google.gson.Gson; -import nu.marginalia.wmsa.client.GsonFactory; -import nu.marginalia.wmsa.edge.integration.arxiv.model.ArxivMetadata; - -import java.io.File; -import java.io.IOException; -import java.nio.file.Files; -import java.util.ArrayList; -import java.util.List; - -public class ArxivParser { - private final Gson gson = GsonFactory.get(); - - public ArxivParser() { - - } - - public List parse(File jsonFile) throws IOException { - - List ret = new ArrayList<>(); - try (var lines = Files.lines(jsonFile.toPath())) { - lines.map(line -> gson.fromJson(line, ArxivMetadata.class)).forEach(ret::add); - } - - return ret; - } -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/integration/arxiv/model/ArxivMetadata.java 
b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/integration/arxiv/model/ArxivMetadata.java deleted file mode 100644 index ba6307a8..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/integration/arxiv/model/ArxivMetadata.java +++ /dev/null @@ -1,21 +0,0 @@ -package nu.marginalia.wmsa.edge.integration.arxiv.model; - -import com.google.gson.annotations.SerializedName; -import lombok.AllArgsConstructor; -import lombok.Getter; -import lombok.NoArgsConstructor; - -@Getter -@AllArgsConstructor @NoArgsConstructor -public class ArxivMetadata { - public String id; - public String submitter; - public String authors; - public String title; - @SerializedName("abstract") - public String _abstract; - - public String getAbstract() { - return _abstract; - } -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/integration/model/BasicDocumentData.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/integration/model/BasicDocumentData.java deleted file mode 100644 index 4e620fc3..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/integration/model/BasicDocumentData.java +++ /dev/null @@ -1,22 +0,0 @@ -package nu.marginalia.wmsa.edge.integration.model; - -import lombok.AllArgsConstructor; -import lombok.Data; -import nu.marginalia.wmsa.edge.model.EdgeUrl; -import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainLink; -import nu.marginalia.wmsa.edge.model.crawl.EdgePageWords; - - -@Data -@AllArgsConstructor -public class BasicDocumentData { - public final EdgeUrl url; - - public final String title; - public final String description; - public int hashCode; - - public final EdgePageWords words; - public final EdgeDomainLink[] domainLinks; - public final int wordCount; -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/integration/stackoverflow/StackOverflowPostProcessor.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/integration/stackoverflow/StackOverflowPostProcessor.java deleted file mode 100644 index 
3c8bda78..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/integration/stackoverflow/StackOverflowPostProcessor.java +++ /dev/null @@ -1,83 +0,0 @@ -package nu.marginalia.wmsa.edge.integration.stackoverflow; - -import com.google.inject.Inject; -import nu.marginalia.util.language.processing.DocumentKeywordExtractor; -import nu.marginalia.util.language.processing.sentence.SentenceExtractor; -import nu.marginalia.util.language.processing.model.KeywordMetadata; -import nu.marginalia.wmsa.edge.converting.processor.logic.LinkParser; -import nu.marginalia.wmsa.edge.integration.model.BasicDocumentData; -import nu.marginalia.wmsa.edge.integration.stackoverflow.model.StackOverflowPost; -import nu.marginalia.wmsa.edge.model.EdgeUrl; -import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainLink; -import org.apache.commons.lang3.StringUtils; -import org.jsoup.Jsoup; -import org.jsoup.nodes.Document; - -import java.util.ArrayList; -import java.util.List; -import java.util.Objects; - -public class StackOverflowPostProcessor { - private final LinkParser linkParser = new LinkParser(); - - private final SentenceExtractor sentenceExtractor; - private final DocumentKeywordExtractor documentKeywordExtractor; - - @Inject - public StackOverflowPostProcessor(SentenceExtractor sentenceExtractor, DocumentKeywordExtractor documentKeywordExtractor) { - this.sentenceExtractor = sentenceExtractor; - this.documentKeywordExtractor = documentKeywordExtractor; - } - - public BasicDocumentData process(StackOverflowPost post) { - - final var docUrl = post.getUrl(); - final var doc = Jsoup.parseBodyFragment(""+post.getTitle()+"" + post.getFullBody()); - - EdgeDomainLink[] domainLinks = getDomainLinks(docUrl, doc); - - for (var tag : doc.getElementsByTag("code")) { - if (tag.text().length() > 32) { - tag.remove(); - } - } - - var dld = sentenceExtractor.extractSentences(doc); - var keywords = documentKeywordExtractor.extractKeywords(dld, new KeywordMetadata()); - - 
keywords.addJustNoMeta("site:"+post.getUrl().domain); - keywords.addJustNoMeta("site:"+post.getUrl().domain); - keywords.addJustNoMeta("special:wikipedia"); - keywords.addJustNoMeta("special:wikipedia"); - keywords.addJustNoMeta("js:true"); - - String title = StringUtils.abbreviate(post.getTitle(), 255); - String description = StringUtils.abbreviate(Jsoup.parseBodyFragment(post.getJustBody()).text(), 255); - - return new BasicDocumentData(docUrl, title, description, post.fullBody.hashCode(), keywords, domainLinks, - dld.totalNumWords()); - - } - - private EdgeDomainLink[] getDomainLinks(EdgeUrl docUrl, Document doc) { - List links = new ArrayList<>(10); - - for (var tag : doc.getElementsByTag("a")) { - if (!tag.hasAttr("href")) { - continue; - } - String href = tag.attr("href"); - if (href.length()<10 || !href.contains(".") || !href.contains("://")) { - continue; - } - - linkParser.parseLink(docUrl, tag) - .filter(url -> !Objects.equals(docUrl.getDomain(), url.getDomain())) - .ifPresent(links::add); - } - - return links.stream().map(EdgeUrl::getDomain).map(domain -> new EdgeDomainLink(docUrl.domain, domain)) - .distinct().toArray(EdgeDomainLink[]::new); - } - -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/integration/stackoverflow/StackOverflowPostsReader.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/integration/stackoverflow/StackOverflowPostsReader.java deleted file mode 100644 index 88921be1..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/integration/stackoverflow/StackOverflowPostsReader.java +++ /dev/null @@ -1,123 +0,0 @@ -package nu.marginalia.wmsa.edge.integration.stackoverflow; - -import gnu.trove.map.hash.TIntObjectHashMap; -import lombok.SneakyThrows; -import nu.marginalia.wmsa.edge.integration.stackoverflow.model.StackOverflowPost; -import nu.marginalia.wmsa.edge.integration.stackoverflow.model.StackOverflowQuestionData; -import nu.marginalia.wmsa.edge.model.EdgeDomain; -import 
nu.marginalia.wmsa.edge.model.EdgeUrl; -import org.xml.sax.Attributes; -import org.xml.sax.SAXException; -import org.xml.sax.helpers.DefaultHandler; - -import javax.xml.parsers.SAXParser; -import javax.xml.parsers.SAXParserFactory; -import java.util.ArrayList; -import java.util.Deque; -import java.util.LinkedList; -import java.util.function.Consumer; - -public class StackOverflowPostsReader extends DefaultHandler { - private static final int MAX_QUESTION_WINDOW_SIZE = 10_000; - - private final Thread runThread; - private final String postsFile; - private final EdgeDomain domain; - private final Consumer postConsumer; - - private final Deque questionWindow = new LinkedList<>(); - private final TIntObjectHashMap questionsById = new TIntObjectHashMap<>(1_000_000); - - public StackOverflowPostsReader(String postsFile, EdgeDomain domain, Consumer postConsumer) { - this.postsFile = postsFile; - this.domain = domain; - this.postConsumer = postConsumer; - runThread = new Thread(this::run, "StackOverflowPostReader"); - runThread.start(); - - } - - @Override - public void startElement(String uri, String lName, String qName, Attributes attr) throws SAXException { - if (!"row".equals(qName)) { - return; - } - - if ("1".equals(attr.getValue("PostTypeId"))) { - onQuestion(attr); - } - if ("2".equals(attr.getValue("PostTypeId"))) { - onReply(attr); - } - - while (questionWindow.size() > MAX_QUESTION_WINDOW_SIZE) { - var data = questionWindow.removeFirst(); - finalizeQuestion(data); - } - - } - - private void finalizeQuestion(StackOverflowQuestionData data) { - questionsById.remove(data.getId()); - var post = createPost(data); - postConsumer.accept(post); - } - - private StackOverflowPost createPost(StackOverflowQuestionData data) { - EdgeUrl url = new EdgeUrl("https", domain, null, "/questions/"+data.getId(), null); - - StringBuilder body = new StringBuilder(); - body.append(data.getQuestion()); - data.getReplies().forEach(body::append); - - return new StackOverflowPost(url, 
data.getTitle(), body.toString(), data.getQuestion()); - } - - - private void onQuestion(Attributes attr) { - String id = attr.getValue("Id"); - String title = attr.getValue("Title"); - String body = attr.getValue("Body"); - String score = attr.getValue("Score"); - if (parseInt(score) < 0) - return; - - var data = new StackOverflowQuestionData(parseInt(id), title, body, new ArrayList<>()); - questionsById.put(data.getId(), data); - questionWindow.addLast(data); - } - - private void onReply(Attributes attr) { - String parentId = attr.getValue("ParentId"); - String body = attr.getValue("Body"); - String score = attr.getValue("Score"); - if (parseInt(score) < 0) - return; - - var data = questionsById.get(parseInt(parentId)); - if (data != null) { - data.getReplies().add(body); - } - } - - private int parseInt(String id) { - return Integer.parseInt(id); - } - - @SneakyThrows - private void run() { - SAXParserFactory factory = SAXParserFactory.newInstance(); - SAXParser saxParser = factory.newSAXParser(); - - saxParser.parse(postsFile, this); - - while (!questionWindow.isEmpty()) { - var data = questionWindow.removeFirst(); - finalizeQuestion(data); - } - } - - public void join() throws InterruptedException { - runThread.join(); - } -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/integration/stackoverflow/model/StackOverflowPost.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/integration/stackoverflow/model/StackOverflowPost.java deleted file mode 100644 index 03cc1c90..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/integration/stackoverflow/model/StackOverflowPost.java +++ /dev/null @@ -1,14 +0,0 @@ -package nu.marginalia.wmsa.edge.integration.stackoverflow.model; - -import lombok.AllArgsConstructor; -import lombok.Data; -import lombok.ToString; -import nu.marginalia.wmsa.edge.model.EdgeUrl; - -@Data @AllArgsConstructor @ToString -public class StackOverflowPost { - public EdgeUrl url; - public String title; - public 
String fullBody; - public String justBody; -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/integration/stackoverflow/model/StackOverflowQuestionData.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/integration/stackoverflow/model/StackOverflowQuestionData.java deleted file mode 100644 index 52e2ff6e..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/integration/stackoverflow/model/StackOverflowQuestionData.java +++ /dev/null @@ -1,14 +0,0 @@ -package nu.marginalia.wmsa.edge.integration.stackoverflow.model; - -import lombok.AllArgsConstructor; -import lombok.Data; - -import java.util.List; - -@Data @AllArgsConstructor -public class StackOverflowQuestionData { - int id; - String title; - String question; - List replies; -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/integration/wikipedia/WikipediaProcessor.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/integration/wikipedia/WikipediaProcessor.java deleted file mode 100644 index 67c51751..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/integration/wikipedia/WikipediaProcessor.java +++ /dev/null @@ -1,81 +0,0 @@ -package nu.marginalia.wmsa.edge.integration.wikipedia; - -import nu.marginalia.util.language.processing.DocumentKeywordExtractor; -import nu.marginalia.util.language.processing.sentence.SentenceExtractor; -import nu.marginalia.util.language.processing.model.KeywordMetadata; -import nu.marginalia.wmsa.edge.converting.processor.logic.LinkParser; -import nu.marginalia.wmsa.edge.integration.model.BasicDocumentData; -import nu.marginalia.wmsa.edge.integration.wikipedia.model.WikipediaArticle; -import nu.marginalia.wmsa.edge.model.EdgeUrl; -import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainLink; -import org.apache.commons.lang3.StringUtils; -import org.jsoup.Jsoup; -import org.jsoup.nodes.Document; - -import java.util.ArrayList; -import java.util.List; -import java.util.Objects; - -public class WikipediaProcessor { 
- private final LinkParser linkParser = new LinkParser(); - - private final SentenceExtractor sentenceExtractor; - private final DocumentKeywordExtractor documentKeywordExtractor; - - public WikipediaProcessor(SentenceExtractor sentenceExtractor, DocumentKeywordExtractor documentKeywordExtractor) { - this.sentenceExtractor = sentenceExtractor; - this.documentKeywordExtractor = documentKeywordExtractor; - } - - - public BasicDocumentData process(WikipediaArticle post) { - - final var docUrl = post.getUrl(); - final var doc = Jsoup.parseBodyFragment(post.body); - - String title = StringUtils.abbreviate(doc.getElementsByTag("title").text(), 255); - String description = getSummary(doc); - - EdgeDomainLink[] domainLinks = getDomainLinks(docUrl, doc); - - var dld = sentenceExtractor.extractSentences(doc); - var keywords = documentKeywordExtractor.extractKeywords(dld, new KeywordMetadata()); - - keywords.addJustNoMeta("site:"+post.getUrl().domain); - keywords.addJustNoMeta("special:stackoverflow"); - keywords.addJustNoMeta("special:stackoverflow"); - keywords.addJustNoMeta("js:true"); - - return new BasicDocumentData(docUrl, title, description, post.body.hashCode(), keywords, domainLinks, - dld.totalNumWords()); - - } - - private String getSummary(Document doc) { - doc = doc.clone(); - doc.select("table,sup,.reference").remove(); - return StringUtils.abbreviate(doc.select("#bodyContent p").text(), 255); - } - - private EdgeDomainLink[] getDomainLinks(EdgeUrl docUrl, Document doc) { - List links = new ArrayList<>(10); - - for (var tag : doc.getElementsByTag("a")) { - if (!tag.hasAttr("href")) { - continue; - } - String href = tag.attr("href"); - if (href.length()<10 || !href.contains(".") || !href.contains("://")) { - continue; - } - - linkParser.parseLink(docUrl, tag) - .filter(url -> !Objects.equals(docUrl.getDomain(), url.getDomain())) - .ifPresent(links::add); - } - - return links.stream().map(EdgeUrl::getDomain).map(domain -> new EdgeDomainLink(docUrl.domain, domain)) 
- .distinct().toArray(EdgeDomainLink[]::new); - } - -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/integration/wikipedia/WikipediaReader.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/integration/wikipedia/WikipediaReader.java deleted file mode 100644 index fa5904c9..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/integration/wikipedia/WikipediaReader.java +++ /dev/null @@ -1,46 +0,0 @@ -package nu.marginalia.wmsa.edge.integration.wikipedia; - -import lombok.SneakyThrows; -import nu.marginalia.wmsa.edge.integration.wikipedia.model.WikipediaArticle; -import nu.marginalia.wmsa.edge.model.EdgeDomain; -import nu.marginalia.wmsa.edge.model.EdgeUrl; -import org.openzim.ZIMTypes.ZIMFile; -import org.openzim.ZIMTypes.ZIMReader; - -import java.util.function.Consumer; - -public class WikipediaReader { - - private final Thread runThread; - private final String zimFile; - private final EdgeDomain domain; - private final Consumer postConsumer; - - public WikipediaReader(String zimFile, EdgeDomain domain, Consumer postConsumer) { - this.zimFile = zimFile; - this.domain = domain; - this.postConsumer = postConsumer; - - runThread = new Thread(this::run, "WikipediaReader"); - runThread.start(); - } - - @SneakyThrows - private void run() { - var zr = new ZIMReader(new ZIMFile(zimFile)); - - zr.forEachArticles((originalUrl, art) -> { - if (art != null) { - postConsumer.accept(new WikipediaArticle(synthesizeUrl(originalUrl), art)); - } - }, p -> true); - } - - private EdgeUrl synthesizeUrl(String originalUrl) { - return new EdgeUrl("https", domain, null, "/wiki/"+originalUrl, null); - } - - public void join() throws InterruptedException { - runThread.join(); - } -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/integration/wikipedia/model/WikipediaArticle.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/integration/wikipedia/model/WikipediaArticle.java deleted file mode 100644 index bc221492..00000000 --- 
a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/integration/wikipedia/model/WikipediaArticle.java +++ /dev/null @@ -1,12 +0,0 @@ -package nu.marginalia.wmsa.edge.integration.wikipedia.model; - -import lombok.AllArgsConstructor; -import lombok.Data; -import nu.marginalia.wmsa.edge.model.EdgeUrl; - -@Data -@AllArgsConstructor -public class WikipediaArticle { - public final EdgeUrl url; - public final String body; -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/WideHashable.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/WideHashable.java deleted file mode 100644 index 3b95711d..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/WideHashable.java +++ /dev/null @@ -1,5 +0,0 @@ -package nu.marginalia.wmsa.edge.model; - -public interface WideHashable { - long wideHash(); -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/crawl/EdgeIndexTask.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/crawl/EdgeIndexTask.java deleted file mode 100644 index 1212bfa9..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/crawl/EdgeIndexTask.java +++ /dev/null @@ -1,33 +0,0 @@ -package nu.marginalia.wmsa.edge.model.crawl; - -import lombok.AllArgsConstructor; -import lombok.Getter; -import lombok.ToString; -import nu.marginalia.wmsa.edge.model.EdgeDomain; -import nu.marginalia.wmsa.edge.model.EdgeUrl; - -import java.util.ArrayList; -import java.util.List; -import java.util.stream.Stream; - -@Getter @AllArgsConstructor @ToString -public class EdgeIndexTask { - public final EdgeDomain domain; - public final List visited = new ArrayList<>(); - public final List urls = new ArrayList<>(); - public final int pass; - public final int limit; - public double rank; - - public boolean isEmpty() { - return domain == null || urls.isEmpty(); - } - - public Stream streamUrls() { - return urls.stream(); - } - - public int size() { - return urls.size(); - } -} diff --git 
a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/crawl/EdgePageContent.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/crawl/EdgePageContent.java deleted file mode 100644 index 596d389a..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/crawl/EdgePageContent.java +++ /dev/null @@ -1,25 +0,0 @@ -package nu.marginalia.wmsa.edge.model.crawl; - -import lombok.Data; -import nu.marginalia.wmsa.edge.model.EdgeUrl; - -import java.util.Map; -import java.util.Set; - -@Data -public class EdgePageContent { - public final EdgeUrl url; - public final EdgePageWords words; - public final Map> linkWords; - public final EdgePageMetadata metadata; - public final int hash; - public final String ipAddress; - - public boolean hasHotLink(EdgeUrl url) { - return linkWords.containsKey(url); - } - - public int numWords() { - return metadata.totalWords; - } -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/crawl/EdgePageMetadata.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/crawl/EdgePageMetadata.java deleted file mode 100644 index bb192f9e..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/crawl/EdgePageMetadata.java +++ /dev/null @@ -1,50 +0,0 @@ -package nu.marginalia.wmsa.edge.model.crawl; - -import lombok.*; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -@AllArgsConstructor @EqualsAndHashCode @Getter @Setter @ToString @With -public class EdgePageMetadata { - public final int features; - public final int scriptTags; - public final int rawLength; - public final int textBodyLength; - public final int textDistinctWords; - public final String title; - public final String description; - public final double smutCoefficient; - public final int totalWords; - public final EdgeHtmlStandard htmlStandard; - private static final Logger logger = LoggerFactory.getLogger(EdgePageMetadata.class); - private static EdgePageMetadata _empty - = new EdgePageMetadata(0, 0, - 0, - 
0, - 0, - "", - "", - 0., - 1, - EdgeHtmlStandard.UNKNOWN); - public static EdgePageMetadata empty() { - return _empty; - } - - public double quality() { - if (rawLength == 0 || textBodyLength == 0) { - return -5.; - } - -/* double dictionaryFactor = textDistinctWords / 10000.; - if (dictionaryFactor < 0.1) { - dictionaryFactor = 0; - }*/ - - return Math.log(textBodyLength / (double) rawLength)*htmlStandard.scale - + htmlStandard.offset - - scriptTags - // - dictionaryFactor - - smutCoefficient; - } -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/crawl/EdgeRawPageContents.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/crawl/EdgeRawPageContents.java deleted file mode 100644 index 8bc5c8d2..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/crawl/EdgeRawPageContents.java +++ /dev/null @@ -1,24 +0,0 @@ -package nu.marginalia.wmsa.edge.model.crawl; - -import lombok.AllArgsConstructor; -import lombok.Data; -import lombok.Getter; -import nu.marginalia.wmsa.edge.model.EdgeUrl; - -@Data @Getter @AllArgsConstructor -public class EdgeRawPageContents { - public final EdgeUrl url; - public final EdgeUrl redirectUrl; - public final String data; - public final EdgeContentType contentType; - public final String ip; - public boolean hasCookies; - public final String fetchTimestamp; - - public boolean isAfter(String dateIso8601) { - if (fetchTimestamp == null) { - return false; - } - return fetchTimestamp.compareTo(dateIso8601) >= 0; - } -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/crawl/EdgeRobotsTxt.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/crawl/EdgeRobotsTxt.java deleted file mode 100644 index 5a8bfaea..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/crawl/EdgeRobotsTxt.java +++ /dev/null @@ -1,10 +0,0 @@ -package nu.marginalia.wmsa.edge.model.crawl; - -import lombok.*; -import nu.marginalia.wmsa.edge.model.EdgeDomain; - 
-@AllArgsConstructor @EqualsAndHashCode @Getter @Setter @Builder -public class EdgeRobotsTxt { - public final EdgeDomain domain; - public final String robotsTxt; -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/crawl/EdgeUrlVisit.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/crawl/EdgeUrlVisit.java deleted file mode 100644 index 07d7492e..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/crawl/EdgeUrlVisit.java +++ /dev/null @@ -1,21 +0,0 @@ -package nu.marginalia.wmsa.edge.model.crawl; - -import lombok.Data; -import nu.marginalia.wmsa.edge.model.EdgeUrl; - -@Data -public class EdgeUrlVisit { - public final EdgeUrl url; - public final Integer data_hash_code; - public final Double quality; - public final String title; - public final String description; - public final String ipAddress; - public final String format; - public final int features; - - public final int wordCountDistinct; - public final int wordCountTotal; - - public final EdgeUrlState urlState; -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchResultsKey.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchResultsKey.java deleted file mode 100644 index aefee330..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchResultsKey.java +++ /dev/null @@ -1,13 +0,0 @@ -package nu.marginalia.wmsa.edge.model.search; - -import lombok.AllArgsConstructor; -import lombok.EqualsAndHashCode; -import lombok.Getter; - -@EqualsAndHashCode -@AllArgsConstructor -@Getter -public class EdgeSearchResultsKey { - public final int bucket; - public final int searchTermCount; -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchMain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchMain.java deleted file mode 100644 index 66347d2d..00000000 --- 
a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchMain.java +++ /dev/null @@ -1,38 +0,0 @@ -package nu.marginalia.wmsa.edge.search; - -import com.google.inject.Guice; -import com.google.inject.Inject; -import com.google.inject.Injector; -import nu.marginalia.wmsa.configuration.MainClass; -import nu.marginalia.wmsa.configuration.ServiceDescriptor; -import nu.marginalia.wmsa.configuration.module.ConfigurationModule; -import nu.marginalia.wmsa.configuration.module.DatabaseModule; -import nu.marginalia.wmsa.configuration.server.Initialization; -import spark.Spark; - -import java.io.IOException; - -public class EdgeSearchMain extends MainClass { - private final EdgeSearchService service; - - @Inject - public EdgeSearchMain(EdgeSearchService service) { - this.service = service; - } - - public static void main(String... args) { - init(ServiceDescriptor.EDGE_SEARCH, args); - - Spark.staticFileLocation("/static/edge/"); - - Injector injector = Guice.createInjector( - new EdgeSearchModule(), - new ConfigurationModule(), - new DatabaseModule() - ); - - injector.getInstance(EdgeSearchMain.class); - injector.getInstance(Initialization.class).setReady(); - - } -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/IndexCommand.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/IndexCommand.java deleted file mode 100644 index 46a8a437..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/IndexCommand.java +++ /dev/null @@ -1,40 +0,0 @@ -package nu.marginalia.wmsa.edge.search.command; - -import com.google.inject.Inject; -import com.google.inject.Singleton; -import nu.marginalia.wmsa.edge.dbcommon.EdgeDataStoreDao; -import nu.marginalia.wmsa.edge.dbcommon.EdgeDomainBlacklist; -import nu.marginalia.wmsa.edge.search.model.BrowseResultSet; -import nu.marginalia.wmsa.edge.search.results.BrowseResultCleaner; -import nu.marginalia.wmsa.renderer.mustache.MustacheRenderer; -import 
nu.marginalia.wmsa.renderer.mustache.RendererFactory; -import spark.Request; -import spark.Response; - -import java.io.IOException; - -@Singleton -public class IndexCommand { - - private final EdgeDataStoreDao dataStoreDao; - private final BrowseResultCleaner browseResultCleaner; - private final MustacheRenderer template; - private final EdgeDomainBlacklist blacklist; - @Inject - public IndexCommand(EdgeDataStoreDao dataStoreDao, RendererFactory rendererFactory, BrowseResultCleaner browseResultCleaner, EdgeDomainBlacklist blacklist) throws IOException { - this.dataStoreDao = dataStoreDao; - this.browseResultCleaner = browseResultCleaner; - - template = rendererFactory.renderer("edge/index"); - this.blacklist = blacklist; - } - - public String render(Request request, Response response) { - response.header("Cache-control", "public,max-age=3600"); - - var results = dataStoreDao.getRandomDomains(5, blacklist, 0); - results.removeIf(browseResultCleaner.shouldRemoveResultPredicate()); - - return template.render(new BrowseResultSet(results.stream().limit(1).toList())); - } -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/SearchParameters.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/SearchParameters.java deleted file mode 100644 index ff5662ea..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/SearchParameters.java +++ /dev/null @@ -1,9 +0,0 @@ -package nu.marginalia.wmsa.edge.search.command; - -import nu.marginalia.wmsa.edge.search.model.EdgeSearchProfile; - -public record SearchParameters(EdgeSearchProfile profile, SearchJsParameter js, boolean detailedResults) { - public String profileStr() { - return profile.name; - } -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/ConvertCommand.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/ConvertCommand.java deleted file mode 100644 index 933d93e1..00000000 
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/ConvertCommand.java +++ /dev/null @@ -1,35 +0,0 @@ -package nu.marginalia.wmsa.edge.search.command.commands; - -import com.google.inject.Inject; -import nu.marginalia.wmsa.configuration.server.Context; -import nu.marginalia.wmsa.edge.search.command.SearchCommandInterface; -import nu.marginalia.wmsa.edge.search.command.SearchParameters; -import nu.marginalia.wmsa.edge.search.svc.EdgeSearchUnitConversionService; -import nu.marginalia.wmsa.renderer.mustache.MustacheRenderer; -import nu.marginalia.wmsa.renderer.mustache.RendererFactory; - -import java.io.IOException; -import java.util.Map; -import java.util.Optional; - -public class ConvertCommand implements SearchCommandInterface { - private final EdgeSearchUnitConversionService edgeSearchUnitConversionService; - private final MustacheRenderer> conversionRenderer; - - @Inject - public ConvertCommand(EdgeSearchUnitConversionService edgeSearchUnitConversionService, RendererFactory rendererFactory) throws IOException { - this.edgeSearchUnitConversionService = edgeSearchUnitConversionService; - - conversionRenderer = rendererFactory.renderer("edge/conversion-results"); - } - - @Override - public Optional process(Context ctx, SearchParameters parameters, String query) { - var conversion = edgeSearchUnitConversionService.tryConversion(ctx, query); - if (conversion.isEmpty()) { - return Optional.empty(); - } - - return Optional.of(conversionRenderer.render(Map.of("query", query, "result", conversion.get(), "profile", parameters.profileStr()))); - } -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/SearchCommand.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/SearchCommand.java deleted file mode 100644 index ccbb91ec..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/SearchCommand.java +++ /dev/null @@ -1,59 +0,0 @@ -package 
nu.marginalia.wmsa.edge.search.command.commands; - -import com.google.inject.Inject; -import nu.marginalia.wmsa.configuration.server.Context; -import nu.marginalia.wmsa.edge.dbcommon.EdgeDataStoreDao; -import nu.marginalia.wmsa.edge.dbcommon.EdgeDomainBlacklist; -import nu.marginalia.wmsa.edge.search.EdgeSearchOperator; -import nu.marginalia.wmsa.edge.search.command.SearchCommandInterface; -import nu.marginalia.wmsa.edge.search.command.SearchParameters; -import nu.marginalia.wmsa.edge.search.model.DecoratedSearchResults; -import nu.marginalia.wmsa.edge.search.query.model.EdgeUserSearchParameters; -import nu.marginalia.wmsa.edge.search.results.BrowseResultCleaner; -import nu.marginalia.wmsa.renderer.mustache.MustacheRenderer; -import nu.marginalia.wmsa.renderer.mustache.RendererFactory; - -import java.io.IOException; -import java.util.Optional; - -public class SearchCommand implements SearchCommandInterface { - private final EdgeDomainBlacklist blacklist; - private final EdgeDataStoreDao dataStoreDao; - private final EdgeSearchOperator searchOperator; - private final MustacheRenderer searchResultsRenderer; - private final BrowseResultCleaner browseResultCleaner; - - public static final int MAX_DOMAIN_RESULTS = 3; - - @Inject - public SearchCommand(EdgeDomainBlacklist blacklist, - EdgeDataStoreDao dataStoreDao, - EdgeSearchOperator searchOperator, - RendererFactory rendererFactory, - BrowseResultCleaner browseResultCleaner - ) throws IOException { - this.blacklist = blacklist; - this.dataStoreDao = dataStoreDao; - this.searchOperator = searchOperator; - this.browseResultCleaner = browseResultCleaner; - - searchResultsRenderer = rendererFactory.renderer("edge/search-results"); - } - - @Override - public Optional process(Context ctx, SearchParameters parameters, String query) { - - EdgeUserSearchParameters params = new EdgeUserSearchParameters(query, parameters.profile(), parameters.js()); - DecoratedSearchResults results = searchOperator.doSearch(ctx, params); - - 
results.results.removeIf(detail -> blacklist.isBlacklisted(dataStoreDao.getDomainId(detail.url.domain))); - - results.domainResults.removeIf(browseResultCleaner.shouldRemoveResultPredicate()); - - if (results.domainResults.size() > MAX_DOMAIN_RESULTS) { - results.domainResults.subList(MAX_DOMAIN_RESULTS, results.domainResults.size()).clear(); - } - - return Optional.of(searchResultsRenderer.render(results)); - } -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/QueryParser.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/QueryParser.java deleted file mode 100644 index 5551a67a..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/QueryParser.java +++ /dev/null @@ -1,529 +0,0 @@ -package nu.marginalia.wmsa.edge.search.query; - -import lombok.EqualsAndHashCode; -import lombok.ToString; -import nu.marginalia.util.TransformList; -import nu.marginalia.util.language.WordPatterns; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.util.ArrayList; -import java.util.Collections; -import java.util.Comparator; -import java.util.List; -import java.util.function.Predicate; -import java.util.regex.Pattern; -import java.util.stream.Collectors; -import java.util.stream.Stream; - -import static java.util.stream.Stream.concat; - -public class QueryParser { - private static final Logger logger = LoggerFactory.getLogger(QueryParser.class); - - private final EnglishDictionary englishDictionary; - private final QueryVariants queryVariants; - - public QueryParser(EnglishDictionary englishDictionary, QueryVariants queryVariants) { - this.englishDictionary = englishDictionary; - this.queryVariants = queryVariants; - } - - public List parse(String query) { - List basicTokens = extractBasicTokens(query); - - TransformList list = new TransformList<>(basicTokens); - - list.transformEach(QueryParser::handleQuoteTokens); - list.transformEach(QueryParser::trimLiterals); - 
list.transformEachPair(QueryParser::createNegatedTerms); - list.transformEachPair(QueryParser::createPriorityTerms); - list.transformEach(QueryParser::handleSpecialOperations); - list.scanAndTransform(TokenType.LPAREN, TokenType.RPAREN, QueryParser::handleAdvisoryTerms); - - return list.getBackingList(); - } - - private static void handleQuoteTokens(TransformList.Entity entity) { - var t = entity.value; - if (t.type == TokenType.QUOT) { - entity.replace(new Token(TokenType.QUOT_TERM, - t.str.replaceAll("\\s+", WordPatterns.WORD_TOKEN_JOINER), - t.displayStr)); - } - } - - private static void trimLiterals(TransformList.Entity entity) { - var t = entity.value; - - if (t.type == TokenType.LITERAL_TERM - && (t.str.endsWith(":") || t.str.endsWith(".")) - && t.str.length() > 1) { - entity.replace(new Token(TokenType.LITERAL_TERM, t.str.substring(0, t.str.length() - 1), t.displayStr)); - } - - } - - private static void createNegatedTerms(TransformList.Entity first, TransformList.Entity second) { - var t = first.value; - var tn = second.value; - - if (t.type == TokenType.MINUS && tn.type == TokenType.LITERAL_TERM) { - first.remove(); - second.replace(new Token(TokenType.EXCLUDE_TERM, tn.str, "-" + tn.str)); - } - } - private static void createPriorityTerms(TransformList.Entity first, TransformList.Entity second) { - var t = first.value; - var tn = second.value; - - if (t.type == TokenType.QMARK && tn.type == TokenType.LITERAL_TERM) { - first.remove(); - second.replace(new Token(TokenType.PRIORTY_TERM, tn.str, "?" 
+ tn.str)); - } - } - private static void handleSpecialOperations(TransformList.Entity entity) { - var t = entity.value; - if (t.type == TokenType.LITERAL_TERM) { - if (t.str.startsWith("q") && t.str.matches("q[=><]\\d+")) { - entity.replace(new Token(TokenType.QUALITY_TERM, t.str.substring(1), t.displayStr)); - } else if (t.str.startsWith("near:")) { - entity.replace(new Token(TokenType.NEAR_TERM, t.str.substring(5), t.displayStr)); - } else if (t.str.startsWith("year") && t.str.matches("year[=><]\\d{4}")) { - entity.replace(new Token(TokenType.YEAR_TERM, t.str.substring(4), t.displayStr)); - } else if (t.str.startsWith("size") && t.str.matches("size[=><]\\d+")) { - entity.replace(new Token(TokenType.SIZE_TERM, t.str.substring(4), t.displayStr)); - } else if (t.str.startsWith("rank") && t.str.matches("rank[=><]\\d+")) { - entity.replace(new Token(TokenType.RANK_TERM, t.str.substring(4), t.displayStr)); - } else if (t.str.startsWith("qs=")) { - entity.replace(new Token(TokenType.QS_TERM, t.str.substring(3), t.displayStr)); - } else if (t.str.contains(":")) { - entity.replace(new Token(TokenType.ADVICE_TERM, t.str, t.displayStr)); - } - } - } - - private static void handleAdvisoryTerms(TransformList.Entity entity) { - var t = entity.value; - if (t.type == TokenType.LPAREN) { - entity.remove(); - } else if (t.type == TokenType.RPAREN) { - entity.remove(); - } else if (t.type == TokenType.LITERAL_TERM) { - entity.replace(new Token(TokenType.ADVICE_TERM, t.str, "(" + t.str + ")")); - } - } - - private static final Pattern noisePattern = Pattern.compile("[,]"); - - public List extractBasicTokens(String rawQuery) { - List tokens = new ArrayList<>(); - - String query = noisePattern.matcher(rawQuery).replaceAll(" "); - - for (int i = 0; i < query.length(); i++) { - int chr = query.charAt(i); - - if ('(' == chr) { - tokens.add(new Token(TokenType.LPAREN, query.substring(i, i+1).toLowerCase(), query.substring(i, i+1))); - } - else if (')' == chr) { - tokens.add(new 
Token(TokenType.RPAREN, query.substring(i, i+1).toLowerCase(), query.substring(i, i+1))); - } - else if ('"' == chr) { - int end = query.indexOf('"', i+1); - if (end == -1) { - end = query.length(); - } - tokens.add(new Token(TokenType.QUOT, - query.substring(i+1, end).toLowerCase(), - query.substring(i, Math.min(query.length(), end+1)))); - i = end; - } - else if ('\u201C' == chr) { - int end = query.indexOf('\u201D', i+1); - if (end == -1) { - end = query.length(); - } - tokens.add(new Token(TokenType.QUOT, - query.substring(i+1, end).toLowerCase(), - query.substring(i, Math.min(query.length(), end+1)))); - i = end; - } - else if ('-' == chr) { - tokens.add(new Token(TokenType.MINUS, "-")); - } - else if ('?' == chr) { - tokens.add(new Token(TokenType.QMARK, "?")); - } - else if (Character.isSpaceChar(chr)) { - // - } - else { - - int end = i+1; - for (; end < query.length(); end++) { - if (query.charAt(end) == ' ' || query.charAt(end) == ')') - break; - } - tokens.add(new Token(TokenType.LITERAL_TERM, - query.substring(i, end).toLowerCase(), - query.substring(i, end))); - i = end-1; - } - } - return tokens; - } - - - public List> variantQueries(List items) { - int start = -1; - int end = items.size(); - - for (int i = 0; i < items.size(); i++) { - var token = items.get(i); - - if (start < 0) { - if (token.type == TokenType.LITERAL_TERM && WordPatterns.wordQualitiesPredicate.test(token.str)) { - start = i; - } - } - else { - if (token.type != TokenType.LITERAL_TERM || !WordPatterns.wordPredicateEither.test(token.str)) { - end = i; - break; - } - } - } - - if (start >= 0 && end - start > 1) { - List> variantParts = getVariantSearchTerms(items.subList(start, end)); - int s = start; - int e = end; - return variantParts.stream().map(part -> - concat(items.subList(0, s).stream(), concat(part.stream(), items.subList(e, items.size()).stream())) - .collect(Collectors.toList())) - .peek(lst -> lst.removeIf(this::isJunkWord)) - .limit(24) - .collect(Collectors.toList()); - 
} - else { - return List.of(items); - } - } - - - - public List> permuteQueries(List items) { - int start = -1; - int end = items.size(); - - for (int i = 0; i < items.size(); i++) { - var token = items.get(i); - - if (start < 0) { - if (token.type == TokenType.LITERAL_TERM && WordPatterns.wordQualitiesPredicate.test(token.str)) { - start = i; - } - } - else { - if (token.type != TokenType.LITERAL_TERM || !WordPatterns.wordPredicateEither.test(token.str)) { - end = i; - break; - } - } - } - - if (start >= 0 && end - start > 1) { - List> permuteParts = combineSearchTerms(items.subList(start, end)); - int s = start; - int e = end; - return permuteParts.stream().map(part -> - concat(items.subList(0, s).stream(), concat(part.stream(), items.subList(e, items.size()).stream())) - .collect(Collectors.toList())) - .peek(lst -> lst.removeIf(this::isJunkWord)) - .limit(24) - .collect(Collectors.toList()); - } - else { - return List.of(items); - } - } - - - public List> permuteQueriesNew(List items) { - int start = -1; - int end = items.size(); - - for (int i = 0; i < items.size(); i++) { - var token = items.get(i); - - if (start < 0) { - if (token.type == TokenType.LITERAL_TERM && WordPatterns.wordQualitiesPredicate.test(token.str)) { - start = i; - } - } - else { - if (token.type != TokenType.LITERAL_TERM || !WordPatterns.wordPredicateEither.test(token.str)) { - end = i; - break; - } - } - } - - if (start >= 0 && end - start >= 1) { - var result = queryVariants.getQueryVariants(items.subList(start, end)); - - logger.debug("{}", result); - - if (result.isEmpty()) { - logger.warn("Empty variants result, falling back on old code"); - return permuteQueries(items); - } - - List> queryVariants = new ArrayList<>(); - for (var query : result.faithful) { - var tokens = query.terms.stream().map(term -> new Token(TokenType.LITERAL_TERM, term)).collect(Collectors.toList()); - tokens.addAll(result.nonLiterals); - - queryVariants.add(tokens); - } - for (var query : result.alternative) { 
- if (queryVariants.size() >= 6) - break; - - var tokens = query.terms.stream().map(term -> new Token(TokenType.LITERAL_TERM, term)).collect(Collectors.toList()); - tokens.addAll(result.nonLiterals); - - queryVariants.add(tokens); - } - - List> returnValue = new ArrayList<>(queryVariants.size()); - for (var variant: queryVariants) { - List r = new ArrayList<>(start + variant.size() + (items.size() - end)); - r.addAll(items.subList(0, start)); - r.addAll(variant); - r.addAll(items.subList(end, items.size())); - returnValue.add(r); - } - - return returnValue; - - } - else { - return List.of(items); - } - } - - private boolean isJunkWord(Token token) { - if (WordPatterns.isStopWord(token.str) && - !token.str.matches("^(\\d+|([a-z]+:.*))$")) { - return true; - } - return switch (token.str) { - case "vs", "versus", "or", "and" -> true; - default -> false; - }; - } - - private List> getVariantSearchTerms(List subList) { - int size = subList.size(); - if (size < 1) { - return Collections.emptyList(); - } - else if (size == 1) { - if (WordPatterns.isStopWord(subList.get(0).str)) { - return Collections.emptyList(); - } - return getWordVariants(subList.get(0)).map(List::of).collect(Collectors.toList()); - } - - List> cdrs = getVariantSearchTerms(subList.subList(1, subList.size())); - List cars = getWordVariants(subList.get(0)).collect(Collectors.toList()); - - List> ret = new ArrayList<>(cars.size() * cdrs.size()); - for (var car : cars) { - if (ret.size() >= 32) { - break; - } - for (var cdr : cdrs) { - ret.add(List.of(joinTokens(prepend(car, cdr)))); - } - } - return ret; - } - - private Stream getWordVariants(Token token) { - var s = token.str; - int sl = s.length(); - Stream base = Stream.of(token); - Stream alternatives; - if (sl < 2) { - return base; - } - if (s.endsWith("s")) { - alternatives = Stream.of(s.substring(0, sl-1), s + "es"); - } - else if (s.matches(".*(\\w)\\1ing$") && sl > 4) { // humming, clapping - var basea = s.substring(0, sl-4); - var baseb = 
s.substring(0, sl-3); - alternatives = Stream.of(basea, baseb + "ed"); - } - else { - alternatives = Stream.of(s+"s", s+"ing", s+"ed"); - } - - return Stream.concat(Stream.of(token), alternatives.filter(englishDictionary::isWord).map(str -> new Token(token.type, str, token.displayStr))); - } - - private List prepend(Token t, List lst) { - List ret = new ArrayList<>(lst.size() + 1); - ret.add(t); - ret.addAll(lst); - return ret; - } - - private List> combineSearchTerms(List subList) { - int size = subList.size(); - if (size < 1) { - return Collections.emptyList(); - } - else if (size == 1) { - if (WordPatterns.isStopWord(subList.get(0).str)) { - return Collections.emptyList(); - } - return List.of(subList); - } - - List> results = new ArrayList<>(size*(size+1)/2); - - if (subList.size() <= 4 && subList.get(0).str.length() >= 2 && !isPrefixWord(subList.get(subList.size()-1).str)) { - results.add(List.of(joinTokens(subList))); - } -outer: for (int i = size - 1; i >= 1; i--) { - - var left = combineSearchTerms(subList.subList(0, i)); - var right = combineSearchTerms(subList.subList(i, size)); - - for (var l : left) { - if (results.size() > 48) { - break outer; - } - - for (var r : right) { - if (results.size() > 48) { - break outer; - } - - List combined = new ArrayList<>(l.size() + r.size()); - combined.addAll(l); - combined.addAll(r); - if (!results.contains(combined)) { - results.add(combined); - } - } - } - } - if (!results.contains(subList)) { - results.add(subList); - } - Comparator> tc = (o1, o2) -> { - int dJoininess = o2.stream().mapToInt(s->(int)Math.pow(joininess(s.str), 2)).sum() - - o1.stream().mapToInt(s->(int)Math.pow(joininess(s.str), 2)).sum(); - if (dJoininess == 0) { - return (o2.stream().mapToInt(s->(int)Math.pow(rightiness(s.str), 2)).sum() - - o1.stream().mapToInt(s->(int)Math.pow(rightiness(s.str), 2)).sum()); - } - return (int) Math.signum(dJoininess); - }; - results.sort(tc); - return results; - } - - private boolean isPrefixWord(String str) { 
- return switch (str) { - case "the", "of", "when" -> true; - default -> false; - }; - } - - private boolean isSuffixWord(String str) { - return (str.length() < 2); - } - - - int joininess(String s) { - return (int) s.chars().filter(c -> c == '_').count(); - } - int rightiness(String s) { - int rightiness = 0; - for (int i = 0; i < s.length(); i++) { - if (s.charAt(i) == '_') { - rightiness+=i; - } - } - return rightiness; - } - - private Token joinTokens(List subList) { - return new Token(TokenType.LITERAL_TERM, - subList.stream().map(t -> t.str).collect(Collectors.joining("_")), - subList.stream().map(t -> t.str).collect(Collectors.joining(" "))); - } -} - -@ToString @EqualsAndHashCode -class Token { - public TokenType type; - public String str; - public final String displayStr; - - Token(TokenType type, String str, String displayStr) { - this.type = type; - this.str = str; - this.displayStr = safeString(displayStr); - } - - - Token(TokenType type, String str) { - this.type = type; - this.str = str; - this.displayStr = safeString(str); - } - - private static String safeString(String s) { - return s.replaceAll("<", "<") - .replaceAll(">", ">"); - } -} - -enum TokenType implements Predicate { - TERM, - - - LITERAL_TERM, - QUOT_TERM, - EXCLUDE_TERM, - ADVICE_TERM, - PRIORTY_TERM, - - QUALITY_TERM, - YEAR_TERM, - SIZE_TERM, - RANK_TERM, - NEAR_TERM, - - QS_TERM, - - QUOT, - MINUS, - QMARK, - LPAREN, - RPAREN, - - IGNORE; - - public boolean test(Token t) { - return t.type == this; - } -} \ No newline at end of file diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/model/EdgeUserSearchParameters.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/model/EdgeUserSearchParameters.java deleted file mode 100644 index a84273fb..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/model/EdgeUserSearchParameters.java +++ /dev/null @@ -1,7 +0,0 @@ -package nu.marginalia.wmsa.edge.search.query.model; - 
-import nu.marginalia.wmsa.edge.search.command.SearchJsParameter; -import nu.marginalia.wmsa.edge.search.model.EdgeSearchProfile; - -public record EdgeUserSearchParameters (String humanQuery, EdgeSearchProfile profile, SearchJsParameter jsSetting) { -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/results/UrlDeduplicator.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/results/UrlDeduplicator.java deleted file mode 100644 index 603731a7..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/results/UrlDeduplicator.java +++ /dev/null @@ -1,39 +0,0 @@ -package nu.marginalia.wmsa.edge.search.results; - -import gnu.trove.map.hash.TObjectIntHashMap; -import gnu.trove.set.hash.TIntHashSet; -import nu.marginalia.wmsa.edge.model.search.EdgeUrlDetails; - -public class UrlDeduplicator { - private final TIntHashSet seenSuperficialhashes = new TIntHashSet(200); - private final TIntHashSet seenDataHashes = new TIntHashSet(200); - private final TObjectIntHashMap keyCount = new TObjectIntHashMap<>(200, 0.75f, 0); - - private final int resultsPerKey; - public UrlDeduplicator(int resultsPerKey) { - this.resultsPerKey = resultsPerKey; - } - - public boolean shouldRemove(EdgeUrlDetails details) { - return !filter(details); - } - public synchronized boolean filter(EdgeUrlDetails details) { - if (!seenSuperficialhashes.add(details.getSuperficialHash())) { - return false; - } - if (!seenDataHashes.add(details.getDataHash())) { - return false; - } - final var domain = details.getUrl().getDomain(); - final String key; - - if (!details.isSpecialDomain()) { - key = domain.getLongDomainKey(); - } - else { - key = domain.getDomainKey(); - } - - return keyCount.adjustOrPutValue(key, 1, 1) < resultsPerKey; - } -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/svc/EdgeSearchApiQueryService.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/svc/EdgeSearchApiQueryService.java deleted file mode 
100644 index 6c5ae7fd..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/svc/EdgeSearchApiQueryService.java +++ /dev/null @@ -1,55 +0,0 @@ -package nu.marginalia.wmsa.edge.search.svc; - -import com.google.common.base.Strings; -import com.google.inject.Inject; -import lombok.SneakyThrows; -import nu.marginalia.wmsa.api.model.ApiSearchResult; -import nu.marginalia.wmsa.api.model.ApiSearchResults; -import nu.marginalia.wmsa.configuration.server.Context; -import nu.marginalia.wmsa.edge.search.EdgeSearchOperator; -import nu.marginalia.wmsa.edge.search.command.SearchJsParameter; -import nu.marginalia.wmsa.edge.search.model.EdgeSearchProfile; -import nu.marginalia.wmsa.edge.search.query.model.EdgeUserSearchParameters; -import spark.Request; -import spark.Response; - -import java.util.stream.Collectors; - -public class EdgeSearchApiQueryService { - private EdgeSearchOperator searchOperator; - - @Inject - public EdgeSearchApiQueryService(EdgeSearchOperator searchOperator) { - this.searchOperator = searchOperator; - } - - @SneakyThrows - public Object apiSearch(Request request, Response response) { - - final var ctx = Context.fromRequest(request); - final String queryParam = request.queryParams("query"); - final int limit; - EdgeSearchProfile profile = EdgeSearchProfile.YOLO; - - String count = request.queryParamOrDefault("count", "20"); - limit = Integer.parseInt(count); - - String index = request.queryParamOrDefault("index", "0"); - if (!Strings.isNullOrEmpty(index)) { - profile = switch (index) { - case "0" -> EdgeSearchProfile.YOLO; - case "1" -> EdgeSearchProfile.MODERN; - case "2" -> EdgeSearchProfile.DEFAULT; - case "3" -> EdgeSearchProfile.CORPO_CLEAN; - default -> EdgeSearchProfile.CORPO_CLEAN; - }; - } - - final String humanQuery = queryParam.trim(); - - var results = searchOperator.doApiSearch(ctx, new EdgeUserSearchParameters(humanQuery, profile, SearchJsParameter.DEFAULT)); - - return new ApiSearchResults("RESTRICTED", humanQuery, 
results.stream().map(ApiSearchResult::new).limit(limit).collect(Collectors.toList())); - } - -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/svc/EdgeSearchDomainSearchService.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/svc/EdgeSearchDomainSearchService.java deleted file mode 100644 index 74e8681c..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/svc/EdgeSearchDomainSearchService.java +++ /dev/null @@ -1,65 +0,0 @@ -package nu.marginalia.wmsa.edge.search.svc; - -import com.google.inject.Inject; -import nu.marginalia.wmsa.configuration.server.Context; -import nu.marginalia.wmsa.edge.dbcommon.EdgeDataStoreDao; -import nu.marginalia.wmsa.edge.index.client.EdgeIndexClient; -import nu.marginalia.wmsa.edge.model.EdgeUrl; -import nu.marginalia.wmsa.edge.model.id.EdgeIdList; -import nu.marginalia.wmsa.edge.model.id.EdgeIdSet; -import nu.marginalia.wmsa.edge.model.search.EdgeSearchSpecification; -import nu.marginalia.wmsa.edge.model.search.domain.EdgeDomainSearchSpecification; -import nu.marginalia.wmsa.edge.search.model.BrowseResult; - -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; - -public class EdgeSearchDomainSearchService { - - private final EdgeIndexClient indexClient; - private final EdgeDataStoreDao edgeDataStoreDao; - - @Inject - public EdgeSearchDomainSearchService(EdgeIndexClient indexClient, EdgeDataStoreDao edgeDataStoreDao) { - this.indexClient = indexClient; - this.edgeDataStoreDao = edgeDataStoreDao; - } - - - public List getDomainResults(Context ctx, EdgeSearchSpecification specs) { - - List keywords = getKeywordsFromSpecs(specs); - - if (keywords.isEmpty()) - return Collections.emptyList(); - - List requests = new ArrayList<>(keywords.size()); - - for (var keyword : keywords) { - requests.add(new EdgeDomainSearchSpecification(keyword, - 1_000_000, 3, 25)); - } - - EdgeIdSet dedup = new EdgeIdSet<>(); - EdgeIdList values = new EdgeIdList<>(); - - 
for (var result : indexClient.queryDomains(ctx, requests)) { - for (int id : result.getResults().values()) { - if (dedup.add(id)) - values.add(id); - } - } - - return edgeDataStoreDao.getBrowseResultFromUrlIds(values); - } - - - private List getKeywordsFromSpecs(EdgeSearchSpecification specs) { - return specs.subqueries.stream() - .filter(sq -> sq.searchTermsExclude.isEmpty() && sq.searchTermsInclude.size() == 1 && sq.searchTermsAdvice.isEmpty()) - .map(sq -> sq.searchTermsInclude.get(0)) - .distinct() - .toList(); - } -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/svc/EdgeSearchWikiArticlesService.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/svc/EdgeSearchWikiArticlesService.java deleted file mode 100644 index 7a197763..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/svc/EdgeSearchWikiArticlesService.java +++ /dev/null @@ -1,41 +0,0 @@ -package nu.marginalia.wmsa.edge.search.svc; - -import com.google.inject.Inject; -import com.google.inject.Singleton; -import io.reactivex.rxjava3.core.Observable; -import io.reactivex.rxjava3.schedulers.Schedulers; -import nu.marginalia.wmsa.configuration.server.Context; -import nu.marginalia.wmsa.edge.assistant.dict.WikiArticles; -import nu.marginalia.wmsa.encyclopedia.EncyclopediaClient; -import org.jetbrains.annotations.NotNull; - -import java.util.concurrent.Future; - -@Singleton -public class EdgeSearchWikiArticlesService { - private final EncyclopediaClient encyclopediaClient; - - @Inject - public EdgeSearchWikiArticlesService(EncyclopediaClient encyclopediaClient) { - this.encyclopediaClient = encyclopediaClient; - } - - @NotNull - public Future getWikiArticle(Context ctx, String humanQuery) { - - if (!encyclopediaClient.isAlive()) { - return Observable.just(new WikiArticles()).toFuture(); - } - - return encyclopediaClient - .encyclopediaLookup(ctx, - humanQuery.replaceAll("\\s+", "_") - .replaceAll("\"", "") - ) - .subscribeOn(Schedulers.io()) - 
.onErrorReturn(e -> new WikiArticles()) - .toFuture() - ; - } - -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/EncyclopediaLoaderTool.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/EncyclopediaLoaderTool.java deleted file mode 100644 index f3582b12..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/EncyclopediaLoaderTool.java +++ /dev/null @@ -1,85 +0,0 @@ -package nu.marginalia.wmsa.edge.tools; - -import com.github.luben.zstd.Zstd; -import com.zaxxer.hikari.HikariDataSource; -import nu.marginalia.util.ParallelPipe; -import nu.marginalia.wmsa.configuration.module.DatabaseModule; -import nu.marginalia.wmsa.edge.assistant.dict.WikiCleaner; -import org.mariadb.jdbc.Driver; -import org.openzim.ZIMTypes.ZIMFile; -import org.openzim.ZIMTypes.ZIMReader; - -import java.io.ByteArrayInputStream; -import java.io.IOException; -import java.nio.charset.StandardCharsets; -import java.sql.Connection; -import java.sql.PreparedStatement; -import java.sql.SQLException; - -public class EncyclopediaLoaderTool extends ParallelPipe implements AutoCloseable { - - public static void main(String[] args) throws IOException, InterruptedException, SQLException { - - org.mariadb.jdbc.Driver driver = new Driver(); - - try (var loader = new EncyclopediaLoaderTool(new DatabaseModule().provideConnection())) { - var zr = new ZIMReader(new ZIMFile(args[0])); - - zr.forEachArticles((url, art) -> { - if (art != null) { - loader.accept(new ArticleRaw(url, art)); - } - }, p->true); - - } - catch (Exception ex) { - ex.printStackTrace(); - } - System.exit(0); - } - - public record ArticleRaw(String url, String art) { - public ArticleProcessed toProcessed(String data) { - return new ArticleProcessed(url, data); - } - } - public record ArticleProcessed(String url, String art) {} - - - private final HikariDataSource dataSource; - private final Connection connection; - private final PreparedStatement insertArticleDataStatement; - - private 
final WikiCleaner wikiCleaner = new WikiCleaner(); - - public EncyclopediaLoaderTool(HikariDataSource dataSource) throws SQLException { - super("EncyclopediaPipe", 24, 4, 2); - this.dataSource = dataSource; - this.connection = dataSource.getConnection(); - this.insertArticleDataStatement = connection.prepareStatement("REPLACE INTO REF_WIKI_ARTICLE(NAME, ENTRY) VALUES (?, ?)"); - - } - - @Override - protected ArticleProcessed onProcess(ArticleRaw articleRaw) { - return articleRaw.toProcessed(wikiCleaner.cleanWikiJunk("https://en.wikipedia.org/wiki/" + articleRaw.url, articleRaw.art)); - } - - @Override - protected void onReceive(ArticleProcessed articleProcessed) throws Exception { - if (articleProcessed.art == null) return; - - try (var bs = new ByteArrayInputStream(Zstd.compress(articleProcessed.art.getBytes(StandardCharsets.UTF_8)))) { - insertArticleDataStatement.setString(1, articleProcessed.url); - insertArticleDataStatement.setBlob(2, bs); - insertArticleDataStatement.executeUpdate(); - } - } - - public void close() throws Exception { - join(); - if (insertArticleDataStatement != null) insertArticleDataStatement.close(); - if (connection != null) connection.close(); - if (dataSource != null) dataSource.close(); - } -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/FeaturesLoaderTool.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/FeaturesLoaderTool.java deleted file mode 100644 index a20fc294..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/FeaturesLoaderTool.java +++ /dev/null @@ -1,85 +0,0 @@ -package nu.marginalia.wmsa.edge.tools; - -import com.zaxxer.hikari.HikariDataSource; -import nu.marginalia.wmsa.configuration.module.DatabaseModule; -import nu.marginalia.wmsa.configuration.server.Context; -import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DocumentKeywords; -import nu.marginalia.wmsa.edge.converting.processor.logic.HtmlFeature; -import 
nu.marginalia.wmsa.edge.index.client.EdgeIndexClient; -import nu.marginalia.wmsa.edge.index.model.EdgePageDocumentsMetadata; -import nu.marginalia.wmsa.edge.model.id.EdgeId; - -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; -import java.sql.Connection; -import java.sql.PreparedStatement; -import java.sql.SQLException; -import java.util.HashMap; -import java.util.Map; -import java.util.Objects; - -public class FeaturesLoaderTool { - public static void main(String... args) { - - HtmlFeature feature = HtmlFeature.valueOf(args[0]); - Path file = Path.of(args[1]); - - try (EdgeIndexClient client = new EdgeIndexClient(); - HikariDataSource ds = new DatabaseModule().provideConnection(); - Connection conn = ds.getConnection(); - PreparedStatement ps = conn.prepareStatement("UPDATE EC_PAGE_DATA SET FEATURES = FEATURES | ? WHERE ID=?"); - var linesStream = Files.lines(file)) { - - var urls = getUrls(ds); - linesStream - .map(urls::get) - .filter(Objects::nonNull) - .forEach(id -> { - int urlId = (int)(id & 0xFFFF_FFFFL); - int domainId = (int)(id >>> 32L); - - try { - ps.setInt(2, urlId); - ps.setInt(1, feature.getFeatureBit()); - ps.executeUpdate(); - } - catch (SQLException ex) { - throw new RuntimeException(ex); - } - - client.putWords(Context.internal(), new EdgeId<>(domainId), new EdgeId<>(urlId), - new EdgePageDocumentsMetadata(EdgePageDocumentsMetadata.defaultValue()), - new DocumentKeywords(new String[] { feature.getKeyword() }, new long[] { 0 }) - , 0); - }); - - } catch (IOException | SQLException e) { - throw new RuntimeException(e); - } - } - - private static Map getUrls(HikariDataSource ds) { - - Map urls = new HashMap<>(100_000); - - try (var conn = ds.getConnection(); - var stmt = conn.createStatement()) - { - var rsp = stmt.executeQuery("SELECT URL, ID, DOMAIN_ID FROM EC_URL_VIEW WHERE TITLE IS NOT NULL"); - - while (rsp.next()) { - long val = rsp.getInt(3); - val = (val << 32L) | rsp.getInt(2); - - 
urls.put(rsp.getString(1), val); - } - - } - catch (SQLException ex) { - throw new RuntimeException(ex); - } - - return urls; - } -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/IndexJournalDumpTool.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/IndexJournalDumpTool.java deleted file mode 100644 index 323b1279..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/IndexJournalDumpTool.java +++ /dev/null @@ -1,47 +0,0 @@ -package nu.marginalia.wmsa.edge.tools; - -import com.google.common.hash.Hashing; -import net.agkn.hll.HLL; -import nu.marginalia.util.array.LongArray; -import nu.marginalia.wmsa.edge.index.postings.journal.reader.SearchIndexJournalReaderSingleFile; - -import java.io.IOException; -import java.nio.file.Path; - -public class IndexJournalDumpTool { - public static void main(String... args) throws IOException { - final String operation = args.length > 0 ? args[0] : "help"; - - switch (operation) { - case "dump": - dump(Path.of(args[1])); - break; - case "cardinality": - cardinality(Path.of(args[1])); - break; - default: - System.err.println("Usage: dump|cardinality index-file.dat"); - break; - } - - } - - private static void cardinality(Path file) throws IOException { - var reader = new SearchIndexJournalReaderSingleFile(LongArray.mmapRead(file)); - HLL hyperloglog = new HLL(30, 1); - var hashFunction = Hashing.murmur3_128(); - - for (var entry : reader) { - hyperloglog.addRaw(hashFunction.hashLong(entry.docId()).padToLong()); - } - - System.out.println(hyperloglog.cardinality()); - } - - private static void dump(Path file) throws IOException { - var reader = new SearchIndexJournalReaderSingleFile(LongArray.mmapRead(file)); - for (var entry : reader) { - System.out.printf("%s\t%010d\t%06d:%08d\n", entry.docId(), entry.domainId(), entry.urlId()); - } - } -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/SearchIndexScrubberMain.java 
b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/SearchIndexScrubberMain.java deleted file mode 100644 index 9cb945e4..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/SearchIndexScrubberMain.java +++ /dev/null @@ -1,69 +0,0 @@ -package nu.marginalia.wmsa.edge.tools; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.IOException; -import java.io.RandomAccessFile; -import java.nio.ByteBuffer; -import java.nio.channels.FileChannel; -import java.nio.file.Path; - -public class SearchIndexScrubberMain { - public static final Logger logger = LoggerFactory.getLogger(SearchIndexScrubberMain.class); - private static final int CHUNK_HEADER_SIZE = 16; - - public static void main(String... args) throws IOException { - var inputFile = Path.of(args[0]).toFile(); - var outputFile = Path.of(args[1]).toFile(); - - logger.info("Scrubbing {}", inputFile); - - final RandomAccessFile raf = new RandomAccessFile(inputFile, "r"); - - var fileLength = raf.readLong(); - var wordCount = raf.readInt(); - - logger.info("Word Count: {}", wordCount); - logger.info("File Length: {}", fileLength); - - var channel = raf.getChannel(); - - ByteBuffer inByteBuffer = ByteBuffer.allocateDirect(10_000); - - RandomAccessFile[] randomAccessFiles = new RandomAccessFile[1]; - - for (int i = 0; i < randomAccessFiles.length; i++) { - randomAccessFiles[i] = new RandomAccessFile(outputFile, "rw"); - randomAccessFiles[i].seek(12); - } - FileChannel[] fileChannels = new FileChannel[1]; - for (int i = 0; i < fileChannels.length; i++) { - fileChannels[i] = randomAccessFiles[i].getChannel(); - } - - while (channel.position() < fileLength) { - inByteBuffer.clear(); - inByteBuffer.limit(CHUNK_HEADER_SIZE); - channel.read(inByteBuffer); - inByteBuffer.flip(); - long urlId = inByteBuffer.getLong(); - int chunkBlock = inByteBuffer.getInt(); - int count = inByteBuffer.getInt(); - inByteBuffer.clear(); - inByteBuffer.limit(count*4+CHUNK_HEADER_SIZE); - 
inByteBuffer.putLong(urlId); - inByteBuffer.putInt(chunkBlock); - inByteBuffer.putInt(count); - channel.read(inByteBuffer); - } - - long size = randomAccessFiles[0].getFilePointer(); - - randomAccessFiles[0].seek(0); - randomAccessFiles[0].writeLong(size); - randomAccessFiles[0].writeInt(wordCount); - - randomAccessFiles[0].close(); - } -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/StripSimpleJournalEntriesToolMain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/StripSimpleJournalEntriesToolMain.java deleted file mode 100644 index c3d4a1e6..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/StripSimpleJournalEntriesToolMain.java +++ /dev/null @@ -1,28 +0,0 @@ -package nu.marginalia.wmsa.edge.tools; - -import nu.marginalia.util.array.LongArray; -import nu.marginalia.wmsa.edge.index.postings.journal.reader.SearchIndexJournalCleaner; -import nu.marginalia.wmsa.edge.index.postings.journal.reader.SearchIndexJournalReadEntry; -import nu.marginalia.wmsa.edge.index.postings.journal.reader.SearchIndexJournalReaderSingleFile; - -import java.io.IOException; -import java.nio.file.Path; - -import static nu.marginalia.wmsa.edge.index.model.EdgePageDocumentFlags.Simple; - -public class StripSimpleJournalEntriesToolMain { - - public static void main(String[] args) throws IOException { - Path input = Path.of(args[0]); - Path output = Path.of(args[1]); - - new SearchIndexJournalCleaner(new SearchIndexJournalReaderSingleFile(LongArray.mmapRead(input))) - .clean(output, StripSimpleJournalEntriesToolMain::retainEntry); - - System.out.println("All done!"); - } - - private static boolean retainEntry(SearchIndexJournalReadEntry entry) { - return (entry.header.documentMeta() & Simple.asBit()) == 0; - } -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/encyclopedia/EncyclopediaClient.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/encyclopedia/EncyclopediaClient.java deleted file mode 100644 index 
c2215526..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/encyclopedia/EncyclopediaClient.java +++ /dev/null @@ -1,35 +0,0 @@ -package nu.marginalia.wmsa.encyclopedia; - -import io.reactivex.rxjava3.core.Observable; -import nu.marginalia.wmsa.client.AbstractDynamicClient; -import nu.marginalia.wmsa.client.HttpStatusCode; -import nu.marginalia.wmsa.client.exception.RouteNotConfiguredException; -import nu.marginalia.wmsa.configuration.ServiceDescriptor; -import nu.marginalia.wmsa.configuration.server.Context; -import nu.marginalia.wmsa.edge.assistant.dict.WikiArticles; -import okhttp3.MediaType; -import org.eclipse.jetty.util.UrlEncoded; - -import javax.annotation.CheckReturnValue; - -public class EncyclopediaClient extends AbstractDynamicClient { - public EncyclopediaClient() { - super(ServiceDescriptor.ENCYCLOPEDIA); - } - - @CheckReturnValue - public Observable submitWiki(Context ctx, String url, String data) { - return super.post(ctx, "/wiki/submit?url="+UrlEncoded.encodeString(url), data, MediaType.parse("text/plain; charset=UTF-8")); - } - - @CheckReturnValue - public Observable encyclopediaLookup(Context ctx, String word) { - try { - return super.get(ctx, "/encyclopedia/" + UrlEncoded.encodeString(word), WikiArticles.class); - } - catch (RouteNotConfiguredException ex) { - return Observable.fromSupplier(WikiArticles::new); - } - } - -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/encyclopedia/EncyclopediaDao.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/encyclopedia/EncyclopediaDao.java deleted file mode 100644 index d9b9d5e9..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/encyclopedia/EncyclopediaDao.java +++ /dev/null @@ -1,154 +0,0 @@ -package nu.marginalia.wmsa.encyclopedia; - -import com.github.luben.zstd.ZstdInputStream; -import com.google.inject.Inject; -import com.zaxxer.hikari.HikariDataSource; -import nu.marginalia.wmsa.edge.assistant.dict.WikiArticles; -import 
nu.marginalia.wmsa.edge.assistant.dict.WikiSearchResult; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.OutputStream; -import java.util.*; -import java.util.stream.Collectors; - -public class EncyclopediaDao { - - private final HikariDataSource dataSource; - private static final Logger logger = LoggerFactory.getLogger(EncyclopediaDao.class); - - @Inject - public EncyclopediaDao(HikariDataSource dataSource) { - this.dataSource = dataSource; - } - - public boolean getWikiArticleData(String name, OutputStream outputStream) { - try (var conn = dataSource.getConnection(); - var stmt = conn.prepareStatement("SELECT ENTRY FROM REF_WIKI_ARTICLE WHERE NAME=? AND ENTRY IS NOT NULL")) - { - stmt.setString(1, name); - var rsp = stmt.executeQuery(); - if (rsp.next()) { - new ZstdInputStream(rsp.getBlob(1).getBinaryStream()).transferTo(outputStream); - return true; - } - } - catch (Exception ex) { - logger.error("Failed to fetch article", ex); - } - return false; - } - - public WikiArticles encyclopedia(String term) { - WikiArticles response = new WikiArticles(); - response.entries = new ArrayList<>(); - - try (var connection = dataSource.getConnection()) { - var stmt = connection.prepareStatement("SELECT DISTINCT(NAME) FROM REF_WIKI_ARTICLE WHERE NAME=?"); - stmt.setString(1, term); - - var rsp = stmt.executeQuery(); - while (rsp.next()) { - response.entries.add(capitalizeWikiString(rsp.getString(1))); - } - } - catch (Exception ex) { - logger.error("Failed to fetch articles", ex); - return new WikiArticles(); - } - - return response; - } - - public Optional resolveEncylopediaRedirect(String term) { - final List matches = new ArrayList<>(); - - try (var connection = dataSource.getConnection()) { - try (var stmt = connection.prepareStatement("SELECT NAME, REF_NAME FROM REF_WIKI_ARTICLE WHERE NAME=?")) { - stmt.setString(1, term); - - var rsp = stmt.executeQuery(); - while (rsp.next()) { - if (term.equals(rsp.getString(1)) - || rsp.getString(2) == 
null) { - return Optional.ofNullable(rsp.getString(2)); - } else { - matches.add(rsp.getString(2)); - } - } - } - } - catch (Exception ex) { - throw new RuntimeException(ex); - } - - if (!matches.isEmpty()) { - return Optional.of(matches.get(0)); - } - return Optional.empty(); - } - - - public List findEncyclopediaPages(String term) { - final List directMatches = new ArrayList<>(); - final Set directSearchMatches = new HashSet<>(); - final Set indirectMatches = new HashSet<>(); - - try (var connection = dataSource.getConnection()) { - - try (var stmt = connection.prepareStatement("SELECT NAME, REF_NAME FROM REF_WIKI_ARTICLE WHERE NAME=?")) { - stmt.setString(1, term.replace(' ', '_')); - - var rsp = stmt.executeQuery(); - while (rsp.next()) { - String name = rsp.getString(1); - String refName = rsp.getString(2); - - if (refName == null) { - directMatches.add(new WikiSearchResult(name, null)); - } else { - indirectMatches.add(new WikiSearchResult(name, refName)); - } - } - } - - try (var stmt = connection.prepareStatement("SELECT NAME, REF_NAME FROM REF_WIKI_ARTICLE WHERE NAME LIKE ? 
LIMIT 10")) { - stmt.setString(1, term.replace(' ', '_').replaceAll("%", "\\%").toLowerCase() + "%"); - - var rsp = stmt.executeQuery(); - while (rsp.next()) { - String name = rsp.getString(1); - String refName = rsp.getString(2); - - if (refName == null) { - directSearchMatches.add(new WikiSearchResult(name, null)); - } else { - indirectMatches.add(new WikiSearchResult(name, refName)); - } - } - } - } - catch (Exception ex) { - throw new RuntimeException(ex); - } - - directMatches.forEach(indirectMatches::remove); - indirectMatches.removeAll(directSearchMatches); - directMatches.forEach(directSearchMatches::remove); - directMatches.addAll(indirectMatches); - directMatches.addAll(directSearchMatches); - return directMatches; - } - - private String capitalizeWikiString(String string) { - if (string.contains("_")) { - return Arrays.stream(string.split("_")).map(this::capitalizeWikiString).collect(Collectors.joining("_")); - } - if (string.length() < 2) { - return string.toUpperCase(); - } - return Character.toUpperCase(string.charAt(0)) + string.substring(1).toLowerCase(); - } - - -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/encyclopedia/EncyclopediaMain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/encyclopedia/EncyclopediaMain.java deleted file mode 100644 index ee364dcc..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/encyclopedia/EncyclopediaMain.java +++ /dev/null @@ -1,29 +0,0 @@ -package nu.marginalia.wmsa.encyclopedia; - -import com.google.inject.Guice; -import com.google.inject.Inject; -import com.google.inject.Injector; -import nu.marginalia.wmsa.configuration.MainClass; -import nu.marginalia.wmsa.configuration.ServiceDescriptor; -import nu.marginalia.wmsa.configuration.module.ConfigurationModule; -import nu.marginalia.wmsa.configuration.module.DatabaseModule; - -public class EncyclopediaMain extends MainClass { - private final EncyclopediaService service; - - public static void main(String... 
args) { - init(ServiceDescriptor.ENCYCLOPEDIA, args); - - Injector injector = Guice.createInjector( - new EncyclopediaModule(), - new DatabaseModule(), - new ConfigurationModule()); - injector.getInstance(EncyclopediaMain.class); - } - - @Inject - public EncyclopediaMain(EncyclopediaService service) { - this.service = service; - } - -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/encyclopedia/EncyclopediaModule.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/encyclopedia/EncyclopediaModule.java deleted file mode 100644 index 74301b2d..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/encyclopedia/EncyclopediaModule.java +++ /dev/null @@ -1,9 +0,0 @@ -package nu.marginalia.wmsa.encyclopedia; - -import com.google.inject.AbstractModule; - -public class EncyclopediaModule extends AbstractModule { - @Override - public void configure() { - } -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/encyclopedia/EncyclopediaService.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/encyclopedia/EncyclopediaService.java deleted file mode 100644 index 468e5f22..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/encyclopedia/EncyclopediaService.java +++ /dev/null @@ -1,94 +0,0 @@ -package nu.marginalia.wmsa.encyclopedia; - -import com.google.gson.Gson; -import com.google.inject.Inject; -import com.google.inject.name.Named; -import lombok.SneakyThrows; -import nu.marginalia.wmsa.client.GsonFactory; -import nu.marginalia.wmsa.configuration.server.Initialization; -import nu.marginalia.wmsa.configuration.server.MetricsServer; -import nu.marginalia.wmsa.configuration.server.Service; -import nu.marginalia.wmsa.renderer.mustache.MustacheRenderer; -import nu.marginalia.wmsa.renderer.mustache.RendererFactory; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import spark.Request; -import spark.Response; -import spark.Spark; - -import java.io.IOException; -import java.util.Map; - -public class EncyclopediaService extends 
Service { - - private static final Logger logger = LoggerFactory.getLogger(EncyclopediaService.class); - private final MustacheRenderer wikiErrorPageRenderer; - private final MustacheRenderer wikiSearchResultRenderer; - - private final EncyclopediaDao encyclopediaDao; - - @Inject - public EncyclopediaService(@Named("service-host") String ip, - @Named("service-port") Integer port, - EncyclopediaDao encyclopediaDao, - RendererFactory rendererFactory, - Initialization initialization, - MetricsServer metricsServer) - throws IOException { - - super(ip, port, initialization, metricsServer); - this.encyclopediaDao = encyclopediaDao; - - if (rendererFactory != null) { - wikiErrorPageRenderer = rendererFactory.renderer("encyclopedia/wiki-error"); - wikiSearchResultRenderer = rendererFactory.renderer("encyclopedia/wiki-search"); - } - else { - wikiErrorPageRenderer = null; - wikiSearchResultRenderer = null; - } - - Gson gson = GsonFactory.get(); - - Spark.get("/public/wiki/*", this::getWikiPage); - Spark.get("/public/wiki-search", this::searchWikiPage); - Spark.get("/encyclopedia/:term", (rq, rsp) -> encyclopediaDao.encyclopedia(rq.params("term")), gson::toJson); - - Spark.awaitInitialization(); - } - - @SneakyThrows - private Object getWikiPage(Request req, Response rsp) { - final String[] splats = req.splat(); - - if (splats.length == 0) - rsp.redirect("https://encyclopedia.marginalia.nu/wiki-start.html"); - - final String name = splats[0]; - - String pageName = encyclopediaDao.resolveEncylopediaRedirect(name).orElse(name); - logger.info("Resolved {} -> {}", name, pageName); - - if (!encyclopediaDao.getWikiArticleData(name, rsp.raw().getOutputStream())) { - return wikiErrorPageRenderer.render("https://en.wikipedia.org/wiki/" + name); - } - return ""; - } - - @SneakyThrows - private Object searchWikiPage(Request req, Response rsp) { - String term = req.queryParams("query"); - - if (null == term) { - rsp.redirect("https://encyclopedia.marginalia.nu/wiki-start.html"); - 
return ""; - } - - return wikiSearchResultRenderer.render( - Map.of("query", term, - "results", - encyclopediaDao.findEncyclopediaPages(term)) - ); - } - -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/client/MemexApiClient.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/client/MemexApiClient.java deleted file mode 100644 index b038637d..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/client/MemexApiClient.java +++ /dev/null @@ -1,14 +0,0 @@ -package nu.marginalia.wmsa.memex.client; - -import com.google.inject.Inject; -import nu.marginalia.wmsa.client.AbstractDynamicClient; -import nu.marginalia.wmsa.configuration.ServiceDescriptor; - - -public class MemexApiClient extends AbstractDynamicClient { - @Inject - public MemexApiClient() { - super(ServiceDescriptor.MEMEX); - } - -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/model/render/MemexRendererableDirect.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/model/render/MemexRendererableDirect.java deleted file mode 100644 index 571d5f56..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/model/render/MemexRendererableDirect.java +++ /dev/null @@ -1,7 +0,0 @@ -package nu.marginalia.wmsa.memex.model.render; - -import nu.marginalia.wmsa.memex.renderer.MemexHtmlRenderer; - -public interface MemexRendererableDirect { - String render(MemexHtmlRenderer renderer); -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/renderer/StatusRendererService.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/renderer/StatusRendererService.java deleted file mode 100644 index 3d2e3d4e..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/renderer/StatusRendererService.java +++ /dev/null @@ -1,81 +0,0 @@ -package nu.marginalia.wmsa.renderer; - -import com.google.inject.Inject; -import io.reactivex.rxjava3.schedulers.Schedulers; -import lombok.SneakyThrows; -import nu.marginalia.wmsa.configuration.ServiceDescriptor; 
-import nu.marginalia.wmsa.configuration.server.Context; -import nu.marginalia.wmsa.renderer.mustache.MustacheRenderer; -import nu.marginalia.wmsa.renderer.mustache.RendererFactory; -import nu.marginalia.wmsa.resource_store.ResourceStoreClient; -import nu.marginalia.wmsa.resource_store.model.RenderedResource; -import okhttp3.OkHttpClient; -import okhttp3.Request; - -import java.time.LocalDateTime; -import java.time.temporal.ChronoUnit; -import java.util.ArrayList; -import java.util.List; -import java.util.Map; -import java.util.concurrent.TimeUnit; - -public class StatusRendererService { - private final MustacheRenderer statusRenderer; - private final ResourceStoreClient resourceStoreClient; - - private final OkHttpClient client; - - private final RendererFactory rendererFactory = new RendererFactory(); - - @Inject - @SneakyThrows - public StatusRendererService(ResourceStoreClient resourceStoreClient) { - this.resourceStoreClient = resourceStoreClient; - - client = new OkHttpClient.Builder() - .connectTimeout(50, TimeUnit.MILLISECONDS) - .readTimeout(1, TimeUnit.SECONDS) - .retryOnConnectionFailure(false) - .followRedirects(false) - .build(); - statusRenderer = rendererFactory.renderer( "status/server-status"); - } - - public void start() { - Schedulers.io().schedulePeriodicallyDirect(this::renderStatusPage, 1, 60, TimeUnit.SECONDS); - } - public void renderStatusPage() { - try { - var status = getStatus(); - var page = statusRenderer.render(Map.of("status", status)); - resourceStoreClient - .putResource(Context.internal(), "status", - new RenderedResource("index.html", LocalDateTime.now().plus(2, ChronoUnit.MINUTES), page)) - .blockingSubscribe(); - } catch (Exception e) { - e.printStackTrace(); - } - } - - private List getStatus() { - List status = new ArrayList<>(ServiceDescriptor.values().length); - - for (ServiceDescriptor sd : ServiceDescriptor.values()) { - if (sd.port == 0) { - continue; - } - try { - var req = new Request.Builder().url("http://127.0.0.1:" 
+ sd.port + "/internal/ping").get().build(); - var call = client.newCall(req); - - call.execute().close(); - status.add(new ServerStatusModel(sd.name, "UP")); - - } catch (Exception e) { - status.add(new ServerStatusModel(sd.name, "DOWN")); - } - } - return status; - } - -} diff --git a/marginalia_nu/src/main/resources/data/smhi/stader.csv b/marginalia_nu/src/main/resources/data/smhi/stader.csv deleted file mode 100644 index 0dc649c4..00000000 --- a/marginalia_nu/src/main/resources/data/smhi/stader.csv +++ /dev/null @@ -1,134 +0,0 @@ -"Åkersberga",59.47944,18.29967 -"Alby",59.2335,17.8538 -"Alingsås",57.93033,12.53345 -"Ängelholm",56.2428,12.86219 -"Arboga",59.39387,15.83882 -"Årsta",59.2978,18.0514 -"Arvika",59.65528,12.58518 -"Avesta",60.14274,16.16295 -"Bålsta",59.5671,17.52781 -"Boden",65.82518,21.68864 -"Bollnäs",61.34817,16.39464 -"Boo",59.33333,18.28333 -"Borås",57.72101,12.9401 -"Borlänge",60.4858,15.43714 -"Bromma",59.34,17.94 -"Enköping",59.63607,17.07768 -"Eskilstuna",59.36661,16.5077 -"Eslöv",55.83928,13.30393 -"Fagersta",60.00418,15.79316 -"Falkenberg",56.90552,12.49118 -"Falköping",58.17347,13.55068 -"Falun",60.60357,15.62597 -"Finspång",58.70578,15.76739 -"Gävle",60.67452,17.14174 -"Gislaved",57.3044,13.54078 -"Göteborg",57.70716,11.96679 -"Hallstahammar",59.61395,16.22846 -"Halmstad",56.67446,12.85676 -"Handen",59.16809,18.13796 -"Haninge",59.16775,18.14478 -"Härnösand",62.63228,17.93794 -"Hässleholm",56.15905,13.76638 -"Helsingborg",56.04673,12.69437 -"Höganäs",56.19971,12.55795 -"Höllviken",55.40982,12.9558 -"Huddinge",59.23705,17.98192 -"Hudiksvall",61.72897,17.10358 -"Huskvarna",57.78596,14.30214 -"Jakobsberg",59.42268,17.83508 -"Jönköping",57.78145,14.15618 -"Kalmar",56.66157,16.36163 -"Karlshamn",56.1706,14.86188 -"Karlskoga",59.32667,14.52386 -"Karlskrona",56.16156,15.58661 -"Karlstad",59.3793,13.50357 -"Katrineholm",58.99587,16.20721 -"Kävlinge",55.79188,13.11021 -"Kinna",57.50728,12.69463 -"Kiruna",67.85572,20.22513 
-"Kista",59.40316,17.94479 -"Köping",59.51404,15.99255 -"Kristianstad",56.03129,14.15242 -"Kristinehamn",59.30978,14.10808 -"Kumla",59.1277,15.14341 -"Kungälv",57.87096,11.98054 -"Kungsbacka",57.48719,12.07612 -"Landskrona",55.8708,12.83016 -"Lerum",57.77051,12.26904 -"Lidingö",59.36667,18.13333 -"Lidköping",58.50517,13.15765 -"Lindome",57.56667,12.08333 -"Linköping",58.41086,15.62157 -"Ljungby",56.83324,13.94082 -"Ludvika",60.14959,15.18776 -"Luleå",65.58415,22.15465 -"Lund",55.70584,13.19321 -"Majorna",57.69195,11.91605 -"Malmö",55.60587,13.00073 -"Mariestad",58.70971,13.82367 -"Märsta",59.62157,17.85476 -"Mjölby",58.32595,15.12365 -"Mölndal",57.6554,12.01378 -"Mölnlycke",57.65893,12.11792 -"Mora",61.00704,14.54316 -"Motala",58.53706,15.03649 -"Nacka",59.31053,18.16372 -"Nässjö",57.65307,14.69676 -"Norrköping",58.59419,16.1826 -"Norrtälje",59.75799,18.70496 -"Nybro",56.74461,15.90714 -"Nyköping",58.753,17.00788 -"Nynäshamn",58.90337,17.94793 -"Onsala",57.42531,12.02903 -"Örebro",59.27412,15.2066 -"Örnsköldsvik",63.29091,18.71525 -"Oskarshamn",57.26455,16.44837 -"Östermalm",59.33879,18.08487 -"Östersund",63.1792,14.63566 -"Oxelösund",58.67057,17.10152 -"Partille",57.7395,12.10642 -"Piteå",65.31717,21.47944 -"Råsunda",59.36667,17.98333 -"Ronneby",56.20999,15.27602 -"Sala",59.91993,16.60655 -"Salem",59.20186,17.76646 -"Sandviken",60.61667,16.76667 -"Segeltorp",59.27597,17.93072 -"Skara",58.38659,13.43836 -"Skellefteå",64.75067,20.95279 -"Skoghall",59.32324,13.46552 -"Skövde",58.39118,13.84506 -"Söderhamn",61.30373,17.05921 -"Södertälje",59.19554,17.62525 -"Sollentuna",59.42804,17.95093 -"Solna",59.36004,18.00086 -"Staffanstorp",55.64277,13.20638 -"Stenungsund",58.07046,11.8181 -"Stockholm",59.33258,18.0649 -"Strängnäs",59.37741,17.03119 -"Sundbyberg",59.36128,17.97114 -"Sundsvall",62.39129,17.3063 -"Täby",59.4439,18.06872 -"Timrå",62.48703,17.3257 -"Torslanda",57.72432,11.77013 -"Tranås",58.03717,14.9782 -"Trelleborg",55.37514,13.15691 
-"Trollhättan",58.28365,12.28864 -"Tullinge",59.2,17.9 -"Tumba",59.19858,17.83317 -"Uddevalla",58.34784,11.9424 -"Umeå",63.82842,20.25972 -"Upplands Väsby",59.51839,17.91128 -"Uppsala",59.85882,17.63889 -"Vallentuna",59.53436,18.07758 -"Vänersborg",58.38075,12.3234 -"Varberg",57.10557,12.25078 -"Värnamo",57.18604,14.04001 -"Västerås",59.61617,16.55276 -"Västerhaninge",59.11667,18.1 -"Västervik",57.7584,16.63733 -"Växjö",56.87767,14.80906 -"Vetlanda",57.42887,15.07762 -"Visby",57.64089,18.29602 -"Ystad",55.42966,13.82041 diff --git a/marginalia_nu/src/main/resources/fonts/LM-regular.ttf b/marginalia_nu/src/main/resources/fonts/LM-regular.ttf deleted file mode 100644 index 6b4f6b8a..00000000 Binary files a/marginalia_nu/src/main/resources/fonts/LM-regular.ttf and /dev/null differ diff --git a/marginalia_nu/src/main/resources/fonts/STIXTwoMath-Regular.ttf b/marginalia_nu/src/main/resources/fonts/STIXTwoMath-Regular.ttf deleted file mode 100644 index a4705fc7..00000000 Binary files a/marginalia_nu/src/main/resources/fonts/STIXTwoMath-Regular.ttf and /dev/null differ diff --git a/marginalia_nu/src/main/resources/log4j2.properties b/marginalia_nu/src/main/resources/log4j2.properties deleted file mode 100644 index 647ca8e8..00000000 --- a/marginalia_nu/src/main/resources/log4j2.properties +++ /dev/null @@ -1,29 +0,0 @@ - - -log4j2.isThreadContextMapInheritable=true - -status = info - -appender.console.type = Console -appender.console.name = LogToConsole -appender.console.layout.type = PatternLayout -appender.console.layout.pattern = %highlight{%-5level}{FATAL=red, ERROR=red, WARN=yellow} %c{1}- %msg{nolookups}%n -appender.console.filter.http.type = MarkerFilter - -appender.rolling.type = RollingFile -appender.rolling.name = RollingFile -appender.rolling.fileName = /var/log/wmsa/wmsa-${main:1}-server.log -appender.rolling.filePattern = /var/log/wmsa/wmsa-${main:1}-server-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz -appender.rolling.layout.pattern = %-5level %d{yyyy-MM-dd 
HH:mm:ss,SSS} %-20t %-20c{1}: %msg{nolookups}%n -appender.rolling.layout.type = PatternLayout -appender.rolling.policies.type = Policies -appender.rolling.policies.size.type = SizeBasedTriggeringPolicy -appender.rolling.policies.size.size=10MB -appender.rolling.strategy.type = DefaultRolloverStrategy -appender.rolling.strategy.max = 10 - -rootLogger.level = info -rootLogger.appenderRef.console.ref = LogToConsole -rootLogger.appenderRef.rolling.ref = RollingFile - -#rootLogger.appenderRef.http.ref = LogHttpTraffic diff --git a/marginalia_nu/src/main/resources/static/dating/index.html b/marginalia_nu/src/main/resources/static/dating/index.html deleted file mode 100644 index 8626e8f8..00000000 --- a/marginalia_nu/src/main/resources/static/dating/index.html +++ /dev/null @@ -1,71 +0,0 @@ - - - - - Website Explorer - - - - -

Website Explorer

-

- This is a game where you explore more or less random and obscure websites around the Internet, based on the - database of the Marginalia Search Engine. -

-

Instructions

-

Press the thumbnail to visit the website.

-

Press ➡️ to view the next website.

-

Press 🤩 to look for websites similar to the website you are seeing.

-

Press 🔀 to return to the default flavor of websites.

-

Cookie Consent

-

- The game uses a session cookie to keep track of which websites you have been shown so that - you do not see the same websites too often, and which websites you would like to see more of. -

-

- Consent To The Cookie And Begin -

-

About

-

- These websites are not manually curated. Most of them are clean, but if you do happen to see something particularly - objectionable, please let me know by sending me an email. kontakt@marginalia.nu -

-

- A less principled person would probably have plastered the page in ads, as it's a game basically revolving around - refreshing the same page over and over. Instead I invite you to consider supporting me - if you enjoy the game. -

- - diff --git a/marginalia_nu/src/main/resources/static/dating/robots.txt b/marginalia_nu/src/main/resources/static/dating/robots.txt deleted file mode 100644 index 5199c74f..00000000 --- a/marginalia_nu/src/main/resources/static/dating/robots.txt +++ /dev/null @@ -1,4 +0,0 @@ -User-agent: * -Disallow: /init -Disallow: /random -Disallow: /similar diff --git a/marginalia_nu/src/main/resources/static/edge/index.html b/marginalia_nu/src/main/resources/static/edge/index.html deleted file mode 100644 index c27c6efc..00000000 --- a/marginalia_nu/src/main/resources/static/edge/index.html +++ /dev/null @@ -1,164 +0,0 @@ - - - - - Marginalia Search - - - - - - - - - - - - - -
- -
- -
-
- -
- -
-
-

About

-
-

This is an independent DIY search engine that focuses on non-commercial content, and attempts to - show you sites you perhaps weren't aware of in favor of the sort of sites you probably already knew - existed.

-

- The software for this search engine is all custom-built, and all crawling and indexing is - done in-house. The project is open source. Feel free to poke about in the source code or contribute - to the development! -

-

Consider supporting the - project!

-
-
- Read More -
-
- -
-

Tips

-
-

- This search engine isn't particularly well equipped to answer queries - posed like questions; instead, try to imagine some text that might appear - in the website you are looking for, and search for that.

-

- Where this search engine really shines is finding small, old and obscure websites about some - given topic, perhaps - old video games, - a mystery, - theology, - the occult, - knitting, - computer science, - or art. -

- -
- -
- - -
-

Updates

-
-

☛ A recipe filter has been added to the algorithm selector.

-

☛ The Random Mode has been overhauled, and is - quite entertaining. I encourage you to give it a spin.

-

☛ A simple public API is now available.

-
- -
- -
-

Publicity, Discussion and Events

-
-
-
Kritik an Googles Suche - Platzhirsch auf dem Nebenschauplatz
-
Deutschlandfunk Kultur 🇩🇪, 2022-08-18
-
Marginalia Goes Open Source
-
Hacker News, 2022-05-28
-
You Should Check Out the Indie Web 🎞️
-
YouTube, You've Got Kat, 2022-03-15
-
- What Google Search Isn't Showing You -
-
The New Yorker 🎩, 2022-03-10
-
- Marginalia Search - Serendipity Engineering -
-
MetaFilter, 2022-03-09
-
- 🎂 First anniversary! 🎊 -
-
- 2022-02-26 -
-
- A Search Engine Designed To Surprise You -
-
Clive Thompson OneZero, 2021-09-16
-
- A search engine that favors text-heavy sites and punishes modern web design -
-
- Hacker News, 2021-09-16 -
-
-
-
-
-
- -
- This website complies with the GDPR by not collecting any personal - information, and with the EU Cookie Directive by not using - cookies. More Information. -

- Reach me at kontakt@marginalia.nu. -

- - \ No newline at end of file diff --git a/marginalia_nu/src/main/resources/static/encyclopedia/index.html b/marginalia_nu/src/main/resources/static/encyclopedia/index.html deleted file mode 100644 index 1b3f81ed..00000000 --- a/marginalia_nu/src/main/resources/static/encyclopedia/index.html +++ /dev/null @@ -1,24 +0,0 @@ - - - - - Marginalia Encyclopedia - - - - - - - - \ No newline at end of file diff --git a/marginalia_nu/src/main/resources/static/encyclopedia/robots.txt b/marginalia_nu/src/main/resources/static/encyclopedia/robots.txt deleted file mode 100644 index 77470cb3..00000000 --- a/marginalia_nu/src/main/resources/static/encyclopedia/robots.txt +++ /dev/null @@ -1,2 +0,0 @@ -User-agent: * -Disallow: / \ No newline at end of file diff --git a/marginalia_nu/src/main/resources/static/encyclopedia/wiki-clean.html b/marginalia_nu/src/main/resources/static/encyclopedia/wiki-clean.html deleted file mode 100644 index cd928003..00000000 --- a/marginalia_nu/src/main/resources/static/encyclopedia/wiki-clean.html +++ /dev/null @@ -1,71 +0,0 @@ - - - - - Marginalia Search - About: Easy Read Wikipedia - - - - - -
- -
-
-

About: High Readability Encyclopedia

-
-

- This is an encyclopedia based on Wikipedia's database, that strips away most links and - almost all visual clutter to provide a more book-like reading experience with fewer - distractions. -

-

- This is primarily a helpful utility for a search engine focusing on similarly text-oriented - websites. -

-

- You are welcome to use it for general article reading as well. This may be useful - if you are on a low bandwidth connection, since the download size is typically reduced - from megabytes to dozens of kilobytes. -

-

- What's taken away are all the design elements that your brain would have to filter out - to read the text of the article. It seems as though overburdening this mental process - causes the reader to start scanning the text instead of reading it, which is experienced - as an inability to focus. -

-

- The cleaning process is not perfect and will occasionally produce strange results, - but significant problems should be relatively rare. -

- About the Search Engine - -

Limitations

-

This is a "stale" copy of wikipedia, based on an archived copy from January 2021. On the - other hand, we used to abide printed encyclopedias that didn't update at all.

-

- Be aware that the cleaning strips away a lot of information, including most references, - footnotes, quality warnings, and so forth. Refer to the original wikipedia article for - that information. -

-
-

Legal

-
- The original Wikipedia text is available under the Creative Commons Attribution-ShareAlike 3.0 license, - and so is the wikipedia text forwarded to you through this service. -
-
-

Further reading

-
Blom et al. 2017 - Comprehension and navigation of networked hypertexts
-
https://onlinelibrary.wiley.com/doi/pdf/10.1111/jcal.12243
-
-

Have something to say?

-
-

Send me an e-mail at kontakt@marginalia.nu. -

-
-
- \ No newline at end of file diff --git a/marginalia_nu/src/main/resources/static/encyclopedia/wiki-start.html b/marginalia_nu/src/main/resources/static/encyclopedia/wiki-start.html deleted file mode 100644 index f0f86947..00000000 --- a/marginalia_nu/src/main/resources/static/encyclopedia/wiki-start.html +++ /dev/null @@ -1,26 +0,0 @@ - - - - - Marginalia Search - Easy Read Wikipedia Search - - - - - -
- -
-
-

Search the Encyclopedia

- -
- \ No newline at end of file diff --git a/marginalia_nu/src/main/resources/static/explore/style.css b/marginalia_nu/src/main/resources/static/explore/style.css deleted file mode 100644 index b7af8d17..00000000 --- a/marginalia_nu/src/main/resources/static/explore/style.css +++ /dev/null @@ -1,20 +0,0 @@ -body { - max-width: 80ch; - margin: auto; - font-size: 14pt; - font-family: sans-serif; - color: #222; - line-height: 1.5; -} -th { text-align: left; } -input { font-family: monospace; font-size: 14pt; } -input[type="text"] { width: 50%; } -table { width: 100%; font-size: 14pt; } - -a.external { - color: darkcyan !important; -} -a.external:before { - content: '\01F30E'; - padding-right: .25ch; -} \ No newline at end of file diff --git a/marginalia_nu/src/main/resources/static/fonts/LM-bold-italic.ttf b/marginalia_nu/src/main/resources/static/fonts/LM-bold-italic.ttf deleted file mode 100644 index 7b684ee6..00000000 Binary files a/marginalia_nu/src/main/resources/static/fonts/LM-bold-italic.ttf and /dev/null differ diff --git a/marginalia_nu/src/main/resources/static/fonts/LM-bold-italic.woff b/marginalia_nu/src/main/resources/static/fonts/LM-bold-italic.woff deleted file mode 100644 index d54af371..00000000 Binary files a/marginalia_nu/src/main/resources/static/fonts/LM-bold-italic.woff and /dev/null differ diff --git a/marginalia_nu/src/main/resources/static/fonts/LM-bold-italic.woff2 b/marginalia_nu/src/main/resources/static/fonts/LM-bold-italic.woff2 deleted file mode 100644 index 078ce292..00000000 Binary files a/marginalia_nu/src/main/resources/static/fonts/LM-bold-italic.woff2 and /dev/null differ diff --git a/marginalia_nu/src/main/resources/static/fonts/LM-bold.ttf b/marginalia_nu/src/main/resources/static/fonts/LM-bold.ttf deleted file mode 100644 index 17624d45..00000000 Binary files a/marginalia_nu/src/main/resources/static/fonts/LM-bold.ttf and /dev/null differ diff --git a/marginalia_nu/src/main/resources/static/fonts/LM-bold.woff 
b/marginalia_nu/src/main/resources/static/fonts/LM-bold.woff deleted file mode 100644 index 318a3ad2..00000000 Binary files a/marginalia_nu/src/main/resources/static/fonts/LM-bold.woff and /dev/null differ diff --git a/marginalia_nu/src/main/resources/static/fonts/LM-bold.woff2 b/marginalia_nu/src/main/resources/static/fonts/LM-bold.woff2 deleted file mode 100644 index c14c6204..00000000 Binary files a/marginalia_nu/src/main/resources/static/fonts/LM-bold.woff2 and /dev/null differ diff --git a/marginalia_nu/src/main/resources/static/fonts/LM-italic.ttf b/marginalia_nu/src/main/resources/static/fonts/LM-italic.ttf deleted file mode 100644 index b9a57b87..00000000 Binary files a/marginalia_nu/src/main/resources/static/fonts/LM-italic.ttf and /dev/null differ diff --git a/marginalia_nu/src/main/resources/static/fonts/LM-italic.woff b/marginalia_nu/src/main/resources/static/fonts/LM-italic.woff deleted file mode 100644 index fafb147e..00000000 Binary files a/marginalia_nu/src/main/resources/static/fonts/LM-italic.woff and /dev/null differ diff --git a/marginalia_nu/src/main/resources/static/fonts/LM-italic.woff2 b/marginalia_nu/src/main/resources/static/fonts/LM-italic.woff2 deleted file mode 100644 index 166d6e60..00000000 Binary files a/marginalia_nu/src/main/resources/static/fonts/LM-italic.woff2 and /dev/null differ diff --git a/marginalia_nu/src/main/resources/static/fonts/LM-regular.ttf b/marginalia_nu/src/main/resources/static/fonts/LM-regular.ttf deleted file mode 100644 index 6b4f6b8a..00000000 Binary files a/marginalia_nu/src/main/resources/static/fonts/LM-regular.ttf and /dev/null differ diff --git a/marginalia_nu/src/main/resources/static/fonts/LM-regular.woff b/marginalia_nu/src/main/resources/static/fonts/LM-regular.woff deleted file mode 100644 index eb9fec0a..00000000 Binary files a/marginalia_nu/src/main/resources/static/fonts/LM-regular.woff and /dev/null differ diff --git a/marginalia_nu/src/main/resources/static/fonts/LM-regular.woff2 
b/marginalia_nu/src/main/resources/static/fonts/LM-regular.woff2 deleted file mode 100644 index 869279ac..00000000 Binary files a/marginalia_nu/src/main/resources/static/fonts/LM-regular.woff2 and /dev/null differ diff --git a/marginalia_nu/src/main/resources/static/smhi/favicon.ico b/marginalia_nu/src/main/resources/static/smhi/favicon.ico deleted file mode 100644 index a1136a7f..00000000 Binary files a/marginalia_nu/src/main/resources/static/smhi/favicon.ico and /dev/null differ diff --git a/marginalia_nu/src/main/resources/static/smhi/font.css b/marginalia_nu/src/main/resources/static/smhi/font.css deleted file mode 100644 index 29142ae7..00000000 --- a/marginalia_nu/src/main/resources/static/smhi/font.css +++ /dev/null @@ -1,50 +0,0 @@ -/* LÅNAD KOD */ - -/*! - * LaTeX.css (https://latex.now.sh/) - * - * Source: https://github.com/vincentdoerig/latex-css - * Licensed under MIT (https://github.com/vincentdoerig/latex-css/blob/master/LICENSE) -*/ - -@font-face { - font-family: 'Latin Modern'; - font-style: normal; - font-weight: normal; - font-display: swap; - src: url('https://www.marginalia.nu/fonts/LM-regular.woff2') format('woff2'), - url('https://www.marginalia.nu/fonts/LM-regular.woff') format('woff'), - url('https://www.marginalia.nu/fonts/LM-regular.ttf') format('truetype'); -} - -@font-face { - font-family: 'Latin Modern'; - font-style: italic; - font-weight: normal; - font-display: swap; - src: url('https://www.marginalia.nu/fonts/LM-italic.woff2') format('woff2'), - url('https://www.marginalia.nu/fonts/LM-italic.woff') format('woff'), - url('https://www.marginalia.nu/fonts/LM-italic.ttf') format('truetype'); -} - -@font-face { - font-family: 'Latin Modern'; - font-style: normal; - font-weight: bold; - font-display: swap; - src: url('https://www.marginalia.nu/fonts/LM-bold.woff2') format('woff2'), - url('https://www.marginalia.nu/fonts/LM-bold.woff') format('woff'), - url('https://www.marginalia.nu/fonts/LM-bold.ttf') format('truetype'); -} - 
-@font-face { - font-family: 'Latin Modern'; - font-style: italic; - font-weight: bold; - font-display: swap; - src: url('https://www.marginalia.nu/fonts/LM-bold-italic.woff2') format('woff2'), - url('https://www.marginalia.nu/fonts/LM-bold-italic.woff') format('woff'), - url('https://www.marginalia.nu/fonts/LM-bold-italic.ttf') format('truetype'); -} - -/* SLUT PÅ LÅN AV KOD */ \ No newline at end of file diff --git a/marginalia_nu/src/main/resources/static/smhi/responsive.css b/marginalia_nu/src/main/resources/static/smhi/responsive.css deleted file mode 100644 index 3f3ebc01..00000000 --- a/marginalia_nu/src/main/resources/static/smhi/responsive.css +++ /dev/null @@ -1,74 +0,0 @@ -/** Anpassningar för tryck, osv. */ -.mobile-only { - display: none; -} - -@media only print { - .onlyprint { - display: block; - } - .onlyscreen { - display: none !important; - } - body { - font-family: 'Liberation', 'Times', Serif !important; - } - header { - display: none; - } - a { - color: #000 !important; - text-decoration: none; - } - figure, blockquote, p, section#footnotes { - page-break-inside: avoid; - } - abbr { - text-decoration: none; - } -} -@media only screen { - .onlyprint { - display: none; - } - .onlyscreen { - display: block; - } -} -@media only screen and (max-device-width: 480px) { - article { - margin: -0.5em !important; - padding: 0.5em !important. 
- display: block; - font-size: 10pt; - } - .title { - padding-left: 0.5em; - padding-right: 0.5em; - } - - body.essä article p::before { - display: none; - } - .mobile-only { - display: auto; - } -} - -@media only screen and (max-device-width: 480px) { - blockquote { - padding: 0em; - border: none; - } -} - -@media only screen and (min-device-width: 640px) { - header, footer, article { - margin-left: 8em; - margin-right: 8em; - } - body,blockquote { - font-size: 14pt; - } -} - diff --git a/marginalia_nu/src/main/resources/static/smhi/style.css b/marginalia_nu/src/main/resources/static/smhi/style.css deleted file mode 100644 index 126019a9..00000000 --- a/marginalia_nu/src/main/resources/static/smhi/style.css +++ /dev/null @@ -1,192 +0,0 @@ -.smhi-snabbhopp { - font-size: 18pt; - font-family: 'Latin Modern', 'Liberation', 'Times', Serif !important; - font-weight: bold; - text-decoration: none; - margin: 1ch; -} - -.smhi-snabbhoplista { - display: flex; - flex-wrap: wrap; -} - -div:target .smhi-platslank { - font-weight: bold; -} -@media only screen and (min-device-width: 1024px) { - .smhi-snabbhoplista { - display: none !important; - } -} -@media only screen and (max-device-width: 480px) { - - .smhi-snabbhopp { - font-family: 'Times', Serif !important; - padding: 1ch !important; - margin: 1ch !important; - } - - .smhi-platslank { - font-size: 14pt; - padding-bottom: 2ch !important; - } - -} - - - -/** MARGINALER */ - -body { - background-color: #f8f8ee; - font-family: 'Tahoma', sans-serif; - text-rendering: optimizeLegibility; - margin: 0 auto; - max-width: 80ch; - -} -article { - background-color: #f8f8ee; - margin: -2em; - padding: 2em; -} - -header, footer, article { - margin-left: 4em; - margin-right: 4em; - display: block; -} - -a { - color: #274fa5; -} -a.replyButton, a:visited.replyButton { - color: #274fa5; - float: right; - text-decoration: none; -} -.headline { - color: #a5274f; -} - -details { - border-left: 1px solid #ccc; - font-size: 12pt; -} - -/** 
HEADER */ -header { - padding-bottom: 0.5em; - margin-bottom: 1em; -} - -header a { - text-decoration: none; -} - -a:visited { - color: #14114f; -} - -article { - -webkit-hyphens: auto; - -moz-hyphens: auto; - -ms-hyphens: auto; - -o-hyphens: auto; - line-height: 1.6; -} - -h1, h2, h3 { - font-family: 'Garamond', 'Palatino', serif; - font-weight: normal; - text-align: left; - color: #342a00; - margin-left: -.5em; -} - -.title { - text-align: center; - font-family: 'Garamond', 'Palatino', serif; - color: #342a00; -} -.title h1, .title h2, .title h3 { - font-family: 'Garamond', 'Palatino', serif; - font-weight: normal; - text-align: center; - color: #342a00; - padding-left: 2em; - padding-right: 2em; - border: none !important; - margin-left: inherit; -} - - -h1 { - border-bottom: 3px double #14114f; -} - - -h2, h3 { - border-bottom: 1px solid #14114f; -} - -.noline { - border-bottom: none !important; -} - -dt a, dd a { - text-align: left !important; -} - -section#footnotes { - font-size: 10pt; -} - -/** FOOTER */ - -footer { - padding-top: 0.5em; - margin-top: 3em; - color: #444; - line-height: 2; - text-align: center; -} - -footer section#signatur.special { - display: none; -} - -/** CITAT */ - -q { - font-family: 'Latin Modern', 'Garamond', serif; - color: #444; -} -blockquote { - color: #444; - font-family: 'Latin Modern', 'Garamond', serif; - font-size: 12pt; -} - -blockquote.verse { - white-space: pre; - font-size: 10pt; - line-height: 1.2; - padding: 1em; - margin-left: -1em; - margin-right: -1em; - overflow: auto; -} -cite { - text-align: center; - display: block; -} - -.teknisk { - font-family: 'fixedspace', monospace; -} -.deemph { - color: #886; - font-family: 'fixedspace', monospace; -} diff --git a/marginalia_nu/src/main/resources/static/style.css b/marginalia_nu/src/main/resources/static/style.css deleted file mode 100644 index 87d93408..00000000 --- a/marginalia_nu/src/main/resources/static/style.css +++ /dev/null @@ -1,246 +0,0 @@ -/** MARGINALER */ - 
-body { - background-color: #f8f8ee; - font-family: 'Tahoma', sans-serif; - text-rendering: optimizeLegibility; - margin: 0 auto; - max-width: 80ch; - -} - -article { - background-color: #f8f8ee; - margin: -2em; - padding: 2em; -} - -body.utkast { - background: url('/images/utkast-bg.webp') repeat-y left top; -} - -header, footer, article { - margin-left: 4em; - margin-right: 4em; - display: block; -} - -a { - color: #274fa5; -} -a.replyButton, a:visited.replyButton { - color: #274fa5; - float: right; - text-decoration: none; -} -.headline { - color: #a5274f; -} - -details { - border-left: 1px solid #ccc; - font-size: 12pt; -} - -/** HEADER */ -header { - padding-bottom: 0.5em; - margin-bottom: 1em; -} - -header a { - text-decoration: none; -} - -a:visited { - color: #14114f; -} - -/** ARTIKEL */ - -article { - text-align: justify; - -webkit-hyphens: auto; - -moz-hyphens: auto; - -ms-hyphens: auto; - -o-hyphens: auto; - line-height: 1.6; -} - -h1, h2, h3 { - font-family: 'Garamond', 'Palatino', serif; - font-weight: normal; - text-align: left; - color: #342a00; - margin-left: -.5em; -} - -.title { - text-align: center; - font-family: 'Garamond', 'Palatino', serif; - color: #342a00; -} -.title h1, .title h2, .title h3 { - font-family: 'Garamond', 'Palatino', serif; - font-weight: normal; - text-align: center; - color: #342a00; - padding-left: 2em; - padding-right: 2em; - border: none !important; - margin-left: inherit; -} - - -h1 { - border-bottom: 3px double #14114f; -} - - -h2, h3 { - border-bottom: 1px solid #14114f; -} - -.noline { - border-bottom: none !important; -} - -dt a, dd a { - text-align: left !important; -} - -section#footnotes { - font-size: 10pt; -} - -/** FOOTER */ - -footer { - padding-top: 0.5em; - margin-top: 3em; - color: #444; - line-height: 2; - text-align: center; -} - -footer section#signatur.special { - display: none; -} - -/* monogram */ -footer img { - text-align: center; - display: block; - margin-left: auto; - margin-right: auto; - 
margin-top: 4em; - width: 50%; - height: 50%; - opacity: 0.5; -} - - -/** CITAT */ - -q { - font-family: 'Latin Modern', 'Garamond', serif; - color: #444; -} -blockquote { - color: #444; - font-family: 'Latin Modern', 'Garamond', serif; - font-size: 12pt; -} - -blockquote.verse { - white-space: pre; - font-size: 10pt; - line-height: 1.2; - padding: 1em; - margin-left: -1em; - margin-right: -1em; - overflow: auto; -} -cite { - text-align: center; - display: block; -} - -.teknisk { - font-family: 'fixedspace', monospace; -} -.deemph { - color: #886; - font-family: 'fixedspace', monospace; -} - -/** Anpassningar för tryck, osv. */ - -@media only print { - .onlyprint { - display: block; - } - .onlyscreen { - display: none; - } - body { - font-family: 'Liberation', 'Times', Serif !important; - } - header { - display: none; - } - a { - color: #000 !important; - text-decoration: none; - } - figure, blockquote, p, section#footnotes { - page-break-inside: avoid; - } - abbr { - text-decoration: none; - } -} -@media only screen { - .onlyprint { - display: none; - } - .onlyscreen { - display: block; - } -} -@media only screen and (max-device-width: 480px) { - nav a { - display: block; - margin-bottom: 1.5em; - margin-top: 1.5em; - } - - header, footer, article { - margin-left: 2em; - margin-right: 2em; - display: block; - } - - body.essä article p::before { - display: none; - } -} - -@media only screen and (max-device-width: 480px) { - blockquote { - padding: 0em; - border: none; - } -} - -@media only screen and (min-device-width: 640px) { - header, footer, article { - margin-left: 8em; - margin-right: 8em; - } - body,blockquote,blockquote.verse { - font-size: 14pt; - } -} - - diff --git a/marginalia_nu/src/main/resources/templates/encyclopedia/wiki-error.hdb b/marginalia_nu/src/main/resources/templates/encyclopedia/wiki-error.hdb deleted file mode 100644 index 37376677..00000000 --- a/marginalia_nu/src/main/resources/templates/encyclopedia/wiki-error.hdb +++ /dev/null @@ 
-1,28 +0,0 @@ - - - - - Error - - - - - -
- -
-
-

An error has occurred!

-

- Either the page you attempted to access does not exist, - or the automatic cleaning process has failed. -

-

- Please use this link as a back-up:
- {{.}} -

-
- \ No newline at end of file diff --git a/marginalia_nu/src/main/resources/templates/encyclopedia/wiki-search.hdb b/marginalia_nu/src/main/resources/templates/encyclopedia/wiki-search.hdb deleted file mode 100644 index bc5d5a12..00000000 --- a/marginalia_nu/src/main/resources/templates/encyclopedia/wiki-search.hdb +++ /dev/null @@ -1,35 +0,0 @@ - - - - - Encyclopedia Search: {{query}} - - - - - -
- -
-
-

Search the Encyclopedia

- -

Search results

- {{#if error}} -
Failed to find exact article match
- {{/if}} -
- {{#each results}} -
{{name}}
- {{#if refName}}
{{refName}}
{{/if}} - {{/each}} -
-
- \ No newline at end of file diff --git a/marginalia_nu/src/main/resources/templates/smhi/index.hdb b/marginalia_nu/src/main/resources/templates/smhi/index.hdb deleted file mode 100644 index 556d8e7e..00000000 --- a/marginalia_nu/src/main/resources/templates/smhi/index.hdb +++ /dev/null @@ -1,44 +0,0 @@ - - - - {{title}} - - - - - - - - -
- -
- - - - diff --git a/marginalia_nu/src/main/resources/templates/smhi/prognos.hdb b/marginalia_nu/src/main/resources/templates/smhi/prognos.hdb deleted file mode 100644 index 274a6fce..00000000 --- a/marginalia_nu/src/main/resources/templates/smhi/prognos.hdb +++ /dev/null @@ -1,73 +0,0 @@ - - - - Väderprognos för {{plats.namn}} - - - - - - - - - - -
- -
-
-

{{plats.namn}}

- - {{#each dygn}} - - - - - - - - - - - {{#each data}} - - - - - - - - {{/each}} - - {{/each}} -
{{date}}{{{veckodag}}}
TidTempVindNeder.Moln
{{time}}{{temp}}{{vind}} ({{byvind}}){{nederbord}} {{nederbordTyp}}{{moln}}
- - -

Förklaring

-

Molntäcke (Moln.) visas på en skala 0-8, där höga värden indikerar - tjockt molntäcke, och låga värden indikerar blåare skyar.

- -

Nederbörd (Neder.) indikeras med förkortningar: - - - - - - - -
SSnö
SBSnöblandat regn
RRegn
DDimma
UKRUnderkylt regn
UKDUnderkyld dimma
-

- -

Källa SMHI

-

- All prognosdata hämtas från SMHI:s öppna API:er, under licensen - Creative Commons Erkännande 2.5. - Bäst före {{bastFore}}. -

-
- - - diff --git a/marginalia_nu/src/main/resources/templates/status/server-status.hdb b/marginalia_nu/src/main/resources/templates/status/server-status.hdb deleted file mode 100644 index e8217101..00000000 --- a/marginalia_nu/src/main/resources/templates/status/server-status.hdb +++ /dev/null @@ -1,25 +0,0 @@ - - - - Server Status - - - - - -
- -
-
-

Server Status

- - {{#each status}} -

- {{server}} - {{status}} -

- {{/each}} - -
- - - diff --git a/marginalia_nu/src/test/java/nu/marginalia/util/SeekDictionaryTest.java b/marginalia_nu/src/test/java/nu/marginalia/util/SeekDictionaryTest.java deleted file mode 100644 index 1987da6f..00000000 --- a/marginalia_nu/src/test/java/nu/marginalia/util/SeekDictionaryTest.java +++ /dev/null @@ -1,31 +0,0 @@ -package nu.marginalia.util; - -import org.junit.jupiter.api.Assertions; -import org.junit.jupiter.api.Test; - -class SeekDictionaryTest { - - @Test - public void testSeek() { - var dict = SeekDictionary.of((int[] x) -> x.length); - - for (int i = 0; i < 10000;) { - int j = (int)(1 + 9 * Math.random()); - int[] block = new int[j]; - for (int k = 0; k < j; k++) { - block[k] = i+k; - } - dict.add(block); - i+=j; - } - - o: for (int i = 0; i < 10000; i++) { - int[] vals = dict.bankForOffset(i); - for (var v : vals) { - if (v == i) continue o; - } - Assertions.fail("Could not find " + i); - } - } - -} \ No newline at end of file diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/configuration/HostsFileTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/configuration/HostsFileTest.java deleted file mode 100644 index cd6c24f1..00000000 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/configuration/HostsFileTest.java +++ /dev/null @@ -1,69 +0,0 @@ -package nu.marginalia.wmsa.configuration; - -import org.junit.jupiter.api.Test; -import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.Assertions; -import org.junit.jupiter.api.BeforeEach; - -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; - -import static org.junit.jupiter.api.Assertions.assertThrows; - -class HostsFileTest { - Path tempFile; - - @BeforeEach - public void setUp() throws IOException { - tempFile = Files.createTempFile(getClass().getSimpleName(), ".tmp"); - } - - @AfterEach - public void tearDown() throws IOException { - tempFile = Files.createTempFile(getClass().getSimpleName(), ".tmp"); - } - - @Test - public void 
testParseSunnyDay() throws IOException { - Files.writeString(tempFile, """ - # Comment - edge-index 192.168.0.1 - edge-search 192.168.1.1 - - auth 127.0.0.55 - - - """); - var hf = new HostsFile(tempFile); - - Assertions.assertEquals("192.168.0.1", hf.getHost(ServiceDescriptor.EDGE_INDEX)); - } - - @Test - public void testTooLong() throws IOException { - Files.writeString(tempFile, """ - edge-index 192.168.0.1 this is where my homie lives - """); - - assertThrows(IllegalArgumentException.class, () -> new HostsFile(tempFile)); - } - - @Test - public void testTooShort() throws IOException { - Files.writeString(tempFile, """ - edge-index - """); - - assertThrows(IllegalArgumentException.class, () -> new HostsFile(tempFile)); - } - - @Test - public void testBadName() throws IOException { - Files.writeString(tempFile, """ - garum-factory 127.0.0.1 - """); - - new HostsFile(tempFile); - } -} \ No newline at end of file diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/assistant/suggest/SuggestionsTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/assistant/suggest/SuggestionsTest.java deleted file mode 100644 index 7184c8b9..00000000 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/assistant/suggest/SuggestionsTest.java +++ /dev/null @@ -1,43 +0,0 @@ -package nu.marginalia.wmsa.edge.assistant.suggest; - -import nu.marginalia.util.TestLanguageModels; -import nu.marginalia.util.language.conf.LanguageModels; -import nu.marginalia.wmsa.edge.assistant.dict.SpellChecker; -import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict; -import org.junit.jupiter.api.BeforeAll; -import org.junit.jupiter.api.Disabled; -import org.junit.jupiter.api.Test; - -import java.nio.file.Path; -import java.util.List; - -class SuggestionsTest { - private static Suggestions suggestions; - - @BeforeAll - public static void setUp() { - LanguageModels lm = TestLanguageModels.getLanguageModels(); - suggestions = new 
Suggestions(Path.of("/home/vlofgren/Work/sql-titles-clean"), - new SpellChecker(), new TermFrequencyDict(lm)); - } - - @Test - @Disabled - void getSuggestions() { - System.out.println(tryGetSuggestions("neop")); - System.out.println(tryGetSuggestions("neopla")); - System.out.println(tryGetSuggestions("middle p")); - System.out.println(tryGetSuggestions("new public mana")); - System.out.println(tryGetSuggestions("euse")); - } - - List tryGetSuggestions(String s) { - long start = System.currentTimeMillis(); - try { - return suggestions.getSuggestions(10, s); - } - finally { - System.out.println(System.currentTimeMillis() - start); - } - } -} \ No newline at end of file diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/util/PrimeUtilTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/util/PrimeUtilTest.java deleted file mode 100644 index 703ed8cd..00000000 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/util/PrimeUtilTest.java +++ /dev/null @@ -1,31 +0,0 @@ -package nu.marginalia.wmsa.edge.index.service.util; - -import nu.marginalia.util.PrimeUtil; -import org.junit.jupiter.api.Test; - -import static org.junit.jupiter.api.Assertions.*; - -class PrimeUtilTest { - - @Test - void isPrime() { - assertTrue(PrimeUtil.isPrime(1)); - assertTrue(PrimeUtil.isPrime(2)); - assertTrue(PrimeUtil.isPrime(3)); - assertFalse(PrimeUtil.isPrime(4)); - assertTrue(PrimeUtil.isPrime(5)); - assertFalse(PrimeUtil.isPrime(6)); - assertTrue(PrimeUtil.isPrime(7)); - assertFalse(PrimeUtil.isPrime(8)); - assertFalse(PrimeUtil.isPrime(9)); - assertFalse(PrimeUtil.isPrime(10)); - assertTrue(PrimeUtil.isPrime(11)); - } - - @Test - void nextPrime() { - System.out.println(PrimeUtil.nextPrime(1L<<31, -1)); - System.out.println(PrimeUtil.nextPrime(1L<<31, 1)); - - } -} \ No newline at end of file diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/integration/arxiv/ArxivParserTest.java 
b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/integration/arxiv/ArxivParserTest.java deleted file mode 100644 index e6e146ee..00000000 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/integration/arxiv/ArxivParserTest.java +++ /dev/null @@ -1,41 +0,0 @@ -package nu.marginalia.wmsa.edge.integration.arxiv; - -import nu.marginalia.util.TestLanguageModels; -import nu.marginalia.util.language.conf.LanguageModels; -import nu.marginalia.util.language.processing.DocumentKeywordExtractor; -import nu.marginalia.util.language.processing.sentence.SentenceExtractor; -import nu.marginalia.util.language.processing.model.KeywordMetadata; -import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict; -import nu.marginalia.wmsa.edge.integration.arxiv.model.ArxivMetadata; -import org.junit.jupiter.api.Disabled; -import org.junit.jupiter.api.Test; - -import java.io.File; -import java.io.IOException; - -@Disabled // this isn't used and the test is hella slow -class ArxivParserTest { - final LanguageModels lm = TestLanguageModels.getLanguageModels(); - - @Test - void parse() throws IOException { - var parser = new ArxivParser(); - var data = parser.parse(new File("/home/vlofgren/Work/arxiv/arxiv-metadata-oai-snapshot.json")); - - data.stream().map(ArxivMetadata::getAbstract).limit(100).forEach(System.out::println); - } - - @Test - void extractKeywords() throws IOException { - var dict = new TermFrequencyDict(lm); - - DocumentKeywordExtractor documentKeywordExtractor = new DocumentKeywordExtractor(dict); - - var parser = new ArxivParser(); - var data = parser.parse(new File("/home/vlofgren/Work/arxiv/arxiv-metadata-oai-snapshot.json")); - - var se = new SentenceExtractor(lm); - - data.stream().map(meta -> documentKeywordExtractor.extractKeywords(se.extractSentences(meta.getAbstract(), meta.getTitle()), new KeywordMetadata())).limit(100).forEach(System.out::println); - } -} \ No newline at end of file diff --git 
a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/integration/stackoverflow/StackOverflowPostsTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/integration/stackoverflow/StackOverflowPostsTest.java deleted file mode 100644 index 6971d9b7..00000000 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/integration/stackoverflow/StackOverflowPostsTest.java +++ /dev/null @@ -1,47 +0,0 @@ -package nu.marginalia.wmsa.edge.integration.stackoverflow; - -import nu.marginalia.util.ParallelPipe; -import nu.marginalia.util.TestLanguageModels; -import nu.marginalia.util.language.conf.LanguageModels; -import nu.marginalia.util.language.processing.DocumentKeywordExtractor; -import nu.marginalia.util.language.processing.sentence.SentenceExtractor; -import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict; -import nu.marginalia.wmsa.edge.integration.model.BasicDocumentData; -import nu.marginalia.wmsa.edge.integration.stackoverflow.model.StackOverflowPost; -import nu.marginalia.wmsa.edge.model.EdgeDomain; -import org.junit.jupiter.api.Disabled; -import org.junit.jupiter.api.Test; -import org.xml.sax.SAXException; - -import javax.xml.parsers.ParserConfigurationException; - -public class StackOverflowPostsTest { - final LanguageModels lm = TestLanguageModels.getLanguageModels(); - - @Test @Disabled("this is stupidly slow") - public void test() throws ParserConfigurationException, SAXException, InterruptedException { - var documentKeywordExtractor = new DocumentKeywordExtractor(new TermFrequencyDict(lm)); - - ThreadLocal processor = ThreadLocal.withInitial(() -> { - return new StackOverflowPostProcessor(new SentenceExtractor(lm), documentKeywordExtractor); - }); - - var pipe = new ParallelPipe("pipe", 10, 5, 2) { - @Override - public BasicDocumentData onProcess(StackOverflowPost stackOverflowPost) { - return processor.get().process(stackOverflowPost); - } - - @Override - public void onReceive(BasicDocumentData stackOverflowIndexData) { - 
System.out.println(stackOverflowIndexData.url); - } - }; - - var reader = new StackOverflowPostsReader("/mnt/storage/downloads.new/stackexchange/sites/philosophy/Posts.xml", new EdgeDomain("philosophy.stackexchange.com"), - pipe::accept); - reader.join(); - System.out.println("Waiting for pipe"); - pipe.join(); - } -} diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/integration/wikipedia/WikipediaTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/integration/wikipedia/WikipediaTest.java deleted file mode 100644 index 40a58e93..00000000 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/integration/wikipedia/WikipediaTest.java +++ /dev/null @@ -1,73 +0,0 @@ -package nu.marginalia.wmsa.edge.integration.wikipedia; - -import lombok.SneakyThrows; -import nu.marginalia.util.ParallelPipe; -import nu.marginalia.util.TestLanguageModels; -import nu.marginalia.util.language.DocumentDebugger; -import nu.marginalia.util.language.conf.LanguageModels; -import nu.marginalia.util.language.processing.DocumentKeywordExtractor; -import nu.marginalia.util.language.processing.sentence.SentenceExtractor; -import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict; -import nu.marginalia.wmsa.edge.integration.model.BasicDocumentData; -import nu.marginalia.wmsa.edge.integration.wikipedia.model.WikipediaArticle; -import nu.marginalia.wmsa.edge.model.EdgeDomain; -import org.jsoup.Jsoup; -import org.junit.jupiter.api.Tag; -import org.junit.jupiter.api.Test; - -import java.io.IOException; - -@Tag("slow") -public class WikipediaTest { - final LanguageModels lm = TestLanguageModels.getLanguageModels(); - - @Test @SneakyThrows - public void test() { - var documentKeywordExtractor = new DocumentKeywordExtractor(new TermFrequencyDict(lm)); - ThreadLocal processor = ThreadLocal.withInitial(() -> { - return new WikipediaProcessor(new SentenceExtractor(lm), documentKeywordExtractor); - }); - - var pipe = new ParallelPipe("pipe", 10, 5, 2) { - @Override - public 
BasicDocumentData onProcess(WikipediaArticle stackOverflowPost) { - return processor.get().process(stackOverflowPost); - } - - @Override - public void onReceive(BasicDocumentData indexData) { - System.out.println(indexData.url); - System.out.println(indexData.title); - System.out.println(indexData.description); - } - }; - - var reader = new WikipediaReader("/home/vlofgren/Work/wikipedia_en_100_nopic_2021-06.zim", new EdgeDomain("encyclopedia.marginalia.nu"), - pipe::accept); - reader.join(); - } - - - @Test @SneakyThrows - public void test2() { - var documentKeywordExtractor = new DocumentKeywordExtractor(new TermFrequencyDict(lm)); - var debugger = new DocumentDebugger(lm); - - ThreadLocal processor = ThreadLocal.withInitial(() -> { - return new WikipediaProcessor(new SentenceExtractor(lm), documentKeywordExtractor); - }); - - var reader = new WikipediaReader("/home/vlofgren/Work/wikipedia_en_100_nopic_2021-06.zim", new EdgeDomain("encyclopedia.marginalia.nu"), - article -> { - try { - debugger.debugDocument(article.url.getPath(), Jsoup.parse(article.body)); - - } catch (IOException e) { - e.printStackTrace(); - } - }); - - reader.join(); - debugger.writeIndex(); - } -} diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/podcasts/PodcastFetcherTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/podcasts/PodcastFetcherTest.java deleted file mode 100644 index 0033464e..00000000 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/podcasts/PodcastFetcherTest.java +++ /dev/null @@ -1,15 +0,0 @@ -package nu.marginalia.wmsa.podcasts; - -import org.junit.jupiter.api.Test; - -import static org.junit.jupiter.api.Assertions.*; - -class PodcastFetcherTest { - - @Test - void fetchPodcast() { - var result = new PodcastFetcher().fetchPodcast("hopwag", "https://rss.acast.com/readmeapoem"); - assertTrue(result.isPresent()); - System.out.println(result); - } -} \ No newline at end of file diff --git a/marginalia_nu/src/test/resources/log4j2.properties 
b/marginalia_nu/src/test/resources/log4j2.properties deleted file mode 100644 index 9c2dbefd..00000000 --- a/marginalia_nu/src/test/resources/log4j2.properties +++ /dev/null @@ -1,15 +0,0 @@ - -status = info - -appender.console.type = Console -appender.console.name = LogToConsole -appender.console.layout.type = PatternLayout -appender.console.layout.pattern = %highlight{%-5level}{FATAL=red, ERROR=red, WARN=yellow} %c{1}- %msg%n - -logger.console.name = nu.marginalia -logger.console.level = debug -logger.console.additivity = false -logger.console.appenderRef.rolling.ref = LogToConsole - -rootLogger.level = info -rootLogger.appenderRef.console.ref = LogToConsole diff --git a/marginalia_nu/src/test/resources/model-data.json b/marginalia_nu/src/test/resources/model-data.json deleted file mode 100644 index 063b93e7..00000000 --- a/marginalia_nu/src/test/resources/model-data.json +++ /dev/null @@ -1 +0,0 @@ -{"comments":[{"id":{"value":"t1_gku7btj"},"submission_id":{"id":"t3_l5ejhj"},"parent_id":{"id":"t3_l5ejhj"},"distinguished":false,"created_utc":"21-01-26 17:35:16","author":"TheEmpathyBox","body":"Hello,\n\nHow would you translate in latin \" *unreliable narrator*\" ?","sequenceNumber":2,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":35,"second":51,"nano":596601000}}},{"id":{"value":"t1_gkudywj"},"submission_id":{"id":"t3_l5ejhj"},"parent_id":{"id":"t1_gku7btj"},"distinguished":false,"created_utc":"21-01-26 18:23:03","author":"kc_kennylau","body":"Literally it would be \"narr?tor ?nfid?lis\", but obviously this is a recent term ([Wiktionary](https://en.wiktionary.org/wiki/unreliable_narrator) says coined in 1961) that does not have an immediate equivalence to Roman literature that comes to 
mind.","sequenceNumber":3,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":35,"second":51,"nano":597256000}}},{"id":{"value":"t1_gkxfa9w"},"submission_id":{"id":"t3_l5ejhj"},"parent_id":{"id":"t1_gkudywj"},"distinguished":false,"created_utc":"21-01-27 09:46:25","author":"BobTheSCV","body":"Seems strange there wouldn\u0027t be a term for this. Unreliable narrators go back as far as Homer with the Odyssey.\n\nEvery part of the story Odysseus narrates is full of fantastical monsters and extremely unlikely events; a complete break from the story as narrated by the voice of Homer which is more in line with the style of the Illiad, a lot more down to earth except for minor interventions by the gods.\n\nAs to make a point, the guy is shown compulsively deceiving every single person he meets: Gods, beasts, enemies, allies alike. Why *wouldn\u0027t* he deceive the audience as well?","sequenceNumber":4,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":35,"second":51,"nano":597341000}}},{"id":{"value":"t1_gkuf0n7"},"submission_id":{"id":"t3_l5ejhj"},"parent_id":{"id":"t1_gku7btj"},"distinguished":false,"created_utc":"21-01-26 18:30:35","author":"BaconJudge","body":"I\u0027d convey that idea with an adjective meaning \"untrustworthy,\" either *narrator infidus* or *narrator infidelis*, the former possibly helping to avoid the secondary religious sense of the latter.\n\nThe noun *narrator* traditionally existed only in masculine form, but the explicitly feminine version *narratrix* appears in newer references like the Vatican\u0027s *Lexicon Recentis Latinitatis* and Rene Hoeven\u0027s *Lexique de la Prose Latine de la Renaissance,* so for a female character you\u0027d have the option of *narratrix infida* (note the change at the end of the adjective) or *narratrix infidelis*, if you 
wanted.","sequenceNumber":5,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":35,"second":51,"nano":597415000}}},{"id":{"value":"t1_gkug4j7"},"submission_id":{"id":"t3_l5ejhj"},"parent_id":{"id":"t1_gku7btj"},"distinguished":false,"created_utc":"21-01-26 18:38:25","author":"lutetiensis","body":"*incredibilis narrator.*\n\n[*incredibilis*](https://logeion.uchicago.edu/incredibilis) \u003d *in* \\+ [*credibilis*](https://logeion.uchicago.edu/credibilis), from [*credere*](https://logeion.uchicago.edu/credo), to believe, to intrust.","sequenceNumber":6,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":35,"second":51,"nano":597483000}}},{"id":{"value":"t1_gkue6uo"},"submission_id":{"id":"t3_l5ejhj"},"parent_id":{"id":"t1_gku7btj"},"distinguished":false,"created_utc":"21-01-26 18:24:37","author":"EgoSumInHorto","body":"\"*Narr?tor viti?sissimus*\"","sequenceNumber":7,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":35,"second":51,"nano":597550000}}},{"id":{"value":"t1_gkv6xf8"},"submission_id":{"id":"t3_l5ejhj"},"parent_id":{"id":"t1_gkue6uo"},"distinguished":false,"created_utc":"21-01-26 21:39:47","author":"Tharadin1970","body":"Wouldnt that be more \"a very vice-ful narrator\"?","sequenceNumber":8,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":35,"second":51,"nano":597617000}}},{"id":{"value":"t1_gkv76xq"},"submission_id":{"id":"t3_l5ejhj"},"parent_id":{"id":"t1_gkv6xf8"},"distinguished":false,"created_utc":"21-01-26 21:41:21","author":"EgoSumInHorto","body":"I couldn\u0027t find a word for \"unreliable\"; \"viti?sus\" means \"full of faults, corrupt, vicious, morally faulty, defective\", hence 
\"unreliable\"","sequenceNumber":9,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":35,"second":51,"nano":597684000}}},{"id":{"value":"t1_gkv7rzf"},"submission_id":{"id":"t3_l5ejhj"},"parent_id":{"id":"t1_gkv76xq"},"distinguished":false,"created_utc":"21-01-26 21:45:06","author":"lutetiensis","body":"\u003e \"unreliable\"\n\nIncredibilis (see my comment).","sequenceNumber":10,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":35,"second":51,"nano":597750000}}},{"id":{"value":"t1_gkv848d"},"submission_id":{"id":"t3_l5ejhj"},"parent_id":{"id":"t1_gkv7rzf"},"distinguished":false,"created_utc":"21-01-26 21:47:13","author":"EgoSumInHorto","body":"That should have been a pretty obvious derivation to make... Doh!\nThanks :)","sequenceNumber":11,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":35,"second":51,"nano":597823000}}},{"id":{"value":"t1_gkup1zh"},"submission_id":{"id":"t3_l5ejhj"},"parent_id":{"id":"t3_l5ejhj"},"distinguished":false,"created_utc":"21-01-26 19:39:00","author":"Gopnikcykablyat","body":"How do you say \"Hope never dies, because hope is the killer\" ?","sequenceNumber":12,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":35,"second":51,"nano":597887000}}},{"id":{"value":"t1_gkuvwow"},"submission_id":{"id":"t3_l5ejhj"},"parent_id":{"id":"t1_gkup1zh"},"distinguished":false,"created_utc":"21-01-26 20:26:06","author":"lutetiensis","body":"For stylistic reasons, I would render it as:\n\n*spes non decedit sed caedit.*\n\nHope doesn\u0027t die, but kills. 
You can replace *non* with *numquam* (\"never\") *sed* with *quia* (\"because \\[it\\]\").","sequenceNumber":13,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":35,"second":51,"nano":597954000}}},{"id":{"value":"t1_gkuqvlo"},"submission_id":{"id":"t3_l5ejhj"},"parent_id":{"id":"t1_gkup1zh"},"distinguished":false,"created_utc":"21-01-26 19:51:09","author":"BluuDuud","body":"Id say, \"sp?s numquam moritur, nam sp?s nec?tor est\"","sequenceNumber":14,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":35,"second":51,"nano":598026000}}},{"id":{"value":"t1_gkvrzwi"},"submission_id":{"id":"t3_l5ejhj"},"parent_id":{"id":"t1_gkuqvlo"},"distinguished":false,"created_utc":"21-01-27 00:08:45","author":"magistramegaera","body":"Should it be necatrix instead of necator, since spes is feminine? Or does that not really matter with an abstract concept like this?","sequenceNumber":15,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":35,"second":51,"nano":598092000}}},{"id":{"value":"t1_gl60nlx"},"submission_id":{"id":"t3_l5ejhj"},"parent_id":{"id":"t1_gkvrzwi"},"distinguished":false,"created_utc":"21-01-29 01:01:50","author":"Sochamelet","body":"I wouldn\u0027t say it\u0027s wrong per se to use *necator*, but in my experience, Roman authors were generally inclined to preserve correspondences between the gender of words, if possible.","sequenceNumber":16,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":35,"second":51,"nano":598158000}}},{"id":{"value":"t1_gkwj57n"},"submission_id":{"id":"t3_l5ejhj"},"parent_id":{"id":"t1_gkvrzwi"},"distinguished":false,"created_utc":"21-01-27 03:48:35","author":"BluuDuud","body":"I forgot that, I think you\u0027re 
correct","sequenceNumber":17,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":35,"second":51,"nano":598223000}}},{"id":{"value":"t1_gkzsh08"},"submission_id":{"id":"t3_l5ejhj"},"parent_id":{"id":"t1_gkwj57n"},"distinguished":false,"created_utc":"21-01-27 21:15:26","author":"glaraaaaaaah","body":"No it doesn?t, _necator_ is a masculine noun not an adjective, so it doesn?t need to agree. Unless you want to specify that hope is female, male-gendered words are more common for abstract concepts I think","sequenceNumber":18,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":35,"second":51,"nano":598287000}}},{"id":{"value":"t1_gkuthtv"},"submission_id":{"id":"t3_l5ejhj"},"parent_id":{"id":"t3_l5ejhj"},"distinguished":false,"created_utc":"21-01-26 20:09:24","author":"Youngerthandumb","body":"There was a quote from a Scandanavian bishop who, on his deathbed suffering from intense pains, cried out \"Do not pray me out of god\u0027s battle!\" when his colleagues gathered round to pray for his recovery. I thought that was kind of metal. How would that translate to Latin?","sequenceNumber":19,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":35,"second":51,"nano":598351000}}},{"id":{"value":"t1_gkv2ymf"},"submission_id":{"id":"t3_l5ejhj"},"parent_id":{"id":"t1_gkuthtv"},"distinguished":false,"created_utc":"21-01-26 21:13:38","author":"nimbleping","body":"*M? (mihi (?)) ? pugn? De? n?n d?prec?re/d?prec?min?* (sg./pl. addressees).\n\n(Do not intercede by prayer on behalf of me away from the battle of God.)\n\nEDIT: I\u0027m not 100% sure if the accusative *m?* or the dative *mihi* should be used here. I\u0027d who knows to offer an opinion. 
I figure the ablative of motion away from would be appropriate here.","sequenceNumber":20,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":35,"second":51,"nano":598419000}}},{"id":{"value":"t1_gkv8czl"},"submission_id":{"id":"t3_l5ejhj"},"parent_id":{"id":"t1_gkv2ymf"},"distinguished":false,"created_utc":"21-01-26 21:48:49","author":"Youngerthandumb","body":"Thank you I appreciate it! I took 2 years of high school latin but I don\u0027t trust myself to translate anything beyond \"Caecilius ad venit\".","sequenceNumber":21,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":35,"second":51,"nano":598484000}}},{"id":{"value":"t1_gkv9lku"},"submission_id":{"id":"t3_l5ejhj"},"parent_id":{"id":"t3_l5ejhj"},"distinguished":false,"created_utc":"21-01-26 21:57:03","author":"XENO-BLAZE","body":"How would you translate:\n\nAllow me to impress upon you the severe mistake you have made. For years my conduct has been largely benign. And yet, without provocation, you have severed our détente and forced me to unleash upon you the vengeful flames of a thousand suns. You shall curse your mother for the day of your birth. So, go now, go, and begin your life of fear, knowing that when you least expect it, the looming sword of Damocles will crash upon you, cleaving you in twain and as you gaze upon the smoking wreckage that was once your life, you will regret the day you crossed me","sequenceNumber":22,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":35,"second":51,"nano":598545000}}},{"id":{"value":"t1_gkve8fd"},"submission_id":{"id":"t3_l5ejhj"},"parent_id":{"id":"t3_l5ejhj"},"distinguished":false,"created_utc":"21-01-26 22:28:44","author":"iillltt","body":"Hello! I\u0027m not sure if this fits here but I was wondering if anyone would be able to identify/translate what the chant in the beginning of [this song](https://youtu.be/3oUUG7Mfoc4) is. 
I think it\u0027s in Latin so forgive me if it\u0027s not","sequenceNumber":23,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":35,"second":51,"nano":598610000}}},{"id":{"value":"t1_gkvkja8"},"submission_id":{"id":"t3_l5ejhj"},"parent_id":{"id":"t1_gkve8fd"},"distinguished":false,"created_utc":"21-01-26 23:12:05","author":"lutetiensis","body":"That\u0027s hard... *regum satus*? *sanctus?* Do you know what they sampled?","sequenceNumber":24,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":35,"second":51,"nano":598686000}}},{"id":{"value":"t1_gkvm21e"},"submission_id":{"id":"t3_l5ejhj"},"parent_id":{"id":"t1_gkvkja8"},"distinguished":false,"created_utc":"21-01-26 23:23:51","author":"iillltt","body":"unfortunately not, i\u0027ve heard other people say it\u0027s \u0027Spiritus Sanctus\u0027 but i haven\u0027t been able to find the original sample- same with regum satus which doesn\u0027t have any search results. thank you so much as well","sequenceNumber":25,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":35,"second":51,"nano":598753000}}},{"id":{"value":"t1_gkvmwyy"},"submission_id":{"id":"t3_l5ejhj"},"parent_id":{"id":"t1_gkvm21e"},"distinguished":false,"created_utc":"21-01-26 23:30:26","author":"lutetiensis","body":"Ok. Sorry. I don\u0027t think I can do more on this one.","sequenceNumber":26,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":35,"second":51,"nano":598819000}}},{"id":{"value":"t1_gkvnbr3"},"submission_id":{"id":"t3_l5ejhj"},"parent_id":{"id":"t1_gkvmwyy"},"distinguished":false,"created_utc":"21-01-26 23:33:31","author":"iillltt","body":"thank you so much for trying!!! 
i\u0027ll be questioning producers where they get samples from next haha","sequenceNumber":27,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":35,"second":51,"nano":598880000}}},{"id":{"value":"t1_gkvm2ak"},"submission_id":{"id":"t3_l5ejhj"},"parent_id":{"id":"t1_gkve8fd"},"distinguished":false,"created_utc":"21-01-26 23:23:54","author":"ragnrikr","body":"Can\u0027t really help (to my ears it sounds like (sectum/secum) (satu/sato) I.e. gibberish), just wanted to point out this post https://www.reddit.com/r/kpophelp/comments/g49xob/latin_in_gottasadae/\n\nNo source sadly :/","sequenceNumber":28,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":35,"second":51,"nano":598944000}}},{"id":{"value":"t1_gkvn3fo"},"submission_id":{"id":"t3_l5ejhj"},"parent_id":{"id":"t1_gkvm2ak"},"distinguished":false,"created_utc":"21-01-26 23:31:47","author":"iillltt","body":"Yep :( \nI was reminded of this question when someone asked a similar question as to your linked post so it still hasn\u0027t been answered. well on the bright side I got to share a nice song and the mystery will remain unsolved ...","sequenceNumber":29,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":35,"second":51,"nano":599016000}}},{"id":{"value":"t1_gkw3c5e"},"submission_id":{"id":"t3_l5ejhj"},"parent_id":{"id":"t3_l5ejhj"},"distinguished":false,"created_utc":"21-01-27 01:38:08","author":"luisafonsoteixeira","body":"I\u0027ve recently came across the US Navy Academy saying \"Ex Scientia Tridens\", ?Through Knowledge, Sea Power?. 
How could one correctly say \"through knowledge, power\"?","sequenceNumber":30,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":35,"second":51,"nano":599082000}}},{"id":{"value":"t1_gkwzw4j"},"submission_id":{"id":"t3_l5ejhj"},"parent_id":{"id":"t1_gkw3c5e"},"distinguished":false,"created_utc":"21-01-27 06:24:34","author":"lutetiensis","body":"\u003epower\n\n[Potentia](https://logeion.uchicago.edu/potentia), [imperium](https://logeion.uchicago.edu/imperium), [fortitudo](https://logeion.uchicago.edu/fortitudo)...\n\nNote the original motto is poetic. It doesn\u0027t say \"sea power\", but instead \"\\[Neptune\u0027s\\] trident\". You could find an artifact that would represent power for you (a sword? a crown?).","sequenceNumber":31,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":35,"second":51,"nano":599146000}}},{"id":{"value":"t1_gkw4obz"},"submission_id":{"id":"t3_l5ejhj"},"parent_id":{"id":"t3_l5ejhj"},"distinguished":false,"created_utc":"21-01-27 01:48:56","author":"Eldonith","body":"Help me fix a cringe tattoo!\n\nBackground: I got a tattoo back when the emo craze was big and Hot Topic was one of the most popular stores in the mall about 12 years ago. It reads \"Nascentes Morimur\" which roughly translated to \"When we are born we begin to die.\" It seemed cool at the time in my addled 18 year old mind, but I\u0027m a family man now and not only does it no longer represent my mindset. but it\u0027s even embarrassing to explain.\n\nWhat line (in Latin) would you suggest I add to brighten it up? Please include a translation, as my highschool Latin days are far behind me. 
Thanks!","sequenceNumber":32,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":35,"second":51,"nano":599207000}}},{"id":{"value":"t1_gkxz2rq"},"submission_id":{"id":"t3_l5ejhj"},"parent_id":{"id":"t1_gkw4obz"},"distinguished":false,"created_utc":"21-01-27 14:11:03","author":"BaconJudge","body":"Given the constraint that it\u0027ll contain a reference to dying, is there any particular sentiment you want to convey? Because you\u0027re a family man, maybe you could expand it to something like *Amati Nascentes, Morimur Amati\" (inserting the optional comma if there\u0027s room) to imply roughly \"Born loved, we die loved.\"","sequenceNumber":33,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":35,"second":51,"nano":599267000}}},{"id":{"value":"t1_gl1ij4p"},"submission_id":{"id":"t3_l5ejhj"},"parent_id":{"id":"t1_gkxz2rq"},"distinguished":false,"created_utc":"21-01-28 04:12:22","author":"Eldonith","body":"Wow that\u0027s beautiful and exactly the kinda sentiment I\u0027d like it changed to! I love how you transformed it with a word added to the beginning and end instead of a whole 2nd line. 
I may very well end up going with this unless anybody can top that suggestion.","sequenceNumber":34,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":35,"second":51,"nano":599334000}}},{"id":{"value":"t1_gl83w6w"},"submission_id":{"id":"t3_l5ejhj"},"parent_id":{"id":"t1_gl1ij4p"},"distinguished":false,"created_utc":"21-01-29 12:52:05","author":"BaconJudge","body":"Happy to help, and I hope the tattoo change works out for you.","sequenceNumber":35,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":35,"second":51,"nano":599401000}}},{"id":{"value":"t1_gkwk2sa"},"submission_id":{"id":"t3_l5ejhj"},"parent_id":{"id":"t3_l5ejhj"},"distinguished":false,"created_utc":"21-01-27 03:56:29","author":"reds3232","body":" mater servum vituperavit","sequenceNumber":36,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":35,"second":51,"nano":599464000}}},{"id":{"value":"t1_gkwlsx3"},"submission_id":{"id":"t3_l5ejhj"},"parent_id":{"id":"t3_l5ejhj"},"distinguished":false,"created_utc":"21-01-27 04:11:09","author":"megtheedemon","body":"How do you say ?I would rather be studying latin?","sequenceNumber":37,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":35,"second":51,"nano":599524000}}},{"id":{"value":"t1_gkxgp1b"},"submission_id":{"id":"t3_l5ejhj"},"parent_id":{"id":"t1_gkwlsx3"},"distinguished":false,"created_utc":"21-01-27 10:07:50","author":"kc_kennylau","body":"Lat?nae linguae stud?re m?l?","sequenceNumber":38,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":35,"second":51,"nano":599586000}}},{"id":{"value":"t1_gkx6rax"},"submission_id":{"id":"t3_l5ejhj"},"parent_id":{"id":"t3_l5ejhj"},"distinguished":false,"created_utc":"21-01-27 07:45:53","author":"ResidentGift","body":"In a game I\u0027m playing, there are characters named Alatus, Bosacius, Indarias, Bonanus, and Menogias. 
I\u0027m pretty sure Alatus is the Latin word for \"*winged*\" (or at least related to \"*wing*\") and it also suits the character\u0027s motif. The other four names sound Latin-ish, but I can\u0027t find anything on them. Can anyone confirm if the other four names are Latin or rooted in Latin? If yes, how would they be translated?","sequenceNumber":39,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":35,"second":51,"nano":599646000}}},{"id":{"value":"t1_gkxwqf0"},"submission_id":{"id":"t3_l5ejhj"},"parent_id":{"id":"t1_gkx6rax"},"distinguished":false,"created_utc":"21-01-27 13:45:42","author":"BaconJudge","body":"You\u0027re right about Alatus, but I don\u0027t think the others are Latin-derived. Bosacius could have been loosely inspired by words like *boscis* (\"waterfowl\") or Medieval Latin *boscus* (\"woods\") if either of those makes sense for the character. If Bonanus is the good guy, the name might be inspired by the very common Latin adjective *bonus* (\"good\"). Words or names ending in -as are likely to be Greek rather than Latin, and I don\u0027t recognize any promising Latin roots for those two unless Indarias is from India, which is the same in Latin.","sequenceNumber":40,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":35,"second":51,"nano":599714000}}},{"id":{"value":"t1_gl0mrhe"},"submission_id":{"id":"t3_l5ejhj"},"parent_id":{"id":"t1_gkxwqf0"},"distinguished":false,"created_utc":"21-01-28 00:36:11","author":"ResidentGift","body":"Thank you for the help!","sequenceNumber":41,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":35,"second":51,"nano":599777000}}},{"id":{"value":"t1_gkxpfxd"},"submission_id":{"id":"t3_l5ejhj"},"parent_id":{"id":"t3_l5ejhj"},"distinguished":false,"created_utc":"21-01-27 12:17:05","author":"stickybeak7","body":"Hi there! 
Hoping to get a translation similar to \"memento mori\" but for \"remember you are loved\" for a valentines gift :o) thank you so much!","sequenceNumber":42,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":35,"second":51,"nano":599838000}}},{"id":{"value":"t1_gkxsyty"},"submission_id":{"id":"t3_l5ejhj"},"parent_id":{"id":"t1_gkxpfxd"},"distinguished":false,"created_utc":"21-01-27 13:02:11","author":"aveCaecilius","body":"memento amari","sequenceNumber":43,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":35,"second":51,"nano":599899000}}},{"id":{"value":"t1_gkzyg27"},"submission_id":{"id":"t3_l5ejhj"},"parent_id":{"id":"t1_gkxsyty"},"distinguished":false,"created_utc":"21-01-27 21:54:16","author":"stickybeak7","body":"thank you so much! ?","sequenceNumber":44,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":35,"second":51,"nano":599965000}}},{"id":{"value":"t1_gkxpgd0"},"submission_id":{"id":"t3_l5ejhj"},"parent_id":{"id":"t3_l5ejhj"},"distinguished":false,"created_utc":"21-01-27 12:17:16","author":"pierro_la_place","body":"Hi there!\n\nI am building a mediaval-ish world in which the people of kingdom A really don\u0027t like kingdom B they are at war with, to the point that they refuse to pronounce its name. Instead they say someting along the lines of \"land of the rapists\", but in Latin to give it an almost religious tone. 
After a bit of research, the translation I was thinking about was \"terra stuparotes\", but I am not an expert in Latin so I would like to know what you think.\n\nThanks!","sequenceNumber":45,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":35,"second":51,"nano":600039000}}},{"id":{"value":"t1_gkxvjvf"},"submission_id":{"id":"t3_l5ejhj"},"parent_id":{"id":"t1_gkxpgd0"},"distinguished":false,"created_utc":"21-01-27 13:32:32","author":"BaconJudge","body":"That\u0027s probably a typo for *stupratores*, which is the plural form of the word [*stuprator*](http://www.perseus.tufts.edu/hopper/text?doc\u003dPerseus%3Atext%3A1999.04.0059%3Aentry%3Dstuprator) when used as the subject of a sentence. Because you want it as a possessive plural (\"of the rapists\"), the phrase would be *terra stupratorum*.","sequenceNumber":46,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":35,"second":51,"nano":600100000}}},{"id":{"value":"t1_gkxypyz"},"submission_id":{"id":"t3_l5ejhj"},"parent_id":{"id":"t1_gkxvjvf"},"distinguished":false,"created_utc":"21-01-27 14:07:19","author":"pierro_la_place","body":"Yup, I mixed up accusative and genitive. Good thing I asked! Thx.","sequenceNumber":47,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":35,"second":51,"nano":600161000}}},{"id":{"value":"t1_gkxzjnm"},"submission_id":{"id":"t3_l5ejhj"},"parent_id":{"id":"t1_gkxypyz"},"distinguished":false,"created_utc":"21-01-27 14:15:50","author":"BaconJudge","body":"You\u0027re welcome, anytime.","sequenceNumber":48,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":35,"second":51,"nano":600231000}}},{"id":{"value":"t1_gkylxmm"},"submission_id":{"id":"t3_l5ejhj"},"parent_id":{"id":"t3_l5ejhj"},"distinguished":false,"created_utc":"21-01-27 16:55:07","author":"Koa1121","body":"How would you say ?Where easy becomes hard? 
or ?Where easy is hard?","sequenceNumber":49,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":35,"second":51,"nano":600292000}}},{"id":{"value":"t1_gl38i9x"},"submission_id":{"id":"t3_l5ejhj"},"parent_id":{"id":"t1_gkylxmm"},"distinguished":false,"created_utc":"21-01-28 15:09:01","author":"quintus_sub_rosa","body":"in quo facile fit difficile.","sequenceNumber":50,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":35,"second":51,"nano":600351000}}},{"id":{"value":"t1_gl0zg1x"},"submission_id":{"id":"t3_l5ejhj"},"parent_id":{"id":"t3_l5ejhj"},"distinguished":false,"created_utc":"21-01-28 01:53:37","author":"GIGABIT","body":"What\u0027s up my dudes. WSB \"investor\" here. \n\nIn light of the ongoing GameStop insanity you might have heard about I thought it could be cool to get a tattoo celebrating the gains. Naturally it has to be the \"Power to the players\" slogan, but from what I understand, the word \"player\" isn\u0027t really a thing in Latin.\n\nI thought \"Power to the Gambler\" might be a more appropriate choice considering the nature of both parties involved, and because there actually might be a proper word for \"gambler\".\n\nSo I thought I\u0027d ask you guys for some advice about the sentence so I don\u0027t go printing myself with something stupid like \"short the market\".\n\nThanks in advance!","sequenceNumber":51,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":35,"second":51,"nano":600408000}}},{"id":{"value":"t1_gl29fog"},"submission_id":{"id":"t3_l5ejhj"},"parent_id":{"id":"t1_gl0zg1x"},"distinguished":false,"created_utc":"21-01-28 08:38:37","author":"kc_kennylau","body":"potentia ad aleatores\n\nPS: don\u0027t trust internet strangers (such as me!) 
for tattoo","sequenceNumber":52,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":35,"second":51,"nano":600486000}}},{"id":{"value":"t1_gl33bxa"},"submission_id":{"id":"t3_l5ejhj"},"parent_id":{"id":"t1_gl29fog"},"distinguished":false,"created_utc":"21-01-28 14:37:06","author":"GIGABIT","body":"Thanks a lot!\n\nI SHOULD know better than to trust internet strangers... Then again, you know what everyone is over at WSB, so..","sequenceNumber":53,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":35,"second":51,"nano":600548000}}},{"id":{"value":"t1_gl1qbxz"},"submission_id":{"id":"t3_l5ejhj"},"parent_id":{"id":"t3_l5ejhj"},"distinguished":false,"created_utc":"21-01-28 05:16:51","author":"dpm5150","body":"I want to mount a motto carved in wood in my library. I?m trying to figure out a good noun to express ?usefulness?. I?m want to express to my son that he should strive for contributing to society in ways that return tangible value. Usefulness, itself, is a dull word, so I?m not sure if Latin has something with a little more zing.","sequenceNumber":54,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":35,"second":51,"nano":600612000}}},{"id":{"value":"t1_gl29810"},"submission_id":{"id":"t3_l5ejhj"},"parent_id":{"id":"t1_gl1qbxz"},"distinguished":false,"created_utc":"21-01-28 08:35:56","author":"kc_kennylau","body":"utilitas / ?tilit?s (whence English \"utility\")","sequenceNumber":55,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":35,"second":51,"nano":600672000}}},{"id":{"value":"t1_gl3020g"},"submission_id":{"id":"t3_l5ejhj"},"parent_id":{"id":"t1_gl29810"},"distinguished":false,"created_utc":"21-01-28 14:13:35","author":"dpm5150","body":"Yes, this definitely can work. Now I have to decide how inspirational it is for a motto. 
Thank you so much.","sequenceNumber":56,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":35,"second":51,"nano":600732000}}},{"id":{"value":"t1_gl4ppso"},"submission_id":{"id":"t3_l5ejhj"},"parent_id":{"id":"t3_l5ejhj"},"distinguished":false,"created_utc":"21-01-28 20:03:46","author":"Rashed8StringVi","body":"From the popular phrase ?In Vino Veritas?, how would one correctly substitute ?blood? instead of ?wine? such that the phrase becomes ?in blood is the truth??","sequenceNumber":57,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":35,"second":51,"nano":600790000}}},{"id":{"value":"t1_gl54pgr"},"submission_id":{"id":"t3_l5ejhj"},"parent_id":{"id":"t1_gl4ppso"},"distinguished":false,"created_utc":"21-01-28 21:34:34","author":"kc_kennylau","body":"in sanguine veritas","sequenceNumber":58,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":35,"second":51,"nano":600847000}}},{"id":{"value":"t1_gl55c0i"},"submission_id":{"id":"t3_l5ejhj"},"parent_id":{"id":"t3_l5ejhj"},"distinguished":false,"created_utc":"21-01-28 21:38:27","author":"coralcakes","body":"How would An Appeal to Heaven be translated into latin?","sequenceNumber":59,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":35,"second":51,"nano":600903000}}},{"id":{"value":"t1_gl5nhd2"},"submission_id":{"id":"t3_l5ejhj"},"parent_id":{"id":"t1_gl55c0i"},"distinguished":false,"created_utc":"21-01-28 23:28:44","author":"jayzwasinnirvana","body":"*Obsecratio ad caelum* is one way. I\u0027m not sure if there is an existing phrase. 
I did not use *appellatio* because I think it\u0027s meaning in Latin is more strictly legal.","sequenceNumber":60,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":35,"second":51,"nano":600978000}}},{"id":{"value":"t1_gl5quql"},"submission_id":{"id":"t3_l5ejhj"},"parent_id":{"id":"t1_gl55c0i"},"distinguished":false,"created_utc":"21-01-28 23:52:04","author":"kc_kennylau","body":"Quite literally \"appellatio ad caelum\".","sequenceNumber":61,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":35,"second":51,"nano":601035000}}},{"id":{"value":"t1_gl57uz9"},"submission_id":{"id":"t3_l5ejhj"},"parent_id":{"id":"t3_l5ejhj"},"distinguished":false,"created_utc":"21-01-28 21:53:32","author":"RandyDautzenberg","body":"Hey all, \n\n\nCurrently I\u0027m looking for a translation of \u0027to the top\u0027 / \u0027the top\u0027 in Latin. Some internet translators are giving me different translations. Most of the time they translate it as \u0027ad summitatem\u0027 or \u0027ad verticem\u0027. \n\n\nI really like the word \u0027verticem\u0027, haha. So I was wondering if you can also use \u0027verticem\u0027 without the preposition \u0027ad\u0027. Or would that be grammatically incorrect? \n\n\nMany thanks! :)","sequenceNumber":62,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":35,"second":51,"nano":601091000}}},{"id":{"value":"t1_gl5khcv"},"submission_id":{"id":"t3_l5ejhj"},"parent_id":{"id":"t1_gl57uz9"},"distinguished":false,"created_utc":"21-01-28 23:08:37","author":"kc_kennylau","body":"\"the top\" \u003d apex / vertex\n\n\"to the top\" \u003d ad apicem / ad verticem\n\nIt would be better to provide context (e.g. 
whole sentence).","sequenceNumber":63,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":35,"second":51,"nano":601147000}}},{"id":{"value":"t1_gl5lazb"},"submission_id":{"id":"t3_l5ejhj"},"parent_id":{"id":"t1_gl5khcv"},"distinguished":false,"created_utc":"21-01-28 23:14:06","author":"RandyDautzenberg","body":"Many thanks for your quick reply! Well, I would like to use it as a brand name, haha. So that means that using verticem without the preposition is not grammatically correct, right?","sequenceNumber":64,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":35,"second":51,"nano":601203000}}},{"id":{"value":"t1_gl5avpy"},"submission_id":{"id":"t3_l5ejhj"},"parent_id":{"id":"t3_l5ejhj"},"distinguished":false,"created_utc":"21-01-28 22:11:03","author":"ContuberniumSPQR","body":"Hello \n\nI have to determine the word Casus from Casus,Casus I think it is a nominative but that doesn\u0027t fit with its function in the phrase could it be an accusative?","sequenceNumber":65,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":35,"second":51,"nano":601268000}}},{"id":{"value":"t1_gl5kal7"},"submission_id":{"id":"t3_l5ejhj"},"parent_id":{"id":"t1_gl5avpy"},"distinguished":false,"created_utc":"21-01-28 23:07:22","author":"kc_kennylau","body":"Perfacilis inventu ex [Wiktionary](https://en.wiktionary.org/wiki/casus#Latin)","sequenceNumber":66,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":35,"second":51,"nano":601343000}}},{"id":{"value":"t1_gl7qke4"},"submission_id":{"id":"t3_l5ejhj"},"parent_id":{"id":"t1_gl5kal7"},"distinguished":false,"created_utc":"21-01-29 09:59:22","author":"ContuberniumSPQR","body":" Gratias tibi 
ago","sequenceNumber":67,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":35,"second":51,"nano":601398000}}},{"id":{"value":"t1_gl6kjfb"},"submission_id":{"id":"t3_l5ejhj"},"parent_id":{"id":"t3_l5ejhj"},"distinguished":false,"created_utc":"21-01-29 03:30:49","author":"Havatra","body":"Hello!\n\nI need some translation help with this sentence:\n\n\"Remember you must die, so [do] thrive(vigorously), while you are [still] able to.\" \n\"Memento mori; ita vigemusque, dum es possunt.\"\n\nDoes this sound correct? Or is there perhaps a different way you\u0027d rather put it?\n\nI appreciate all help! :-)","sequenceNumber":68,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":35,"second":51,"nano":601454000}}},{"id":{"value":"t1_gl7b1cb"},"submission_id":{"id":"t3_l5ejhj"},"parent_id":{"id":"t3_l5ejhj"},"distinguished":false,"created_utc":"21-01-29 07:03:49","author":"Ghost_1243","body":"Hello, I\u0027m trying to translate the following the phrases:\n\n1. A Love of One\u0027s Fate\n2. The Will to Power\n3. Strive for a Higher Purpose\n4. Embrace the Ordinary","sequenceNumber":69,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":35,"second":51,"nano":601510000}}},{"id":{"value":"t1_gl7bfpk"},"submission_id":{"id":"t3_l5ejhj"},"parent_id":{"id":"t3_l5ejhj"},"distinguished":false,"created_utc":"21-01-29 07:07:53","author":"DV5161","body":"I?m having trouble translating ?live and die? so when I put that into google translate I get back ?vivere et mori? but when I switch it and go from Latin to English I get back ?live and? but when I put in ?vivamus, moriendum est.? I get back ?live and die? 
if someone could help me with which is right and wrong it would be greatly appreciated!","sequenceNumber":70,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":35,"second":51,"nano":601564000}}},{"id":{"value":"t1_gl7hq77"},"submission_id":{"id":"t3_l5ejhj"},"parent_id":{"id":"t1_gl7bfpk"},"distinguished":false,"created_utc":"21-01-29 08:13:05","author":"kc_kennylau","body":"Anything google translate says is wrong.\n\n\"live and die\" \u003d vive et morere (command to 1 person) / vivete et morimini (command to multiple people)\n\nYou can also replace the \"et\" to \"atque\".","sequenceNumber":71,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":35,"second":51,"nano":601618000}}},{"id":{"value":"t1_gl7hzbb"},"submission_id":{"id":"t3_l5ejhj"},"parent_id":{"id":"t1_gl7hq77"},"distinguished":false,"created_utc":"21-01-29 08:15:50","author":"DV5161","body":"Lmao I had no idea but ok nice, glad to have got that cleared up thank you so much!","sequenceNumber":72,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":35,"second":51,"nano":601680000}}},{"id":{"value":"t1_gkuwp93"},"submission_id":{"id":"t3_l5ejhj"},"parent_id":{"id":"t3_l5ejhj"},"distinguished":false,"created_utc":"21-01-26 20:31:29","author":"ki4clz","body":"Romanes eunt domis...?","sequenceNumber":73,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":35,"second":51,"nano":601737000}}},{"id":{"value":"t1_gkuzut8"},"submission_id":{"id":"t3_l5ejhj"},"parent_id":{"id":"t1_gkuwp93"},"distinguished":false,"created_utc":"21-01-26 20:52:42","author":"kc_kennylau","body":"People called Romanes, they go, the house?","sequenceNumber":74,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":35,"second":51,"nano":601792000}}},{"id":{"value":"t1_gkvxdoo"},"submission_id":{"id":"t3_l5ejhj"},"parent_id":{"id":"t1_gkuzut8"},"distinguished":false,"created_utc":"21-01-27 
00:50:38","author":"ki4clz","body":"It says *Romans Go Home!*...","sequenceNumber":75,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":35,"second":51,"nano":601848000}}},{"id":{"value":"t1_gkx8lf8"},"submission_id":{"id":"t3_l5ejhj"},"parent_id":{"id":"t1_gkvxdoo"},"distinguished":false,"created_utc":"21-01-27 08:10:02","author":"rsotnik","body":"It doesn\u0027t :)\n\nhttps://en.m.wikipedia.org/wiki/Romani_ite_domum","sequenceNumber":76,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":35,"second":51,"nano":601904000}}},{"id":{"value":"t1_gkxml3q"},"submission_id":{"id":"t3_l5ejhj"},"parent_id":{"id":"t1_gkx8lf8"},"distinguished":false,"created_utc":"21-01-27 11:36:21","author":"ki4clz","body":"but latin for Roman is, Romanus...?","sequenceNumber":77,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":35,"second":51,"nano":601957000}}},{"id":{"value":"t1_gkxn1hb"},"submission_id":{"id":"t3_l5ejhj"},"parent_id":{"id":"t1_gkxml3q"},"distinguished":false,"created_utc":"21-01-27 11:42:53","author":"rsotnik","body":"Sorry, you lost me :) Do you want to know what \" Romanes eunt domis\" means or how one says \"Romans, go home!\"?","sequenceNumber":78,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":35,"second":51,"nano":602016000}}},{"id":{"value":"t1_gkxo45g"},"submission_id":{"id":"t3_l5ejhj"},"parent_id":{"id":"t1_gkxn1hb"},"distinguished":false,"created_utc":"21-01-27 11:58:33","author":"ki4clz","body":"Some kind soul was patient with me in this thread, and showed me the way: *(you may need to check their 
work?)*\n\nhttps://www.reddit.com/r/dankchristianmemes/comments/kp0ua2/a_catholic_a_protestant_and_an_orthodox_walk_into/ghv5c44?utm_medium\u003dandroid_app\u0026utm_source\u003dshare\u0026context\u003d3","sequenceNumber":79,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":35,"second":51,"nano":602068000}}},{"id":{"value":"t1_gkxoeil"},"submission_id":{"id":"t3_l5ejhj"},"parent_id":{"id":"t1_gkxo45g"},"distinguished":false,"created_utc":"21-01-27 12:02:38","author":"rsotnik","body":"Haha, you played me all along :)","sequenceNumber":80,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":35,"second":51,"nano":602120000}}},{"id":{"value":"t1_gkxok0c"},"submission_id":{"id":"t3_l5ejhj"},"parent_id":{"id":"t1_gkxoeil"},"distinguished":false,"created_utc":"21-01-27 12:04:44","author":"ki4clz","body":"I\u0027ve only ever had one person take the bait... was hoping for a rematch...\n\n***Happy Cake Day!***\n-","sequenceNumber":81,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":35,"second":51,"nano":602170000}}},{"id":{"value":"t1_gl5au93"},"submission_id":{"id":"t3_l765yl"},"parent_id":{"id":"t3_l765yl"},"distinguished":false,"created_utc":"21-01-28 22:10:49","author":"CabezadeVaca_","body":"Russian priests singing in Latin?? 
Very beautiful but also very odd","sequenceNumber":83,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":35,"second":53,"nano":161325000}}},{"id":{"value":"t1_gl5h3k4"},"submission_id":{"id":"t3_l765yl"},"parent_id":{"id":"t1_gl5au93"},"distinguished":false,"created_utc":"21-01-28 22:48:23","author":"HanSo1oCup","body":"*Russian Orthodox* but the monastery is located in WV, USA","sequenceNumber":84,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":35,"second":53,"nano":161442000}}},{"id":{"value":"t1_gl5i3sq"},"submission_id":{"id":"t3_l765yl"},"parent_id":{"id":"t1_gl5h3k4"},"distinguished":false,"created_utc":"21-01-28 22:54:09","author":"CabezadeVaca_","body":"Well that seems especially odd to me if they?re not even Eastern Catholics. I?ve never understood the Russian or the Greek churches to have much respect for Latin as a liturgical language, but of course that?s just based on my own experiences","sequenceNumber":85,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":35,"second":53,"nano":161497000}}},{"id":{"value":"t1_gl5pu5j"},"submission_id":{"id":"t3_l765yl"},"parent_id":{"id":"t1_gl5i3sq"},"distinguished":false,"created_utc":"21-01-28 23:45:00","author":"greetings_traveler2","body":"yeah, it\u0027s pretty uncommon for Orthodox priests, I love their chants in ancient Greek though","sequenceNumber":86,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":35,"second":53,"nano":161547000}}},{"id":{"value":"t1_gl4um7f"},"submission_id":{"id":"t3_l765yl"},"parent_id":{"id":"t3_l765yl"},"distinguished":false,"created_utc":"21-01-28 20:34:13","author":"HanSo1oCup","body":"Correction *It was adapted by the monks in honor of the Holy Prophet 
David*","sequenceNumber":87,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":35,"second":53,"nano":161627000}}},{"id":{"value":"t1_gl5n4sd"},"submission_id":{"id":"t3_l765yl"},"parent_id":{"id":"t3_l765yl"},"distinguished":false,"created_utc":"21-01-28 23:26:23","author":"marktwainbrain","body":"Is this a group that has a ?Western Rite? orthodox parish?","sequenceNumber":88,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":35,"second":53,"nano":161677000}}},{"id":{"value":"t1_gl64il6"},"submission_id":{"id":"t3_l765yl"},"parent_id":{"id":"t1_gl5n4sd"},"distinguished":false,"created_utc":"21-01-29 01:30:08","author":"greetings_traveler2","body":"Is that a thing?","sequenceNumber":89,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":35,"second":53,"nano":161724000}}},{"id":{"value":"t1_gl64vsg"},"submission_id":{"id":"t3_l765yl"},"parent_id":{"id":"t1_gl64il6"},"distinguished":false,"created_utc":"21-01-29 01:32:52","author":"marktwainbrain","body":"Yes, not at all common. An overture to some Latin traditionalists. I think OCA has Western Rite parishes.\n\nETA: https://en.m.wikipedia.org/wiki/Western_Rite_Orthodoxy","sequenceNumber":90,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":35,"second":53,"nano":161770000}}},{"id":{"value":"t1_gl5m4eg"},"submission_id":{"id":"t3_l7aez5"},"parent_id":{"id":"t3_l7aez5"},"distinguished":false,"created_utc":"21-01-28 23:19:33","author":"qed1","body":"There is only one appropriate manuscript to share on such an occasion as this: https://digi.vatlib.it/view/MSS_Vat.lat.9850/0016.\n\n^^^^^^^^(Not ^^^^^^^sure ^^^^^^^why ^^^^^^^there ^^^^^^^are ^^^^^^^two ^^^^^^^threads, ^^^^^^^only ^^^^^^^one ^^^^^^^of ^^^^^^^which ^^^^^^^I\u0027m ^^^^^^^seeing ^^^^^^^at ^^^^^^^a ^^^^^^^given ^^^^^^^time... 
^^^^^^^but ^^^^^^^this ^^^^^^^should ^^^^^^^obviously ^^^^^^^be ^^^^^^^seen ^^^^^^^in ^^^^^^^both!)","sequenceNumber":92,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":35,"second":55,"nano":144469000}}},{"id":{"value":"t1_gl730gk"},"submission_id":{"id":"t3_l7aez5"},"parent_id":{"id":"t1_gl5m4eg"},"distinguished":false,"created_utc":"21-01-29 05:52:40","author":"SheepExplosion","body":"Things that make me long for Merovingian chancery hands. Good thing he had 4 scribes or no one would have ever known what he wrote.","sequenceNumber":93,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":35,"second":55,"nano":144676000}}},{"id":{"value":"t1_gl5ns9x"},"submission_id":{"id":"t3_l7aez5"},"parent_id":{"id":"t1_gl5m4eg"},"distinguished":false,"created_utc":"21-01-28 23:30:47","author":"Kingshorsey","body":"Are you sure that link goes where you want? The title page is nice, but your link goes to a random page in the middle.","sequenceNumber":94,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":35,"second":55,"nano":144774000}}},{"id":{"value":"t1_gl5oqvi"},"submission_id":{"id":"t3_l7aez5"},"parent_id":{"id":"t1_gl5ns9x"},"distinguished":false,"created_utc":"21-01-28 23:37:26","author":"qed1","body":"That was the point, yes. I was aiming to land at a section of Aquinas\u0027s near incomprehensible handwriting. ;)","sequenceNumber":95,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":35,"second":55,"nano":144866000}}},{"id":{"value":"t1_gl5ocsz"},"submission_id":{"id":"t3_l7aez5"},"parent_id":{"id":"t1_gl5m4eg"},"distinguished":false,"created_utc":"21-01-28 23:34:45","author":"EmergencySufficient6","body":"... 
What is *that?* \n\nI thought for sure it\u0027d be his musings upon [the effects of stars on demons](http://www.logicmuseum.com/wiki/Authors/Thomas_Aquinas/Summa_Theologiae/Part_I/Q115#q115a5arg1) or something similar.","sequenceNumber":96,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":35,"second":55,"nano":144970000}}},{"id":{"value":"t1_gl5ovhj"},"submission_id":{"id":"t3_l7aez5"},"parent_id":{"id":"t1_gl5ocsz"},"distinguished":false,"created_utc":"21-01-28 23:38:18","author":"qed1","body":"It\u0027s Aquinas\u0027s totally incomprehensible handwriting.","sequenceNumber":97,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":35,"second":55,"nano":145068000}}},{"id":{"value":"t1_gl5rrc7"},"submission_id":{"id":"t3_l7aez5"},"parent_id":{"id":"t1_gl5ovhj"},"distinguished":false,"created_utc":"21-01-28 23:58:24","author":"EmergencySufficient6","body":"How sure are we that there\u0027s a teleological argument against gays in there?\n\nEdit: Forgive me, downvote brigade, for I have sinned.","sequenceNumber":98,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":35,"second":55,"nano":145174000}}},{"id":{"value":"t1_gl5kbyc"},"submission_id":{"id":"t3_l7aez5"},"parent_id":{"id":"t3_l7aez5"},"distinguished":false,"created_utc":"21-01-28 23:07:37","author":"Kingshorsey","body":"Source: [https://digi.vatlib.it/mss/detail/Urb.lat.136](https://digi.vatlib.it/mss/detail/Urb.lat.136)","sequenceNumber":99,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":35,"second":55,"nano":145265000}}},{"id":{"value":"t1_gl315u1"},"submission_id":{"id":"t3_l6s45e"},"parent_id":{"id":"t3_l6s45e"},"distinguished":false,"created_utc":"21-01-28 14:21:53","author":"joaojcorreia","body":"Hi u/Irene_SaturaLanx, I was able to see part of the session live, really good. Congratulations. I was really happy, because I was able to follow it. 
Gratias.","sequenceNumber":103,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":35,"second":59,"nano":155093000}}},{"id":{"value":"t1_gl3oyaf"},"submission_id":{"id":"t3_l6s45e"},"parent_id":{"id":"t1_gl315u1"},"distinguished":false,"created_utc":"21-01-28 16:28:46","author":"Irene_SaturaLanx","body":"Gratias tibi!","sequenceNumber":104,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":35,"second":59,"nano":155276000}}},{"id":{"value":"t1_gl31nts"},"submission_id":{"id":"t3_l6s45e"},"parent_id":{"id":"t3_l6s45e"},"distinguished":false,"created_utc":"21-01-28 14:25:30","author":"ironicsadboy","body":"Congratulations on your work!","sequenceNumber":105,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":35,"second":59,"nano":155374000}}},{"id":{"value":"t1_gl3oz4e"},"submission_id":{"id":"t3_l6s45e"},"parent_id":{"id":"t1_gl31nts"},"distinguished":false,"created_utc":"21-01-28 16:28:53","author":"Irene_SaturaLanx","body":"Thanks!","sequenceNumber":106,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":35,"second":59,"nano":155466000}}},{"id":{"value":"t1_gl45js7"},"submission_id":{"id":"t3_l6s45e"},"parent_id":{"id":"t3_l6s45e"},"distinguished":false,"created_utc":"21-01-28 17:53:49","author":"logatwork","body":"I\u0027m reading/studying this book! Spoilers ahead!\n\nThank you","sequenceNumber":107,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":35,"second":59,"nano":155556000}}},{"id":{"value":"t1_gl4w10f"},"submission_id":{"id":"t3_l6s45e"},"parent_id":{"id":"t3_l6s45e"},"distinguished":false,"created_utc":"21-01-28 20:42:52","author":"Redbubbles55","body":"hunc librem nunc perlego, sperans posthac linguam latinam modo eius docere. 
gaudeo multum sessionem tuam spectauisse, gratias ueras tibi ago !","sequenceNumber":108,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":35,"second":59,"nano":155643000}}},{"id":{"value":"t1_gl510zz"},"submission_id":{"id":"t3_l6s45e"},"parent_id":{"id":"t3_l6s45e"},"distinguished":false,"created_utc":"21-01-28 21:12:53","author":"Monsieurantipyrine","body":"Truly one (or two) of the best texts out there for Latin students! Very glad I learned from this back in University.","sequenceNumber":109,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":35,"second":59,"nano":155732000}}},{"id":{"value":"t1_gl58d7w"},"submission_id":{"id":"t3_l6s45e"},"parent_id":{"id":"t3_l6s45e"},"distinguished":false,"created_utc":"21-01-28 21:56:31","author":"scriptapuella","body":"My university is grammar focussed, but I teach with this book (alongside the companion) to aid comprehension and get students used to continuous Latin passages early. Evaluations indicate they like it more than Wheelock, at any rate.","sequenceNumber":110,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":35,"second":59,"nano":155832000}}},{"id":{"value":"t1_gl7scfq"},"submission_id":{"id":"t3_l7iwsm"},"parent_id":{"id":"t3_l7iwsm"},"distinguished":false,"created_utc":"21-01-29 10:22:23","author":"kc_kennylau","body":"* You dare to drink wine in my presence?\n\n\"you dare to drink wine\" is indeed \"vinum bibere audes\", but \"ante\" takes the accusative instead of the dative, so it would be \"ante me\" instead of \"ante mi\". Other possible translations of \"in my presence\" include \"coram me\" and \"prae me\".\n\nE.g. Exodus 23:3 Non habebis deos alienos **coram me**. 
\"Thou shalt have no other gods **before me**.\"\n\n\u0026#x200B;\n\n* Because of covid, two million people are dead.\n\n[Latin Wikipedia](https://la.wikipedia.org/wiki/COVID-19) translates COVID-19 as \"morbus coronarii viri anni 2019\" and notes that this is their internal translation only, not found outside Wikipedia. To be safe, I would use COVID-19, but this does not admit cases. I guess in the end I would prefer \"morbus coronarii viri\" which does admit cases.\n\nNote that \"quia\" needs to be followed by a phrase, or put simply, \"quia\" \u003d \"because\" not \"because of\". I would use \"propter\" for \"because of\", or I would just use the ablative.\n\n\"concident\" is the future tense. The perfect tense \"conciderunt\" is more appropriate.\n\nIn conclusion: \"propter morbum coronarii viri, duo milliones homines conciderunt.\" or \"morbo coronarii viri, duo milliones homines conciderunt.\"","sequenceNumber":112,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":36,"second":1,"nano":89793000}}},{"id":{"value":"t1_gl6k27a"},"submission_id":{"id":"t3_l7carp"},"parent_id":{"id":"t3_l7carp"},"distinguished":false,"created_utc":"21-01-29 03:27:06","author":"jayzwasinnirvana","body":"You\u0027ve almost got it - the verb is right, but consul should be in the nominative. It\u0027s not a direct object, but a subject (nominative? maybe someone can chime in with the grammatical term) complement. 
Here\u0027s Livy:\n\n\n\u003eDecembri mense summo patrum studio L.\tQuinctius Cincinnatus, pater Caesonis, **consul creatur**","sequenceNumber":114,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":36,"second":3,"nano":83507000}}},{"id":{"value":"t1_gl6wvd8"},"submission_id":{"id":"t3_l7carp"},"parent_id":{"id":"t1_gl6k27a"},"distinguished":false,"created_utc":"21-01-29 05:03:53","author":"ogorangeduck","body":"It\u0027s an attributive, not a subject.","sequenceNumber":115,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":36,"second":3,"nano":83662000}}},{"id":{"value":"t1_gl6u4mq"},"submission_id":{"id":"t3_l7gja1"},"parent_id":{"id":"t3_l7gja1"},"distinguished":false,"created_utc":"21-01-29 04:42:50","author":"isolde100","body":"Quis ut Deus refers to St. Michael the Archangel - it means ?who is like God?. It?s the literal translation of the Hebrew Michael or Mika?el.","sequenceNumber":117,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":36,"second":5,"nano":86997000}}},{"id":{"value":"t1_gl7941t"},"submission_id":{"id":"t3_l7gja1"},"parent_id":{"id":"t1_gl6u4mq"},"distinguished":false,"created_utc":"21-01-29 06:45:24","author":"LurkinOG","body":"You are right..i looked up old latin versions of the I and one is shaped like what i thought what was a 3 but its a capital that looks like this only L only reversed £..what is that quote significant to..i know when translated to todays english its hard to know the meaning or significance without context..and thank you for sharing your knowledge in figuring this out","sequenceNumber":118,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":36,"second":5,"nano":87152000}}},{"id":{"value":"t1_gl6lb05"},"submission_id":{"id":"t3_l7gja1"},"parent_id":{"id":"t3_l7gja1"},"distinguished":false,"created_utc":"21-01-29 03:36:43","author":"TWFM","body":"Qu3s is apparently an internet 
personality.","sequenceNumber":119,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":36,"second":5,"nano":87249000}}},{"id":{"value":"t1_gl6mzss"},"submission_id":{"id":"t3_l7gja1"},"parent_id":{"id":"t1_gl6lb05"},"distinguished":false,"created_utc":"21-01-29 03:49:30","author":"LurkinOG","body":"I thought so to but old latin Ques can be translated to mean seek/ask its where you end up with english words like ques-tion, ques-t..3 could signify both holy trinity and a internet personality","sequenceNumber":120,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":36,"second":5,"nano":87345000}}},{"id":{"value":"t1_gl88ce7"},"submission_id":{"id":"t3_l7gja1"},"parent_id":{"id":"t3_l7gja1"},"distinguished":false,"created_utc":"21-01-29 13:40:59","author":"BaconJudge","body":"There\u0027s a common Latin scribal abbreviation that resembles a 3, and it can stand in for various letter combinations, such as *-et*. For example, the modern-day abbreviation *viz.* for *videlicet* originated from misinterpreting *vi?* as *viz.* However, it\u0027s normally used at the end of words, and I can\u0027t see what it would represent here, so I mention it mainly to rule it out.","sequenceNumber":121,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":36,"second":5,"nano":87445000}}},{"id":{"value":"t1_gl1hr7e"},"submission_id":{"id":"t3_l6mdp8"},"parent_id":{"id":"t3_l6mdp8"},"distinguished":false,"created_utc":"21-01-28 04:06:11","author":"antinousrex","body":"feel free to point out errors, please!","sequenceNumber":123,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":36,"second":7,"nano":177467000}}},{"id":{"value":"t1_gl1tfor"},"submission_id":{"id":"t3_l6mdp8"},"parent_id":{"id":"t1_gl1hr7e"},"distinguished":false,"created_utc":"21-01-28 05:44:51","author":"Thalionwen20","body":"In section 4, it should be \"undecimum,\" accusative. 
In the last line of section 5, it should be \"et,\" not \"and.\" In 9, you might want to say \"inter se\" instead of \"eorum.\" In 10, it should be \"annulum\" and \"esse\" for indirect speech, or \"annulus delendus\" for a direct quote. In 11, you only need one \"est\" and a \"qui\" before \"nunc.\" In 12, it should be \"mortuos.\" In 15, \"Galadriela\" and \"principe\" are misspelled. In 16, you probably just want \"interficit,\" and I would put another verb such as \"incipit\" with \"iter facere.\"\n\nDespite that, I really enjoyed this and think you did a good job on it! :)","sequenceNumber":124,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":36,"second":7,"nano":177562000}}},{"id":{"value":"t1_gl2y8de"},"submission_id":{"id":"t3_l6mdp8"},"parent_id":{"id":"t1_gl1tfor"},"distinguished":false,"created_utc":"21-01-28 13:57:47","author":"eglwufdeo","body":"Some more I found: \n\nIn section 1 \n\"ignotum eis\" seems sketchy, I would expect an ablative absolute \n\"eius\" should be \"suae\" \n\nIn section 2 \n\"foedum\" should be \"foedus\", having it as the subject seems a bit weird but I\u0027m not sure \n\"miletes\" should be \"milites\" \nI think the second sentence misses a verb \n\"saeculum\" not \"saecula\" \n\nIn section 3 \n\"pro se\" not \"pro sibi\" \n\nIn section 4 \n\"centesimus\" not \"centisimus\" \ndon\u0027t think \"discedere\" takes an accusative like that \n\"faciat\" not \"faceat\" \n\"suo\" not \"eius\" \n\nIn section 5 \n\"certior\" not \"certiorem\" \nYou need either indirect speech or some subjunction, but as it stands the sentence about Gollum\u0027s torture doesn\u0027t work \n\"aperiens verba\" doesn\u0027t work (match number), should be an ablative absolute \n\"discedat\" not \"discedeat\", also see above \n\nIn section 6 \nsame thing with \"certior\" \nthe relative clause needs a verb \n\nIn Section 7 \n\"Sarumano\" should be \"a Sarumano\" \n\"adiuvantur\" not \"adiuntur\" \n\"a venatore\" not 
\"venatori\" \n\nIn section 8 \nDon\u0027t know if \"do\" is the best verb here , maybe \"parare\"? \n\nIn section 9 \n\"curatur\" should probably be plural \n\"suum\" not \"eorum\" \n\nIn section 10 \n\"ambo\" doesn\u0027t work \n\nIn section 11 \nI don\u0027t think \"se voluntare\" is a thing \n\"comitatus\" not \"commitatus\" \n\nIn section 12 \n\"quae\" not \"qui\" \n\nIn section 13 \nThink there\u0027s a verb missing after \"vastum antrum\" \n\"cum se\" not \"cum sibi\" \n\nIn section 14 \n\"devastata\" not \"devastatus\" \n\"capturum\" not \"capturus\" \n\nIn section 15 \nI think \"per\" is the wrong preposition \n\nIn section 16 \n\"suos\" not \"eius\" \ndon\u0027t think \"iurandum\" works like that","sequenceNumber":125,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":36,"second":7,"nano":177599000}}},{"id":{"value":"t1_gl3du83"},"submission_id":{"id":"t3_l6mdp8"},"parent_id":{"id":"t1_gl2y8de"},"distinguished":false,"created_utc":"21-01-28 15:37:05","author":"antinousrex","body":"Thank you so much!","sequenceNumber":126,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":36,"second":7,"nano":177643000}}},{"id":{"value":"t1_gl3dsb3"},"submission_id":{"id":"t3_l6mdp8"},"parent_id":{"id":"t1_gl1tfor"},"distinguished":false,"created_utc":"21-01-28 15:36:49","author":"antinousrex","body":"Thank you so much! 
I do these off the seat of my pants and rely on people like you to catch the errors I miss after staring at my own text for 3 hrs","sequenceNumber":127,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":36,"second":7,"nano":177681000}}},{"id":{"value":"t1_gl27bkf"},"submission_id":{"id":"t3_l6mdp8"},"parent_id":{"id":"t1_gl1hr7e"},"distinguished":false,"created_utc":"21-01-28 08:11:56","author":"Julius_The_Caesar","body":"Happy cake day","sequenceNumber":128,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":36,"second":7,"nano":177721000}}},{"id":{"value":"t1_gl212je"},"submission_id":{"id":"t3_l6mdp8"},"parent_id":{"id":"t3_l6mdp8"},"distinguished":false,"created_utc":"21-01-28 06:59:30","author":"PinkyPiePerson","body":"You wouldn\u0027t happen to have the link to the Shrek one.\n\nAlso Bee Movie next???","sequenceNumber":129,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":36,"second":7,"nano":177763000}}},{"id":{"value":"t1_gl3hymr"},"submission_id":{"id":"t3_l6mdp8"},"parent_id":{"id":"t1_gl212je"},"distinguished":false,"created_utc":"21-01-28 15:56:22","author":"antinousrex","body":"here\u0027s shrek. 
bee movie is in progress [https://docs.google.com/document/d/1-0GY-JqbusyDbOp7aaRJy1q8rY4LSTMQbccTek7673Q/edit?usp\u003dsharing](https://docs.google.com/document/d/1-0GY-JqbusyDbOp7aaRJy1q8rY4LSTMQbccTek7673Q/edit?usp\u003dsharing)","sequenceNumber":130,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":36,"second":7,"nano":177822000}}},{"id":{"value":"t1_gl2ad9g"},"submission_id":{"id":"t3_l6mdp8"},"parent_id":{"id":"t3_l6mdp8"},"distinguished":false,"created_utc":"21-01-28 08:50:55","author":"-Frind-","body":"Is it classical?","sequenceNumber":131,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":36,"second":7,"nano":177859000}}},{"id":{"value":"t1_gl2tdy1"},"submission_id":{"id":"t3_l6mdp8"},"parent_id":{"id":"t1_gl2ad9g"},"distinguished":false,"created_utc":"21-01-28 13:07:27","author":"OperaRotas","body":"Of course Lord of the Rings is a classical\n\n_[jocor]_","sequenceNumber":132,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":36,"second":7,"nano":177893000}}},{"id":{"value":"t1_gl2bz5e"},"submission_id":{"id":"t3_l6mdp8"},"parent_id":{"id":"t3_l6mdp8"},"distinguished":false,"created_utc":"21-01-28 09:12:40","author":"MadScientist2854","body":"happy cake day!","sequenceNumber":133,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":36,"second":7,"nano":177926000}}},{"id":{"value":"t1_gl42d9n"},"submission_id":{"id":"t3_l6mdp8"},"parent_id":{"id":"t3_l6mdp8"},"distinguished":false,"created_utc":"21-01-28 17:37:36","author":"OperaRotas","body":"\u003eSed potestas Annuli Isildurem corrumpit, qui Annulum pro se capit\n\nThis \"pro se\" sounds a bit weird to me, kinda \"he takes the ring for his own sake\". 
Maybe \"sibi\" would work better?\n\nI\u0027m also not sure if this \"capit\" means \"takes\", as in right after cutting Sauron\u0027s finger (in which case it\u0027s fine) or \"keeps\", as in \"keeps the ring longer than he should\", in which case \"tenet\" could be better. Any way I don\u0027t know if \"sibi tenet\" works well.","sequenceNumber":134,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":36,"second":7,"nano":177969000}}},{"id":{"value":"t1_gl2mk6t"},"submission_id":{"id":"t3_l6thod"},"parent_id":{"id":"t3_l6thod"},"distinguished":false,"created_utc":"21-01-28 11:41:58","author":"bandzugfeder","body":"I\u0027ll check with the reference grammars. My guess beforehand (to record my failure for posterity) is that the two genitives would be on either side of the noun,or otherwise separated (eg by a determiner or a case-marked attribute). But I would also guess that it is a comparatively rare occurrence.","sequenceNumber":136,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":36,"second":9,"nano":96392000}}},{"id":{"value":"t1_gl4dsl6"},"submission_id":{"id":"t3_l6thod"},"parent_id":{"id":"t1_gl2mk6t"},"distinguished":false,"created_utc":"21-01-28 18:38:18","author":"j1bb3r1sh","body":"One example I can recall that may apply here, if I understand the question correctly, is from Sallust?s *Bellum Catilinae* chapter 2.3, ?*Quod si regum atque imperatorum animi virtus in pace ita ut in bello valeret*...? if that would help in your research. \n\nI?m not certain if the situation warrants defining it as its own construction, but it was unique enough that I remembered it six months after reading it. I?d be interested to hear if you are able to find anything else on this. 
\n\nAlso apologies for formatting if it is messed up, I am on mobile","sequenceNumber":137,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":36,"second":9,"nano":96530000}}}],"threads":[{"id":{"value":"t3_l5ejhj"},"parent_id":{"value":"t5_2qloa"},"title":"English to Latin translation requests go here!","body":" \n\n1. Ask and answer questions about mottos, tattoos, book titles, lines for your poem, slogans for your bowling club?s t-shirt, etc. in the comments of this thread. **Separate posts for these types of requests will be removed.**\n2. Here are some examples of what types of requests this thread is for: [Example #1](https://www.reddit.com/r/latin/comments/dyqs8p/would_the_correct_translation_of_satans_sister_be/?utm_source\u003dshare\u0026utm_medium\u003dweb2x), [Example #2](https://www.reddit.com/r/latin/comments/dyp18o/translation_from_english/?utm_source\u003dshare\u0026utm_medium\u003dweb2x), [Example #3](https://www.reddit.com/r/latin/comments/dy4o7b/i_need_help_in_translating_correctly_these_2_words/?utm_source\u003dshare\u0026utm_medium\u003dweb2x), [Example #4](https://www.reddit.com/r/latin/comments/dxdzpb/are_there_any_words_that_convey_the_idea_of_a/?utm_source\u003dshare\u0026utm_medium\u003dweb2x), [Example #5](https://www.reddit.com/r/latin/comments/dx5xzc/motto_in_latin/?utm_source\u003dshare\u0026utm_medium\u003dweb2x).\n3. This thread is **not for correcting longer translations and student assignments**. If you have some facility with the Latin language and have made an honest attempt to translate that is **NOT from Google Translate**, Yandex, or any other machine translator, create a separate thread requesting to check and correct your translation: [Separate thread example](https://www.reddit.com/r/latin/comments/dyjz4m/motto_idea_for_motorbike/). Make sure to take a look at Rule 4.\n4. 
[Previous iterations of this thread](https://www.reddit.com/r/latin/search/?q\u003dLatin%20translation%20requests%20here\u0026restrict_sr\u003d1).","url":"https://www.reddit.com/r/latin/comments/l5ejhj/english_to_latin_translation_requests_go_here/","sub":"latin","author":"NasusSyrae","num_comments":80,"created_utc":"21-01-26 15:00:22","sequenceNumber":82,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":35,"second":51,"nano":602455000}}},{"id":{"value":"t3_l765yl"},"parent_id":{"value":"t5_2qloa"},"title":"A hymn dedicated to Saint Nicholas, performed by two monks of Holy Cross Monastery.","body":"","url":"https://v.redd.it/4n18nwz2k4e61","sub":"latin","author":"HanSo1oCup","num_comments":8,"created_utc":"21-01-28 20:30:17","sequenceNumber":91,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":35,"second":53,"nano":161815000}}},{"id":{"value":"t3_l7aez5"},"parent_id":{"value":"t5_2qloa"},"title":"Hodie est Festum S. Thomae de Aquino - Ecce Pagina Illustrata","body":"","url":"https://i.redd.it/nar1o78fc5e61.jpg","sub":"latin","author":"Kingshorsey","num_comments":8,"created_utc":"21-01-28 23:07:28","sequenceNumber":100,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":35,"second":55,"nano":145355000}}},{"id":{"value":"t3_l7isan"},"parent_id":{"value":"t5_2qloa"},"title":"Verbum Diei, die Jovis, A D V KAL FEB, anni AUC MMDCCLXXIV: excaeco","body":"Verbum diei hodie est:\n\nexcaeco, excaecare, excaecvi, excaecatum: to blind, to make blind\n\n1st conjugation verb\n\n*Frequens curatio est uenas in temporibus adurere, quae fere quidem in eiusmodi malo tument: sed tamen, ut inflentur magisque se ostendant, ceruix ante modice deliganda est, tenuibusque ferramentis et retussis uenae adurendae, donec in oculis pituitae cursus conquiescat. 
Id enim signum est quasi excaecatorum itinerum, per quae umor ferebatur*\n\nCelsus, *de Medicina*, 7.7","url":"https://www.reddit.com/r/latin/comments/l7isan/verbum_diei_die_jovis_a_d_v_kal_feb_anni_auc/","sub":"latin","author":"Glofkill","num_comments":0,"created_utc":"21-01-29 05:10:12","sequenceNumber":101,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":35,"second":56,"nano":63285000}}},{"id":{"value":"t3_l7sduu"},"parent_id":{"value":"t5_2qloa"},"title":"I can?t understand this sub sometimes.","body":"I?m feeling lost when it comes to the natural method vs grammar method as laid out on this sub as the best way to learn and understand Latin. I am using LLPSI to learn Latin. Everyone posts about the subjunctive and the perfect and all the other what I believe are grammar rules or modes etc, and I?m over here thinking that none of that is In LLPSI. I feel like there is a whole world of information that I?m not getting because while I know that est is for singular and sunt is for plural, I have no idea what anything else is. I can parse our meaning from reading and context clues, but when or how should I learn to get into the grammar? Right now it?s all story and vocab- I know what I?m reading but I have no idea why it goes its changing in spelling etc. is this normal for reading LLPSI?","url":"https://www.reddit.com/r/latin/comments/l7sduu/i_cant_understand_this_sub_sometimes/","sub":"latin","author":"Squeeks0","num_comments":0,"created_utc":"21-01-29 14:22:11","sequenceNumber":102,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":35,"second":57,"nano":85717000}}},{"id":{"value":"t3_l6s45e"},"parent_id":{"value":"t5_2qloa"},"title":"If you want to see how a lesson with \"Familia Romana\" works, you can now watch the first lesson on my YouTube channel! ? 
I can\u0027t recommend that book more to anyone who decides to learn Latin, be it with a teacher or alone.","body":"","url":"https://www.youtube.com/watch?v\u003dwCO_McKXEzA\u0026lc\u003dUgwFAsByoLpr_pnXNqZ4AaABAg.9IykSUZhPwo9J0N2_fqeL2\u0026ab_channel\u003dSaturaLanx","sub":"latin","author":"Irene_SaturaLanx","num_comments":8,"created_utc":"21-01-28 09:50:50","sequenceNumber":111,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":35,"second":59,"nano":155917000}}},{"id":{"value":"t3_l7iwsm"},"parent_id":{"value":"t5_2qloa"},"title":"I like to do random translations; could someone check these and point out the errors?","body":"Vinum bibere audes ante mi?\n\nYou dare to drink wine in my presence?\n\nQuia coronavirus-morbus, duo milliones homines concident.\n\nBecause of covid, two million people are dead.\n\nI know these will probably be full of errors, can someone check them pwease.","url":"https://www.reddit.com/r/latin/comments/l7iwsm/i_like_to_do_random_translations_could_someone/","sub":"latin","author":"seaweedWorkers","num_comments":1,"created_utc":"21-01-29 05:16:07","sequenceNumber":113,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":36,"second":1,"nano":90014000}}},{"id":{"value":"t3_l7carp"},"parent_id":{"value":"t5_2qloa"},"title":"Passive translation","body":"How would you say this in latin: The man is elected consul. \n\nIs elected in the passive? but it doesn?t take a direct object\n\nVir consulem creatur?","url":"https://www.reddit.com/r/latin/comments/l7carp/passive_translation/","sub":"latin","author":"SnooDoggos8723","num_comments":2,"created_utc":"21-01-29 00:23:35","sequenceNumber":116,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":36,"second":3,"nano":83765000}}},{"id":{"value":"t3_l7gja1"},"parent_id":{"value":"t5_2qloa"},"title":"can someone translate. 
\"Qu3s Ut Deus\" i saw it tattooed on a stranger and made me wonder if 3 was just the way some use 3 in place of E and that was a personal choice or am i missing hidden meaning. Ques Ut Deus, would that not translate roughly to who is god. Am i missing something?","body":"","url":"https://www.reddit.com/r/latin/comments/l7gja1/can_someone_translate_qu3s_ut_deus_i_saw_it/","sub":"latin","author":"LurkinOG","num_comments":5,"created_utc":"21-01-29 03:27:25","sequenceNumber":122,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":36,"second":5,"nano":87535000}}},{"id":{"value":"t3_l6mdp8"},"parent_id":{"value":"t5_2qloa"},"title":"From the guy who brought you Phineas and Ferb \u0026 Shrek, here\u0027s the story of Lord of the Rings, The Fellowship of the Ring, in Latin!","body":"[https://docs.google.com/document/d/1gZ2lLzlrOuzxNWyNHczDNonVRlMAvwfIT--W3xLMTzk/edit?usp\u003dsharing](https://docs.google.com/document/d/1gZ2lLzlrOuzxNWyNHczDNonVRlMAvwfIT--W3xLMTzk/edit?usp\u003dsharing)","url":"https://www.reddit.com/r/latin/comments/l6mdp8/from_the_guy_who_brought_you_phineas_and_ferb/","sub":"latin","author":"antinousrex","num_comments":12,"created_utc":"21-01-28 04:05:36","sequenceNumber":135,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":36,"second":7,"nano":178008000}}},{"id":{"value":"t3_l6thod"},"parent_id":{"value":"t5_2qloa"},"title":"Is \"double genitive\" possible in Latin?","body":"There are verbs that take double accusative (e.g. doce?) and there is also double dative (e.g. cu? bon?). These duplicated cases take on different meanings, e.g. 
in the double dative, [one dative is the dative of purpose and the other dative is the dative of reference](http://dcc.dickinson.edu/grammar/latin/dative-purpose).\n\nIs it possible to have double genitive for a noun that can take on two genitives out of (a) the objective genitive, (b) the partitive genitive, and (c) the genitive of possession?\n\nFor example, odium *barbar?rum* **civilizati?nis**, where *barbar?rum* is the genitive of possession and **civilizati?nis** is the objective genitive.\n\nAre such formations attested?","url":"https://www.reddit.com/r/latin/comments/l6thod/is_double_genitive_possible_in_latin/","sub":"latin","author":"kc_kennylau","num_comments":2,"created_utc":"21-01-28 11:25:02","sequenceNumber":138,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":36,"second":9,"nano":96574000}}}],"subreddits":[{"name":"latin","title":"The Latin Language","id":{"value":"t5_2qloa"},"description":"This is a community for discussions related to the Latin language.","sequenceNumber":139,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":36,"second":10,"nano":75346000}}}],"top":[{"sequenceNumber":1,"crawlTime":{"date":{"year":2021,"month":1,"day":29},"time":{"hour":14,"minute":35,"second":47,"nano":893685000}}}]} diff --git a/marginalia_nu/build.gradle b/other/memex/build.gradle similarity index 94% rename from marginalia_nu/build.gradle rename to other/memex/build.gradle index 638c1e30..65460946 100644 --- a/marginalia_nu/build.gradle +++ b/other/memex/build.gradle @@ -59,10 +59,16 @@ jmhJar { zip64 true } dependencies { - implementation project(':third_party') + implementation project(':third-party') implementation project(':protocol') + implementation project(':common:service') + implementation project(':common:config') + implementation project(':libraries:misc') + implementation project(':common:service-discovery') + implementation project(':common:service-client') implementation 
'org.projectlombok:lombok:1.18.24' + implementation 'org.jetbrains:annotations:20.1.0' annotationProcessor 'org.projectlombok:lombok:1.18.24' implementation 'com.github.jknack:handlebars:4.3.1' @@ -85,12 +91,10 @@ dependencies { implementation 'com.github.jnr:jnr-ffi:2.2.12' implementation 'org.apache.httpcomponents:httpcore:4.4.15' implementation 'org.apache.httpcomponents:httpclient:4.5.13' - implementation 'com.github.ThatJavaNerd:JRAW:1.1.0' implementation group: 'com.h2database', name: 'h2', version: '2.1.210' implementation 'org.jsoup:jsoup:1.15.3' - implementation group: 'com.github.crawler-commons', name: 'crawler-commons', version: '1.2' implementation 'org.mariadb.jdbc:mariadb-java-client:3.0.6' implementation group: 'net.sf.trove4j', name: 'trove4j', version: '3.0.3' @@ -120,15 +124,9 @@ dependencies { implementation 'org.eclipse.jgit:org.eclipse.jgit.ssh.jsch:5.12.0.202106070339-r' implementation 'com.jcraft:jsch:0.1.55' - implementation group: 'org.apache.commons', name: 'commons-compress', version: '1.21' - implementation 'edu.stanford.nlp:stanford-corenlp:4.4.0' - implementation group: 'it.unimi.dsi', name: 'fastutil', version: '8.5.8' implementation 'org.roaringbitmap:RoaringBitmap:0.9.32' - implementation group: 'mysql', name: 'mysql-connector-java', version: '8.0.29' - implementation 'com.github.Marcono1234:gson-record-type-adapter-factory:0.2.0' - testImplementation 'org.junit.jupiter:junit-jupiter-api:5.8.2' testImplementation 'org.mockito:mockito-junit-jupiter:4.5.1' testRuntimeOnly 'org.junit.jupiter:junit-jupiter-engine' diff --git a/marginalia_nu/lombok.config b/other/memex/lombok.config similarity index 100% rename from marginalia_nu/lombok.config rename to other/memex/lombok.config diff --git a/other/memex/src/main/java/nu/marginalia/memex/MemexServiceDescriptors.java b/other/memex/src/main/java/nu/marginalia/memex/MemexServiceDescriptors.java new file mode 100644 index 00000000..25ef5662 --- /dev/null +++ 
b/other/memex/src/main/java/nu/marginalia/memex/MemexServiceDescriptors.java @@ -0,0 +1,15 @@ +package nu.marginalia.memex; + +import nu.marginalia.memex.auth.AuthMain; +import nu.marginalia.service.descriptor.ServiceDescriptor; +import nu.marginalia.service.descriptor.ServiceDescriptors; +import nu.marginalia.service.id.ServiceId; + +import java.util.List; + +public class MemexServiceDescriptors { + public static ServiceDescriptors descriptors = new ServiceDescriptors( + List.of( + new ServiceDescriptor(ServiceId.Other_Memex, 5030), + new ServiceDescriptor (ServiceId.Other_Auth, 5003))); +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/auth/AuthConfigurationModule.java b/other/memex/src/main/java/nu/marginalia/memex/auth/AuthConfigurationModule.java similarity index 69% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/auth/AuthConfigurationModule.java rename to other/memex/src/main/java/nu/marginalia/memex/auth/AuthConfigurationModule.java index 3a6772ae..e0ad33f5 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/auth/AuthConfigurationModule.java +++ b/other/memex/src/main/java/nu/marginalia/memex/auth/AuthConfigurationModule.java @@ -1,12 +1,14 @@ -package nu.marginalia.wmsa.auth; +package nu.marginalia.memex.auth; import com.google.inject.AbstractModule; import com.google.inject.name.Names; +import nu.marginalia.service.descriptor.HostsFile; import java.nio.file.Path; public class AuthConfigurationModule extends AbstractModule { public void configure() { bind(Path.class).annotatedWith(Names.named("password-file")).toInstance(Path.of("/var/lib/wmsa/password.dat")); + bind(HostsFile.class).toInstance(new HostsFile()); } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/auth/AuthMain.java b/other/memex/src/main/java/nu/marginalia/memex/auth/AuthMain.java similarity index 52% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/auth/AuthMain.java rename to 
other/memex/src/main/java/nu/marginalia/memex/auth/AuthMain.java index bb408581..e997d777 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/auth/AuthMain.java +++ b/other/memex/src/main/java/nu/marginalia/memex/auth/AuthMain.java @@ -1,14 +1,13 @@ -package nu.marginalia.wmsa.auth; +package nu.marginalia.memex.auth; import com.google.inject.Guice; import com.google.inject.Inject; import com.google.inject.Injector; -import nu.marginalia.wmsa.configuration.MainClass; -import nu.marginalia.wmsa.configuration.ServiceDescriptor; -import nu.marginalia.wmsa.configuration.module.ConfigurationModule; -import nu.marginalia.wmsa.configuration.server.Initialization; - -import java.io.IOException; +import nu.marginalia.memex.MemexServiceDescriptors; +import nu.marginalia.service.MainClass; +import nu.marginalia.service.id.ServiceId; +import nu.marginalia.service.module.ConfigurationModule; +import nu.marginalia.service.server.Initialization; public class AuthMain extends MainClass { @@ -17,11 +16,11 @@ public class AuthMain extends MainClass { } public static void main(String... 
args) { - init(ServiceDescriptor.AUTH, args); + MainClass.init(ServiceId.Other_Auth, args); Injector injector = Guice.createInjector( new AuthConfigurationModule(), - new ConfigurationModule()); + new ConfigurationModule(MemexServiceDescriptors.descriptors, ServiceId.Other_Auth)); injector.getInstance(AuthMain.class); injector.getInstance(Initialization.class).setReady(); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/auth/AuthService.java b/other/memex/src/main/java/nu/marginalia/memex/auth/AuthService.java similarity index 89% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/auth/AuthService.java rename to other/memex/src/main/java/nu/marginalia/memex/auth/AuthService.java index 4c93db95..6f9ef59d 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/auth/AuthService.java +++ b/other/memex/src/main/java/nu/marginalia/memex/auth/AuthService.java @@ -1,11 +1,15 @@ -package nu.marginalia.wmsa.auth; +package nu.marginalia.memex.auth; import com.google.inject.Inject; import com.google.inject.name.Named; -import nu.marginalia.wmsa.auth.model.LoginFormModel; -import nu.marginalia.wmsa.configuration.server.*; -import nu.marginalia.wmsa.renderer.mustache.MustacheRenderer; -import nu.marginalia.wmsa.renderer.mustache.RendererFactory; +import nu.marginalia.client.Context; +import nu.marginalia.memex.auth.model.LoginFormModel; +import nu.marginalia.memex.renderer.MustacheRenderer; +import nu.marginalia.memex.renderer.RendererFactory; +import nu.marginalia.service.server.Initialization; +import nu.marginalia.service.server.MetricsServer; +import nu.marginalia.service.server.RateLimiter; +import nu.marginalia.service.server.Service; import org.apache.http.HttpStatus; import org.slf4j.Logger; import org.slf4j.LoggerFactory; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/auth/client/AuthClient.java b/other/memex/src/main/java/nu/marginalia/memex/auth/client/AuthClient.java similarity index 65% rename from 
marginalia_nu/src/main/java/nu/marginalia/wmsa/auth/client/AuthClient.java rename to other/memex/src/main/java/nu/marginalia/memex/auth/client/AuthClient.java index 81cc95ba..f2d68667 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/auth/client/AuthClient.java +++ b/other/memex/src/main/java/nu/marginalia/memex/auth/client/AuthClient.java @@ -1,24 +1,27 @@ -package nu.marginalia.wmsa.auth.client; +package nu.marginalia.memex.auth.client; +import com.google.gson.GsonBuilder; import com.google.inject.Inject; import io.reactivex.rxjava3.core.Observable; -import kotlin.text.Charsets; -import nu.marginalia.wmsa.client.AbstractDynamicClient; -import nu.marginalia.wmsa.configuration.ServiceDescriptor; -import nu.marginalia.wmsa.configuration.server.Context; +import nu.marginalia.WmsaHome; +import nu.marginalia.client.AbstractDynamicClient; +import nu.marginalia.client.Context; +import nu.marginalia.service.descriptor.ServiceDescriptors; +import nu.marginalia.service.id.ServiceId; import org.apache.http.HttpStatus; import spark.Request; import spark.Response; import spark.Spark; import java.net.URLEncoder; +import java.nio.charset.StandardCharsets; import java.util.concurrent.TimeUnit; public class AuthClient extends AbstractDynamicClient { @Inject - public AuthClient() { - super(ServiceDescriptor.AUTH); + public AuthClient(ServiceDescriptors descriptors) { + super(descriptors.forId(ServiceId.Other_Auth), WmsaHome.getHostsFile(), new GsonBuilder()::create); } public Observable isLoggedIn(Context ctx) { @@ -28,7 +31,7 @@ public class AuthClient extends AbstractDynamicClient { public void redirectToLoginIfUnauthenticated(String domain, Request req, Response rsp) { if (!isLoggedIn(Context.fromRequest(req)).timeout(1, TimeUnit.SECONDS).blockingFirst()) { rsp.redirect(req.headers("X-Extern-Domain") + "/auth/login?service="+domain - +"&redirect="+ URLEncoder.encode(req.headers("X-Extern-Url"), Charsets.UTF_8)); + +"&redirect="+ 
URLEncoder.encode(req.headers("X-Extern-Url"), StandardCharsets.UTF_8)); Spark.halt(); } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/auth/model/LoginFormModel.java b/other/memex/src/main/java/nu/marginalia/memex/auth/model/LoginFormModel.java similarity index 82% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/auth/model/LoginFormModel.java rename to other/memex/src/main/java/nu/marginalia/memex/auth/model/LoginFormModel.java index 09029161..f31876e0 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/auth/model/LoginFormModel.java +++ b/other/memex/src/main/java/nu/marginalia/memex/auth/model/LoginFormModel.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.auth.model; +package nu.marginalia.memex.auth.model; import lombok.AllArgsConstructor; import lombok.Getter; diff --git a/marginalia_nu/src/main/java/nu/marginalia/gemini/BadBotList.java b/other/memex/src/main/java/nu/marginalia/memex/gemini/BadBotList.java similarity index 96% rename from marginalia_nu/src/main/java/nu/marginalia/gemini/BadBotList.java rename to other/memex/src/main/java/nu/marginalia/memex/gemini/BadBotList.java index 6f759cff..2b879a10 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/gemini/BadBotList.java +++ b/other/memex/src/main/java/nu/marginalia/memex/gemini/BadBotList.java @@ -1,4 +1,4 @@ -package nu.marginalia.gemini; +package nu.marginalia.memex.gemini; import org.slf4j.Logger; import org.slf4j.LoggerFactory; diff --git a/marginalia_nu/src/main/java/nu/marginalia/gemini/GeminiConfigurationModule.java b/other/memex/src/main/java/nu/marginalia/memex/gemini/GeminiConfigurationModule.java similarity index 95% rename from marginalia_nu/src/main/java/nu/marginalia/gemini/GeminiConfigurationModule.java rename to other/memex/src/main/java/nu/marginalia/memex/gemini/GeminiConfigurationModule.java index 2e6cda6b..2d269332 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/gemini/GeminiConfigurationModule.java +++ 
b/other/memex/src/main/java/nu/marginalia/memex/gemini/GeminiConfigurationModule.java @@ -1,4 +1,4 @@ -package nu.marginalia.gemini; +package nu.marginalia.memex.gemini; import com.google.inject.AbstractModule; import com.google.inject.name.Names; diff --git a/marginalia_nu/src/main/java/nu/marginalia/gemini/GeminiService.java b/other/memex/src/main/java/nu/marginalia/memex/gemini/GeminiService.java similarity index 72% rename from marginalia_nu/src/main/java/nu/marginalia/gemini/GeminiService.java rename to other/memex/src/main/java/nu/marginalia/memex/gemini/GeminiService.java index 7650c1a2..d5c6db9c 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/gemini/GeminiService.java +++ b/other/memex/src/main/java/nu/marginalia/memex/gemini/GeminiService.java @@ -1,4 +1,4 @@ -package nu.marginalia.gemini; +package nu.marginalia.memex.gemini; public interface GeminiService { String DEFAULT_FILENAME = "index.gmi"; diff --git a/marginalia_nu/src/main/java/nu/marginalia/gemini/GeminiServiceDummy.java b/other/memex/src/main/java/nu/marginalia/memex/gemini/GeminiServiceDummy.java similarity index 81% rename from marginalia_nu/src/main/java/nu/marginalia/gemini/GeminiServiceDummy.java rename to other/memex/src/main/java/nu/marginalia/memex/gemini/GeminiServiceDummy.java index 81586f31..33fcffb2 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/gemini/GeminiServiceDummy.java +++ b/other/memex/src/main/java/nu/marginalia/memex/gemini/GeminiServiceDummy.java @@ -1,4 +1,4 @@ -package nu.marginalia.gemini; +package nu.marginalia.memex.gemini; import com.google.inject.Singleton; diff --git a/marginalia_nu/src/main/java/nu/marginalia/gemini/GeminiServiceImpl.java b/other/memex/src/main/java/nu/marginalia/memex/gemini/GeminiServiceImpl.java similarity index 92% rename from marginalia_nu/src/main/java/nu/marginalia/gemini/GeminiServiceImpl.java rename to other/memex/src/main/java/nu/marginalia/memex/gemini/GeminiServiceImpl.java index 0381be48..27b956d9 100644 --- 
a/marginalia_nu/src/main/java/nu/marginalia/gemini/GeminiServiceImpl.java +++ b/other/memex/src/main/java/nu/marginalia/memex/gemini/GeminiServiceImpl.java @@ -1,15 +1,15 @@ -package nu.marginalia.gemini; +package nu.marginalia.memex.gemini; import com.google.inject.Inject; import com.google.inject.Singleton; import com.google.inject.name.Named; -import nu.marginalia.gemini.io.GeminiConnection; -import nu.marginalia.gemini.io.GeminiSSLSetUp; -import nu.marginalia.gemini.io.GeminiStatusCode; -import nu.marginalia.gemini.io.GeminiUserException; -import nu.marginalia.gemini.plugins.BareStaticPagePlugin; -import nu.marginalia.gemini.plugins.Plugin; -import nu.marginalia.gemini.plugins.SearchPlugin; +import nu.marginalia.memex.gemini.io.GeminiConnection; +import nu.marginalia.memex.gemini.io.GeminiSSLSetUp; +import nu.marginalia.memex.gemini.io.GeminiStatusCode; +import nu.marginalia.memex.gemini.io.GeminiUserException; +import nu.marginalia.memex.gemini.plugins.BareStaticPagePlugin; +import nu.marginalia.memex.gemini.plugins.Plugin; +import nu.marginalia.memex.gemini.plugins.SearchPlugin; import org.slf4j.Logger; import org.slf4j.LoggerFactory; diff --git a/marginalia_nu/src/main/java/nu/marginalia/gemini/client/GeminiClient.java b/other/memex/src/main/java/nu/marginalia/memex/gemini/client/GeminiClient.java similarity index 98% rename from marginalia_nu/src/main/java/nu/marginalia/gemini/client/GeminiClient.java rename to other/memex/src/main/java/nu/marginalia/memex/gemini/client/GeminiClient.java index e306e88f..27d2a2a9 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/gemini/client/GeminiClient.java +++ b/other/memex/src/main/java/nu/marginalia/memex/gemini/client/GeminiClient.java @@ -1,4 +1,4 @@ -package nu.marginalia.gemini.client; +package nu.marginalia.memex.gemini.client; import javax.net.ssl.SSLContext; import javax.net.ssl.SSLSocketFactory; diff --git a/marginalia_nu/src/main/java/nu/marginalia/gemini/gmi/Gemtext.java 
b/other/memex/src/main/java/nu/marginalia/memex/gemini/gmi/Gemtext.java similarity index 78% rename from marginalia_nu/src/main/java/nu/marginalia/gemini/gmi/Gemtext.java rename to other/memex/src/main/java/nu/marginalia/memex/gemini/gmi/Gemtext.java index 3b07f4cc..692b65ce 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/gemini/gmi/Gemtext.java +++ b/other/memex/src/main/java/nu/marginalia/memex/gemini/gmi/Gemtext.java @@ -1,11 +1,11 @@ -package nu.marginalia.gemini.gmi; +package nu.marginalia.memex.gemini.gmi; import lombok.Getter; -import nu.marginalia.gemini.gmi.line.AbstractGemtextLine; -import nu.marginalia.gemini.gmi.parser.GemtextParser; -import nu.marginalia.gemini.gmi.renderer.GemtextRenderer; -import nu.marginalia.wmsa.memex.model.MemexNodeHeadingId; -import nu.marginalia.wmsa.memex.model.MemexNodeUrl; +import nu.marginalia.memex.gemini.gmi.line.AbstractGemtextLine; +import nu.marginalia.memex.gemini.gmi.parser.GemtextParser; +import nu.marginalia.memex.gemini.gmi.renderer.GemtextRenderer; +import nu.marginalia.memex.memex.model.MemexNodeHeadingId; +import nu.marginalia.memex.memex.model.MemexNodeUrl; import java.io.IOException; import java.io.Writer; diff --git a/marginalia_nu/src/main/java/nu/marginalia/gemini/gmi/GemtextDatabase.java b/other/memex/src/main/java/nu/marginalia/memex/gemini/gmi/GemtextDatabase.java similarity index 87% rename from marginalia_nu/src/main/java/nu/marginalia/gemini/gmi/GemtextDatabase.java rename to other/memex/src/main/java/nu/marginalia/memex/gemini/gmi/GemtextDatabase.java index 33d33dd9..2beb1772 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/gemini/gmi/GemtextDatabase.java +++ b/other/memex/src/main/java/nu/marginalia/memex/gemini/gmi/GemtextDatabase.java @@ -1,10 +1,10 @@ -package nu.marginalia.gemini.gmi; +package nu.marginalia.memex.gemini.gmi; import com.google.common.collect.Sets; -import nu.marginalia.gemini.gmi.line.GemtextLineVisitorAdapter; -import nu.marginalia.gemini.gmi.line.GemtextLink; 
-import nu.marginalia.wmsa.memex.model.MemexNodeUrl; -import nu.marginalia.wmsa.memex.model.MemexUrl; +import nu.marginalia.memex.gemini.gmi.line.GemtextLineVisitorAdapter; +import nu.marginalia.memex.gemini.gmi.line.GemtextLink; +import nu.marginalia.memex.memex.model.MemexNodeUrl; +import nu.marginalia.memex.memex.model.MemexUrl; import java.io.IOException; import java.nio.file.Files; diff --git a/marginalia_nu/src/main/java/nu/marginalia/gemini/gmi/GemtextDocument.java b/other/memex/src/main/java/nu/marginalia/memex/gemini/gmi/GemtextDocument.java similarity index 92% rename from marginalia_nu/src/main/java/nu/marginalia/gemini/gmi/GemtextDocument.java rename to other/memex/src/main/java/nu/marginalia/memex/gemini/gmi/GemtextDocument.java index 1eeed2c8..8e347c6f 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/gemini/gmi/GemtextDocument.java +++ b/other/memex/src/main/java/nu/marginalia/memex/gemini/gmi/GemtextDocument.java @@ -1,13 +1,13 @@ -package nu.marginalia.gemini.gmi; +package nu.marginalia.memex.gemini.gmi; import lombok.Getter; -import nu.marginalia.gemini.gmi.line.*; -import nu.marginalia.gemini.gmi.renderer.GemtextRenderer; -import nu.marginalia.gemini.gmi.renderer.GemtextRendererFactory; -import nu.marginalia.wmsa.memex.model.MemexNodeHeadingId; -import nu.marginalia.wmsa.memex.model.MemexNodeTaskId; -import nu.marginalia.wmsa.memex.model.MemexNodeUrl; -import nu.marginalia.wmsa.memex.model.MemexTaskState; +import nu.marginalia.memex.gemini.gmi.renderer.GemtextRenderer; +import nu.marginalia.memex.gemini.gmi.renderer.GemtextRendererFactory; +import nu.marginalia.memex.gemini.gmi.line.*; +import nu.marginalia.memex.memex.model.MemexNodeHeadingId; +import nu.marginalia.memex.memex.model.MemexNodeTaskId; +import nu.marginalia.memex.memex.model.MemexNodeUrl; +import nu.marginalia.memex.memex.model.MemexTaskState; import org.apache.commons.lang3.tuple.Pair; import java.io.IOException; diff --git 
a/marginalia_nu/src/main/java/nu/marginalia/gemini/gmi/line/AbstractGemtextLine.java b/other/memex/src/main/java/nu/marginalia/memex/gemini/gmi/line/AbstractGemtextLine.java similarity index 84% rename from marginalia_nu/src/main/java/nu/marginalia/gemini/gmi/line/AbstractGemtextLine.java rename to other/memex/src/main/java/nu/marginalia/memex/gemini/gmi/line/AbstractGemtextLine.java index f1307b9b..21e0bf4d 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/gemini/gmi/line/AbstractGemtextLine.java +++ b/other/memex/src/main/java/nu/marginalia/memex/gemini/gmi/line/AbstractGemtextLine.java @@ -1,6 +1,6 @@ -package nu.marginalia.gemini.gmi.line; +package nu.marginalia.memex.gemini.gmi.line; -import nu.marginalia.wmsa.memex.model.MemexNodeHeadingId; +import nu.marginalia.memex.memex.model.MemexNodeHeadingId; import java.util.Optional; import java.util.function.Function; diff --git a/marginalia_nu/src/main/java/nu/marginalia/gemini/gmi/line/GemtextAside.java b/other/memex/src/main/java/nu/marginalia/memex/gemini/gmi/line/GemtextAside.java similarity index 80% rename from marginalia_nu/src/main/java/nu/marginalia/gemini/gmi/line/GemtextAside.java rename to other/memex/src/main/java/nu/marginalia/memex/gemini/gmi/line/GemtextAside.java index ef73accc..9c0b5632 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/gemini/gmi/line/GemtextAside.java +++ b/other/memex/src/main/java/nu/marginalia/memex/gemini/gmi/line/GemtextAside.java @@ -1,9 +1,9 @@ -package nu.marginalia.gemini.gmi.line; +package nu.marginalia.memex.gemini.gmi.line; import lombok.AllArgsConstructor; import lombok.Getter; import lombok.ToString; -import nu.marginalia.wmsa.memex.model.MemexNodeHeadingId; +import nu.marginalia.memex.memex.model.MemexNodeHeadingId; @AllArgsConstructor @Getter @ToString public class GemtextAside extends AbstractGemtextLine { diff --git a/marginalia_nu/src/main/java/nu/marginalia/gemini/gmi/line/GemtextHeading.java 
b/other/memex/src/main/java/nu/marginalia/memex/gemini/gmi/line/GemtextHeading.java similarity index 86% rename from marginalia_nu/src/main/java/nu/marginalia/gemini/gmi/line/GemtextHeading.java rename to other/memex/src/main/java/nu/marginalia/memex/gemini/gmi/line/GemtextHeading.java index a2c9f309..d969bc95 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/gemini/gmi/line/GemtextHeading.java +++ b/other/memex/src/main/java/nu/marginalia/memex/gemini/gmi/line/GemtextHeading.java @@ -1,10 +1,10 @@ -package nu.marginalia.gemini.gmi.line; +package nu.marginalia.memex.gemini.gmi.line; import lombok.AllArgsConstructor; import lombok.Getter; import lombok.ToString; -import nu.marginalia.wmsa.memex.model.MemexNodeHeadingId; +import nu.marginalia.memex.memex.model.MemexNodeHeadingId; import java.util.Optional; import java.util.function.Function; diff --git a/marginalia_nu/src/main/java/nu/marginalia/gemini/gmi/line/GemtextLineVisitor.java b/other/memex/src/main/java/nu/marginalia/memex/gemini/gmi/line/GemtextLineVisitor.java similarity index 90% rename from marginalia_nu/src/main/java/nu/marginalia/gemini/gmi/line/GemtextLineVisitor.java rename to other/memex/src/main/java/nu/marginalia/memex/gemini/gmi/line/GemtextLineVisitor.java index 219267ca..ef3cb97a 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/gemini/gmi/line/GemtextLineVisitor.java +++ b/other/memex/src/main/java/nu/marginalia/memex/gemini/gmi/line/GemtextLineVisitor.java @@ -1,4 +1,4 @@ -package nu.marginalia.gemini.gmi.line; +package nu.marginalia.memex.gemini.gmi.line; public interface GemtextLineVisitor { default T take(AbstractGemtextLine line) { diff --git a/marginalia_nu/src/main/java/nu/marginalia/gemini/gmi/line/GemtextLineVisitorAdapter.java b/other/memex/src/main/java/nu/marginalia/memex/gemini/gmi/line/GemtextLineVisitorAdapter.java similarity index 95% rename from marginalia_nu/src/main/java/nu/marginalia/gemini/gmi/line/GemtextLineVisitorAdapter.java rename to 
other/memex/src/main/java/nu/marginalia/memex/gemini/gmi/line/GemtextLineVisitorAdapter.java index cb0a7544..8e00aae3 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/gemini/gmi/line/GemtextLineVisitorAdapter.java +++ b/other/memex/src/main/java/nu/marginalia/memex/gemini/gmi/line/GemtextLineVisitorAdapter.java @@ -1,4 +1,4 @@ -package nu.marginalia.gemini.gmi.line; +package nu.marginalia.memex.gemini.gmi.line; public class GemtextLineVisitorAdapter implements GemtextLineVisitor { @Override diff --git a/marginalia_nu/src/main/java/nu/marginalia/gemini/gmi/line/GemtextLink.java b/other/memex/src/main/java/nu/marginalia/memex/gemini/gmi/line/GemtextLink.java similarity index 82% rename from marginalia_nu/src/main/java/nu/marginalia/gemini/gmi/line/GemtextLink.java rename to other/memex/src/main/java/nu/marginalia/memex/gemini/gmi/line/GemtextLink.java index 27aa1a5c..f4359946 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/gemini/gmi/line/GemtextLink.java +++ b/other/memex/src/main/java/nu/marginalia/memex/gemini/gmi/line/GemtextLink.java @@ -1,10 +1,10 @@ -package nu.marginalia.gemini.gmi.line; +package nu.marginalia.memex.gemini.gmi.line; import lombok.AllArgsConstructor; import lombok.Getter; import lombok.ToString; -import nu.marginalia.wmsa.memex.model.MemexNodeHeadingId; -import nu.marginalia.wmsa.memex.model.MemexUrl; +import nu.marginalia.memex.memex.model.MemexNodeHeadingId; +import nu.marginalia.memex.memex.model.MemexUrl; import javax.annotation.Nullable; import java.util.Optional; diff --git a/marginalia_nu/src/main/java/nu/marginalia/gemini/gmi/line/GemtextList.java b/other/memex/src/main/java/nu/marginalia/memex/gemini/gmi/line/GemtextList.java similarity index 81% rename from marginalia_nu/src/main/java/nu/marginalia/gemini/gmi/line/GemtextList.java rename to other/memex/src/main/java/nu/marginalia/memex/gemini/gmi/line/GemtextList.java index c06c1e6a..903b4f63 100644 --- 
a/marginalia_nu/src/main/java/nu/marginalia/gemini/gmi/line/GemtextList.java +++ b/other/memex/src/main/java/nu/marginalia/memex/gemini/gmi/line/GemtextList.java @@ -1,9 +1,9 @@ -package nu.marginalia.gemini.gmi.line; +package nu.marginalia.memex.gemini.gmi.line; import lombok.AllArgsConstructor; import lombok.Getter; import lombok.ToString; -import nu.marginalia.wmsa.memex.model.MemexNodeHeadingId; +import nu.marginalia.memex.memex.model.MemexNodeHeadingId; import java.util.List; diff --git a/marginalia_nu/src/main/java/nu/marginalia/gemini/gmi/line/GemtextPragma.java b/other/memex/src/main/java/nu/marginalia/memex/gemini/gmi/line/GemtextPragma.java similarity index 80% rename from marginalia_nu/src/main/java/nu/marginalia/gemini/gmi/line/GemtextPragma.java rename to other/memex/src/main/java/nu/marginalia/memex/gemini/gmi/line/GemtextPragma.java index 082cef26..1baf6658 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/gemini/gmi/line/GemtextPragma.java +++ b/other/memex/src/main/java/nu/marginalia/memex/gemini/gmi/line/GemtextPragma.java @@ -1,9 +1,9 @@ -package nu.marginalia.gemini.gmi.line; +package nu.marginalia.memex.gemini.gmi.line; import lombok.AllArgsConstructor; import lombok.Getter; import lombok.ToString; -import nu.marginalia.wmsa.memex.model.MemexNodeHeadingId; +import nu.marginalia.memex.memex.model.MemexNodeHeadingId; @AllArgsConstructor @Getter @ToString public class GemtextPragma extends AbstractGemtextLine { diff --git a/marginalia_nu/src/main/java/nu/marginalia/gemini/gmi/line/GemtextPreformat.java b/other/memex/src/main/java/nu/marginalia/memex/gemini/gmi/line/GemtextPreformat.java similarity index 81% rename from marginalia_nu/src/main/java/nu/marginalia/gemini/gmi/line/GemtextPreformat.java rename to other/memex/src/main/java/nu/marginalia/memex/gemini/gmi/line/GemtextPreformat.java index 56a1f196..a83e70bc 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/gemini/gmi/line/GemtextPreformat.java +++ 
b/other/memex/src/main/java/nu/marginalia/memex/gemini/gmi/line/GemtextPreformat.java @@ -1,9 +1,9 @@ -package nu.marginalia.gemini.gmi.line; +package nu.marginalia.memex.gemini.gmi.line; import lombok.AllArgsConstructor; import lombok.Getter; import lombok.ToString; -import nu.marginalia.wmsa.memex.model.MemexNodeHeadingId; +import nu.marginalia.memex.memex.model.MemexNodeHeadingId; import java.util.List; diff --git a/marginalia_nu/src/main/java/nu/marginalia/gemini/gmi/line/GemtextQuote.java b/other/memex/src/main/java/nu/marginalia/memex/gemini/gmi/line/GemtextQuote.java similarity index 81% rename from marginalia_nu/src/main/java/nu/marginalia/gemini/gmi/line/GemtextQuote.java rename to other/memex/src/main/java/nu/marginalia/memex/gemini/gmi/line/GemtextQuote.java index ad9f2e9b..5eb318a2 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/gemini/gmi/line/GemtextQuote.java +++ b/other/memex/src/main/java/nu/marginalia/memex/gemini/gmi/line/GemtextQuote.java @@ -1,9 +1,9 @@ -package nu.marginalia.gemini.gmi.line; +package nu.marginalia.memex.gemini.gmi.line; import lombok.AllArgsConstructor; import lombok.Getter; import lombok.ToString; -import nu.marginalia.wmsa.memex.model.MemexNodeHeadingId; +import nu.marginalia.memex.memex.model.MemexNodeHeadingId; import java.util.List; diff --git a/marginalia_nu/src/main/java/nu/marginalia/gemini/gmi/line/GemtextTask.java b/other/memex/src/main/java/nu/marginalia/memex/gemini/gmi/line/GemtextTask.java similarity index 76% rename from marginalia_nu/src/main/java/nu/marginalia/gemini/gmi/line/GemtextTask.java rename to other/memex/src/main/java/nu/marginalia/memex/gemini/gmi/line/GemtextTask.java index d2360afc..4d371785 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/gemini/gmi/line/GemtextTask.java +++ b/other/memex/src/main/java/nu/marginalia/memex/gemini/gmi/line/GemtextTask.java @@ -1,12 +1,12 @@ -package nu.marginalia.gemini.gmi.line; +package nu.marginalia.memex.gemini.gmi.line; import 
lombok.AllArgsConstructor; import lombok.Getter; import lombok.ToString; -import nu.marginalia.wmsa.memex.model.MemexNodeHeadingId; -import nu.marginalia.wmsa.memex.model.MemexNodeTaskId; -import nu.marginalia.wmsa.memex.model.MemexTaskState; -import nu.marginalia.wmsa.memex.model.MemexTaskTags; +import nu.marginalia.memex.memex.model.MemexNodeHeadingId; +import nu.marginalia.memex.memex.model.MemexNodeTaskId; +import nu.marginalia.memex.memex.model.MemexTaskState; +import nu.marginalia.memex.memex.model.MemexTaskTags; import java.util.Optional; import java.util.function.Function; diff --git a/marginalia_nu/src/main/java/nu/marginalia/gemini/gmi/line/GemtextText.java b/other/memex/src/main/java/nu/marginalia/memex/gemini/gmi/line/GemtextText.java similarity index 80% rename from marginalia_nu/src/main/java/nu/marginalia/gemini/gmi/line/GemtextText.java rename to other/memex/src/main/java/nu/marginalia/memex/gemini/gmi/line/GemtextText.java index 15394533..80a84e7e 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/gemini/gmi/line/GemtextText.java +++ b/other/memex/src/main/java/nu/marginalia/memex/gemini/gmi/line/GemtextText.java @@ -1,9 +1,9 @@ -package nu.marginalia.gemini.gmi.line; +package nu.marginalia.memex.gemini.gmi.line; import lombok.AllArgsConstructor; import lombok.Getter; import lombok.ToString; -import nu.marginalia.wmsa.memex.model.MemexNodeHeadingId; +import nu.marginalia.memex.memex.model.MemexNodeHeadingId; @AllArgsConstructor @Getter @ToString public class GemtextText extends AbstractGemtextLine { diff --git a/marginalia_nu/src/main/java/nu/marginalia/gemini/gmi/line/GemtextTextLiteral.java b/other/memex/src/main/java/nu/marginalia/memex/gemini/gmi/line/GemtextTextLiteral.java similarity index 81% rename from marginalia_nu/src/main/java/nu/marginalia/gemini/gmi/line/GemtextTextLiteral.java rename to other/memex/src/main/java/nu/marginalia/memex/gemini/gmi/line/GemtextTextLiteral.java index 7e44702f..d2a6698e 100644 --- 
a/marginalia_nu/src/main/java/nu/marginalia/gemini/gmi/line/GemtextTextLiteral.java +++ b/other/memex/src/main/java/nu/marginalia/memex/gemini/gmi/line/GemtextTextLiteral.java @@ -1,9 +1,9 @@ -package nu.marginalia.gemini.gmi.line; +package nu.marginalia.memex.gemini.gmi.line; import lombok.AllArgsConstructor; import lombok.Getter; import lombok.ToString; -import nu.marginalia.wmsa.memex.model.MemexNodeHeadingId; +import nu.marginalia.memex.memex.model.MemexNodeHeadingId; import java.util.List; diff --git a/marginalia_nu/src/main/java/nu/marginalia/gemini/gmi/parser/GemtextAsideParser.java b/other/memex/src/main/java/nu/marginalia/memex/gemini/gmi/parser/GemtextAsideParser.java similarity index 72% rename from marginalia_nu/src/main/java/nu/marginalia/gemini/gmi/parser/GemtextAsideParser.java rename to other/memex/src/main/java/nu/marginalia/memex/gemini/gmi/parser/GemtextAsideParser.java index 541ada0c..3038111d 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/gemini/gmi/parser/GemtextAsideParser.java +++ b/other/memex/src/main/java/nu/marginalia/memex/gemini/gmi/parser/GemtextAsideParser.java @@ -1,7 +1,7 @@ -package nu.marginalia.gemini.gmi.parser; +package nu.marginalia.memex.gemini.gmi.parser; -import nu.marginalia.gemini.gmi.line.GemtextAside; -import nu.marginalia.wmsa.memex.model.MemexNodeHeadingId; +import nu.marginalia.memex.gemini.gmi.line.GemtextAside; +import nu.marginalia.memex.memex.model.MemexNodeHeadingId; import java.util.regex.Pattern; diff --git a/marginalia_nu/src/main/java/nu/marginalia/gemini/gmi/parser/GemtextHeadingParser.java b/other/memex/src/main/java/nu/marginalia/memex/gemini/gmi/parser/GemtextHeadingParser.java similarity index 66% rename from marginalia_nu/src/main/java/nu/marginalia/gemini/gmi/parser/GemtextHeadingParser.java rename to other/memex/src/main/java/nu/marginalia/memex/gemini/gmi/parser/GemtextHeadingParser.java index c91d2a45..9a0c5329 100644 --- 
a/marginalia_nu/src/main/java/nu/marginalia/gemini/gmi/parser/GemtextHeadingParser.java +++ b/other/memex/src/main/java/nu/marginalia/memex/gemini/gmi/parser/GemtextHeadingParser.java @@ -1,9 +1,9 @@ -package nu.marginalia.gemini.gmi.parser; +package nu.marginalia.memex.gemini.gmi.parser; -import nu.marginalia.gemini.gmi.line.AbstractGemtextLine; -import nu.marginalia.gemini.gmi.line.GemtextHeading; -import nu.marginalia.gemini.gmi.line.GemtextText; -import nu.marginalia.wmsa.memex.model.MemexNodeHeadingId; +import nu.marginalia.memex.gemini.gmi.line.AbstractGemtextLine; +import nu.marginalia.memex.gemini.gmi.line.GemtextHeading; +import nu.marginalia.memex.gemini.gmi.line.GemtextText; +import nu.marginalia.memex.memex.model.MemexNodeHeadingId; import java.util.regex.Pattern; diff --git a/marginalia_nu/src/main/java/nu/marginalia/gemini/gmi/parser/GemtextLinkParser.java b/other/memex/src/main/java/nu/marginalia/memex/gemini/gmi/parser/GemtextLinkParser.java similarity index 67% rename from marginalia_nu/src/main/java/nu/marginalia/gemini/gmi/parser/GemtextLinkParser.java rename to other/memex/src/main/java/nu/marginalia/memex/gemini/gmi/parser/GemtextLinkParser.java index 16ca2ec6..04b5353a 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/gemini/gmi/parser/GemtextLinkParser.java +++ b/other/memex/src/main/java/nu/marginalia/memex/gemini/gmi/parser/GemtextLinkParser.java @@ -1,12 +1,12 @@ -package nu.marginalia.gemini.gmi.parser; +package nu.marginalia.memex.gemini.gmi.parser; -import nu.marginalia.gemini.gmi.line.AbstractGemtextLine; -import nu.marginalia.gemini.gmi.line.GemtextLink; -import nu.marginalia.gemini.gmi.line.GemtextText; -import nu.marginalia.wmsa.memex.model.MemexExternalUrl; -import nu.marginalia.wmsa.memex.model.MemexNodeHeadingId; -import nu.marginalia.wmsa.memex.model.MemexNodeUrl; -import nu.marginalia.wmsa.memex.model.MemexUrl; +import nu.marginalia.memex.gemini.gmi.line.AbstractGemtextLine; +import 
nu.marginalia.memex.gemini.gmi.line.GemtextLink; +import nu.marginalia.memex.gemini.gmi.line.GemtextText; +import nu.marginalia.memex.memex.model.MemexExternalUrl; +import nu.marginalia.memex.memex.model.MemexNodeHeadingId; +import nu.marginalia.memex.memex.model.MemexNodeUrl; +import nu.marginalia.memex.memex.model.MemexUrl; import javax.annotation.Nullable; import java.util.regex.Pattern; diff --git a/marginalia_nu/src/main/java/nu/marginalia/gemini/gmi/parser/GemtextListParser.java b/other/memex/src/main/java/nu/marginalia/memex/gemini/gmi/parser/GemtextListParser.java similarity index 88% rename from marginalia_nu/src/main/java/nu/marginalia/gemini/gmi/parser/GemtextListParser.java rename to other/memex/src/main/java/nu/marginalia/memex/gemini/gmi/parser/GemtextListParser.java index 8416895e..8a93de29 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/gemini/gmi/parser/GemtextListParser.java +++ b/other/memex/src/main/java/nu/marginalia/memex/gemini/gmi/parser/GemtextListParser.java @@ -1,4 +1,4 @@ -package nu.marginalia.gemini.gmi.parser; +package nu.marginalia.memex.gemini.gmi.parser; import java.util.regex.Pattern; diff --git a/marginalia_nu/src/main/java/nu/marginalia/gemini/gmi/parser/GemtextParser.java b/other/memex/src/main/java/nu/marginalia/memex/gemini/gmi/parser/GemtextParser.java similarity index 95% rename from marginalia_nu/src/main/java/nu/marginalia/gemini/gmi/parser/GemtextParser.java rename to other/memex/src/main/java/nu/marginalia/memex/gemini/gmi/parser/GemtextParser.java index ec15be17..d1b63a69 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/gemini/gmi/parser/GemtextParser.java +++ b/other/memex/src/main/java/nu/marginalia/memex/gemini/gmi/parser/GemtextParser.java @@ -1,8 +1,8 @@ -package nu.marginalia.gemini.gmi.parser; +package nu.marginalia.memex.gemini.gmi.parser; -import nu.marginalia.gemini.gmi.line.*; -import nu.marginalia.wmsa.memex.model.MemexNodeHeadingId; -import nu.marginalia.wmsa.memex.model.MemexNodeTaskId; 
+import nu.marginalia.memex.gemini.gmi.line.*; +import nu.marginalia.memex.memex.model.MemexNodeHeadingId; +import nu.marginalia.memex.memex.model.MemexNodeTaskId; import java.util.*; diff --git a/marginalia_nu/src/main/java/nu/marginalia/gemini/gmi/parser/GemtextPragmaParser.java b/other/memex/src/main/java/nu/marginalia/memex/gemini/gmi/parser/GemtextPragmaParser.java similarity index 62% rename from marginalia_nu/src/main/java/nu/marginalia/gemini/gmi/parser/GemtextPragmaParser.java rename to other/memex/src/main/java/nu/marginalia/memex/gemini/gmi/parser/GemtextPragmaParser.java index 192c4ba6..f96581cc 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/gemini/gmi/parser/GemtextPragmaParser.java +++ b/other/memex/src/main/java/nu/marginalia/memex/gemini/gmi/parser/GemtextPragmaParser.java @@ -1,9 +1,9 @@ -package nu.marginalia.gemini.gmi.parser; +package nu.marginalia.memex.gemini.gmi.parser; -import nu.marginalia.gemini.gmi.line.AbstractGemtextLine; -import nu.marginalia.gemini.gmi.line.GemtextPragma; -import nu.marginalia.gemini.gmi.line.GemtextText; -import nu.marginalia.wmsa.memex.model.MemexNodeHeadingId; +import nu.marginalia.memex.gemini.gmi.line.AbstractGemtextLine; +import nu.marginalia.memex.gemini.gmi.line.GemtextPragma; +import nu.marginalia.memex.gemini.gmi.line.GemtextText; +import nu.marginalia.memex.memex.model.MemexNodeHeadingId; import java.util.regex.Pattern; diff --git a/marginalia_nu/src/main/java/nu/marginalia/gemini/gmi/parser/GemtextQuoteParser.java b/other/memex/src/main/java/nu/marginalia/memex/gemini/gmi/parser/GemtextQuoteParser.java similarity index 88% rename from marginalia_nu/src/main/java/nu/marginalia/gemini/gmi/parser/GemtextQuoteParser.java rename to other/memex/src/main/java/nu/marginalia/memex/gemini/gmi/parser/GemtextQuoteParser.java index af72b3c9..b4468ea3 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/gemini/gmi/parser/GemtextQuoteParser.java +++ 
b/other/memex/src/main/java/nu/marginalia/memex/gemini/gmi/parser/GemtextQuoteParser.java @@ -1,4 +1,4 @@ -package nu.marginalia.gemini.gmi.parser; +package nu.marginalia.memex.gemini.gmi.parser; import java.util.regex.Pattern; diff --git a/marginalia_nu/src/main/java/nu/marginalia/gemini/gmi/parser/GemtextTaskParser.java b/other/memex/src/main/java/nu/marginalia/memex/gemini/gmi/parser/GemtextTaskParser.java similarity index 62% rename from marginalia_nu/src/main/java/nu/marginalia/gemini/gmi/parser/GemtextTaskParser.java rename to other/memex/src/main/java/nu/marginalia/memex/gemini/gmi/parser/GemtextTaskParser.java index d9b95f2e..08b1b803 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/gemini/gmi/parser/GemtextTaskParser.java +++ b/other/memex/src/main/java/nu/marginalia/memex/gemini/gmi/parser/GemtextTaskParser.java @@ -1,11 +1,11 @@ -package nu.marginalia.gemini.gmi.parser; +package nu.marginalia.memex.gemini.gmi.parser; -import nu.marginalia.gemini.gmi.line.AbstractGemtextLine; -import nu.marginalia.gemini.gmi.line.GemtextTask; -import nu.marginalia.gemini.gmi.line.GemtextText; -import nu.marginalia.wmsa.memex.model.MemexNodeHeadingId; -import nu.marginalia.wmsa.memex.model.MemexNodeTaskId; -import nu.marginalia.wmsa.memex.model.MemexTaskTags; +import nu.marginalia.memex.gemini.gmi.line.AbstractGemtextLine; +import nu.marginalia.memex.gemini.gmi.line.GemtextTask; +import nu.marginalia.memex.gemini.gmi.line.GemtextText; +import nu.marginalia.memex.memex.model.MemexNodeHeadingId; +import nu.marginalia.memex.memex.model.MemexNodeTaskId; +import nu.marginalia.memex.memex.model.MemexTaskTags; import java.util.regex.Pattern; diff --git a/marginalia_nu/src/main/java/nu/marginalia/gemini/gmi/renderer/GemtextRenderer.java b/other/memex/src/main/java/nu/marginalia/memex/gemini/gmi/renderer/GemtextRenderer.java similarity index 97% rename from marginalia_nu/src/main/java/nu/marginalia/gemini/gmi/renderer/GemtextRenderer.java rename to 
other/memex/src/main/java/nu/marginalia/memex/gemini/gmi/renderer/GemtextRenderer.java index 1697c8df..ccfa1fd7 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/gemini/gmi/renderer/GemtextRenderer.java +++ b/other/memex/src/main/java/nu/marginalia/memex/gemini/gmi/renderer/GemtextRenderer.java @@ -1,6 +1,6 @@ -package nu.marginalia.gemini.gmi.renderer; +package nu.marginalia.memex.gemini.gmi.renderer; -import nu.marginalia.gemini.gmi.line.*; +import nu.marginalia.memex.gemini.gmi.line.*; import java.util.function.Function; diff --git a/marginalia_nu/src/main/java/nu/marginalia/gemini/gmi/renderer/GemtextRendererFactory.java b/other/memex/src/main/java/nu/marginalia/memex/gemini/gmi/renderer/GemtextRendererFactory.java similarity index 97% rename from marginalia_nu/src/main/java/nu/marginalia/gemini/gmi/renderer/GemtextRendererFactory.java rename to other/memex/src/main/java/nu/marginalia/memex/gemini/gmi/renderer/GemtextRendererFactory.java index 257cfc1c..0e75d047 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/gemini/gmi/renderer/GemtextRendererFactory.java +++ b/other/memex/src/main/java/nu/marginalia/memex/gemini/gmi/renderer/GemtextRendererFactory.java @@ -1,8 +1,8 @@ -package nu.marginalia.gemini.gmi.renderer; +package nu.marginalia.memex.gemini.gmi.renderer; -import nu.marginalia.gemini.gmi.line.*; -import nu.marginalia.wmsa.memex.model.MemexNodeUrl; -import nu.marginalia.wmsa.memex.model.MemexUrl; +import nu.marginalia.memex.gemini.gmi.line.*; +import nu.marginalia.memex.memex.model.MemexNodeUrl; +import nu.marginalia.memex.memex.model.MemexUrl; import org.apache.logging.log4j.util.Strings; import java.util.Objects; diff --git a/marginalia_nu/src/main/java/nu/marginalia/gemini/io/GeminiConnection.java b/other/memex/src/main/java/nu/marginalia/memex/gemini/io/GeminiConnection.java similarity index 97% rename from marginalia_nu/src/main/java/nu/marginalia/gemini/io/GeminiConnection.java rename to 
other/memex/src/main/java/nu/marginalia/memex/gemini/io/GeminiConnection.java index 6d032a2e..3278ec2f 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/gemini/io/GeminiConnection.java +++ b/other/memex/src/main/java/nu/marginalia/memex/gemini/io/GeminiConnection.java @@ -1,7 +1,7 @@ -package nu.marginalia.gemini.io; +package nu.marginalia.memex.gemini.io; -import nu.marginalia.gemini.BadBotList; -import nu.marginalia.gemini.plugins.FileType; +import nu.marginalia.memex.gemini.BadBotList; +import nu.marginalia.memex.gemini.plugins.FileType; import org.slf4j.Logger; import org.slf4j.LoggerFactory; diff --git a/marginalia_nu/src/main/java/nu/marginalia/gemini/io/GeminiSSLSetUp.java b/other/memex/src/main/java/nu/marginalia/memex/gemini/io/GeminiSSLSetUp.java similarity index 97% rename from marginalia_nu/src/main/java/nu/marginalia/gemini/io/GeminiSSLSetUp.java rename to other/memex/src/main/java/nu/marginalia/memex/gemini/io/GeminiSSLSetUp.java index 525515f3..cd8afa3e 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/gemini/io/GeminiSSLSetUp.java +++ b/other/memex/src/main/java/nu/marginalia/memex/gemini/io/GeminiSSLSetUp.java @@ -1,4 +1,4 @@ -package nu.marginalia.gemini.io; +package nu.marginalia.memex.gemini.io; import com.google.inject.Inject; import com.google.inject.name.Named; diff --git a/marginalia_nu/src/main/java/nu/marginalia/gemini/io/GeminiStatusCode.java b/other/memex/src/main/java/nu/marginalia/memex/gemini/io/GeminiStatusCode.java similarity index 90% rename from marginalia_nu/src/main/java/nu/marginalia/gemini/io/GeminiStatusCode.java rename to other/memex/src/main/java/nu/marginalia/memex/gemini/io/GeminiStatusCode.java index f201e331..b3b205eb 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/gemini/io/GeminiStatusCode.java +++ b/other/memex/src/main/java/nu/marginalia/memex/gemini/io/GeminiStatusCode.java @@ -1,4 +1,4 @@ -package nu.marginalia.gemini.io; +package nu.marginalia.memex.gemini.io; public class GeminiStatusCode { 
public static final int INPUT = 10; diff --git a/marginalia_nu/src/main/java/nu/marginalia/gemini/io/GeminiUserException.java b/other/memex/src/main/java/nu/marginalia/memex/gemini/io/GeminiUserException.java similarity index 82% rename from marginalia_nu/src/main/java/nu/marginalia/gemini/io/GeminiUserException.java rename to other/memex/src/main/java/nu/marginalia/memex/gemini/io/GeminiUserException.java index 937da4fe..12022d56 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/gemini/io/GeminiUserException.java +++ b/other/memex/src/main/java/nu/marginalia/memex/gemini/io/GeminiUserException.java @@ -1,4 +1,4 @@ -package nu.marginalia.gemini.io; +package nu.marginalia.memex.gemini.io; /** Throw to report message to user */ public class GeminiUserException extends RuntimeException { diff --git a/marginalia_nu/src/main/java/nu/marginalia/gemini/plugins/BareStaticPagePlugin.java b/other/memex/src/main/java/nu/marginalia/memex/gemini/plugins/BareStaticPagePlugin.java similarity index 90% rename from marginalia_nu/src/main/java/nu/marginalia/gemini/plugins/BareStaticPagePlugin.java rename to other/memex/src/main/java/nu/marginalia/memex/gemini/plugins/BareStaticPagePlugin.java index 46bdfb7d..d3a210a3 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/gemini/plugins/BareStaticPagePlugin.java +++ b/other/memex/src/main/java/nu/marginalia/memex/gemini/plugins/BareStaticPagePlugin.java @@ -1,9 +1,9 @@ -package nu.marginalia.gemini.plugins; +package nu.marginalia.memex.gemini.plugins; import com.google.inject.Inject; import com.google.inject.name.Named; -import nu.marginalia.gemini.GeminiService; -import nu.marginalia.gemini.io.GeminiConnection; +import nu.marginalia.memex.gemini.GeminiService; +import nu.marginalia.memex.gemini.io.GeminiConnection; import org.slf4j.Logger; import org.slf4j.LoggerFactory; diff --git a/marginalia_nu/src/main/java/nu/marginalia/gemini/plugins/FileType.java 
b/other/memex/src/main/java/nu/marginalia/memex/gemini/plugins/FileType.java similarity index 97% rename from marginalia_nu/src/main/java/nu/marginalia/gemini/plugins/FileType.java rename to other/memex/src/main/java/nu/marginalia/memex/gemini/plugins/FileType.java index 587a9894..f8472f9d 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/gemini/plugins/FileType.java +++ b/other/memex/src/main/java/nu/marginalia/memex/gemini/plugins/FileType.java @@ -1,4 +1,4 @@ -package nu.marginalia.gemini.plugins; +package nu.marginalia.memex.gemini.plugins; import java.nio.file.Path; diff --git a/marginalia_nu/src/main/java/nu/marginalia/gemini/plugins/Plugin.java b/other/memex/src/main/java/nu/marginalia/memex/gemini/plugins/Plugin.java similarity index 72% rename from marginalia_nu/src/main/java/nu/marginalia/gemini/plugins/Plugin.java rename to other/memex/src/main/java/nu/marginalia/memex/gemini/plugins/Plugin.java index 3765e1ca..b0ae7f0f 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/gemini/plugins/Plugin.java +++ b/other/memex/src/main/java/nu/marginalia/memex/gemini/plugins/Plugin.java @@ -1,7 +1,7 @@ -package nu.marginalia.gemini.plugins; +package nu.marginalia.memex.gemini.plugins; -import nu.marginalia.gemini.io.GeminiConnection; -import nu.marginalia.gemini.io.GeminiUserException; +import nu.marginalia.memex.gemini.io.GeminiConnection; +import nu.marginalia.memex.gemini.io.GeminiUserException; import java.io.IOException; import java.net.URI; diff --git a/marginalia_nu/src/main/java/nu/marginalia/gemini/plugins/SearchPlugin.java b/other/memex/src/main/java/nu/marginalia/memex/gemini/plugins/SearchPlugin.java similarity index 94% rename from marginalia_nu/src/main/java/nu/marginalia/gemini/plugins/SearchPlugin.java rename to other/memex/src/main/java/nu/marginalia/memex/gemini/plugins/SearchPlugin.java index e0122d75..95f52bd5 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/gemini/plugins/SearchPlugin.java +++ 
b/other/memex/src/main/java/nu/marginalia/memex/gemini/plugins/SearchPlugin.java @@ -1,8 +1,8 @@ -package nu.marginalia.gemini.plugins; +package nu.marginalia.memex.gemini.plugins; import com.google.inject.Inject; -import nu.marginalia.gemini.io.GeminiConnection; -import nu.marginalia.gemini.io.GeminiStatusCode; +import nu.marginalia.memex.gemini.io.GeminiConnection; +import nu.marginalia.memex.gemini.io.GeminiStatusCode; import org.apache.http.HttpHost; import org.apache.http.client.methods.HttpGet; import org.apache.http.conn.routing.HttpRoute; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/Memex.java b/other/memex/src/main/java/nu/marginalia/memex/memex/Memex.java similarity index 91% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/Memex.java rename to other/memex/src/main/java/nu/marginalia/memex/memex/Memex.java index febdc5af..5376f86c 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/Memex.java +++ b/other/memex/src/main/java/nu/marginalia/memex/memex/Memex.java @@ -1,22 +1,22 @@ -package nu.marginalia.wmsa.memex; +package nu.marginalia.memex.memex; import com.google.inject.Inject; import com.google.inject.Singleton; import com.google.inject.name.Named; import io.reactivex.rxjava3.schedulers.Schedulers; -import nu.marginalia.gemini.GeminiService; -import nu.marginalia.gemini.gmi.GemtextDatabase; -import nu.marginalia.gemini.gmi.GemtextDocument; -import nu.marginalia.util.graphics.dithering.FloydSteinbergDither; -import nu.marginalia.util.graphics.dithering.Palettes; -import nu.marginalia.wmsa.memex.change.GemtextTombstoneUpdateCaclulator; -import nu.marginalia.wmsa.memex.model.MemexImage; -import nu.marginalia.wmsa.memex.model.MemexNode; -import nu.marginalia.wmsa.memex.model.MemexNodeUrl; -import nu.marginalia.wmsa.memex.renderer.MemexRendererers; -import nu.marginalia.wmsa.memex.system.MemexFileSystemMonitor; -import nu.marginalia.wmsa.memex.system.MemexFileWriter; -import 
nu.marginalia.wmsa.memex.system.git.MemexGitRepo; +import nu.marginalia.memex.gemini.GeminiService; +import nu.marginalia.memex.gemini.gmi.GemtextDatabase; +import nu.marginalia.memex.gemini.gmi.GemtextDocument; +import nu.marginalia.memex.util.dithering.FloydSteinbergDither; +import nu.marginalia.memex.util.dithering.Palettes; +import nu.marginalia.memex.memex.change.GemtextTombstoneUpdateCaclulator; +import nu.marginalia.memex.memex.model.MemexImage; +import nu.marginalia.memex.memex.model.MemexNode; +import nu.marginalia.memex.memex.model.MemexNodeUrl; +import nu.marginalia.memex.memex.renderer.MemexRendererers; +import nu.marginalia.memex.memex.system.MemexFileSystemMonitor; +import nu.marginalia.memex.memex.system.MemexFileWriter; +import nu.marginalia.memex.memex.system.git.MemexGitRepo; import org.slf4j.Logger; import org.slf4j.LoggerFactory; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/MemexConfigurationModule.java b/other/memex/src/main/java/nu/marginalia/memex/memex/MemexConfigurationModule.java similarity index 88% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/MemexConfigurationModule.java rename to other/memex/src/main/java/nu/marginalia/memex/memex/MemexConfigurationModule.java index 2533a9d1..f0cb1bac 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/MemexConfigurationModule.java +++ b/other/memex/src/main/java/nu/marginalia/memex/memex/MemexConfigurationModule.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.memex; +package nu.marginalia.memex.memex; import com.google.inject.AbstractModule; import com.google.inject.Inject; @@ -6,13 +6,13 @@ import com.google.inject.Provider; import com.google.inject.name.Named; import com.google.inject.name.Names; import lombok.SneakyThrows; -import nu.marginalia.gemini.GeminiService; -import nu.marginalia.gemini.GeminiServiceDummy; -import nu.marginalia.gemini.GeminiServiceImpl; -import nu.marginalia.wmsa.memex.system.MemexFileWriter; -import 
nu.marginalia.wmsa.memex.system.git.MemexGitRepo; -import nu.marginalia.wmsa.memex.system.git.MemexGitRepoDummy; -import nu.marginalia.wmsa.memex.system.git.MemexGitRepoImpl; +import nu.marginalia.memex.gemini.GeminiService; +import nu.marginalia.memex.gemini.GeminiServiceDummy; +import nu.marginalia.memex.gemini.GeminiServiceImpl; +import nu.marginalia.memex.memex.system.MemexFileWriter; +import nu.marginalia.memex.memex.system.git.MemexGitRepo; +import nu.marginalia.memex.memex.system.git.MemexGitRepoDummy; +import nu.marginalia.memex.memex.system.git.MemexGitRepoImpl; import org.slf4j.Logger; import org.slf4j.LoggerFactory; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/MemexData.java b/other/memex/src/main/java/nu/marginalia/memex/memex/MemexData.java similarity index 92% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/MemexData.java rename to other/memex/src/main/java/nu/marginalia/memex/memex/MemexData.java index 22c20f8f..42bb42ba 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/MemexData.java +++ b/other/memex/src/main/java/nu/marginalia/memex/memex/MemexData.java @@ -1,12 +1,12 @@ -package nu.marginalia.wmsa.memex; +package nu.marginalia.memex.memex; import com.google.inject.Singleton; -import nu.marginalia.gemini.gmi.GemtextDatabase; -import nu.marginalia.gemini.gmi.GemtextDocument; -import nu.marginalia.wmsa.memex.model.MemexLink; -import nu.marginalia.wmsa.memex.model.MemexImage; -import nu.marginalia.wmsa.memex.model.MemexNodeUrl; -import nu.marginalia.wmsa.memex.model.fs.MemexFileSystem; +import nu.marginalia.memex.gemini.gmi.GemtextDatabase; +import nu.marginalia.memex.gemini.gmi.GemtextDocument; +import nu.marginalia.memex.memex.model.MemexImage; +import nu.marginalia.memex.memex.model.MemexLink; +import nu.marginalia.memex.memex.model.MemexNodeUrl; +import nu.marginalia.memex.memex.model.fs.MemexFileSystem; import org.slf4j.Logger; import org.slf4j.LoggerFactory; diff --git 
a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/MemexLinks.java b/other/memex/src/main/java/nu/marginalia/memex/memex/MemexLinks.java similarity index 92% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/MemexLinks.java rename to other/memex/src/main/java/nu/marginalia/memex/memex/MemexLinks.java index 8d491494..68168baa 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/MemexLinks.java +++ b/other/memex/src/main/java/nu/marginalia/memex/memex/MemexLinks.java @@ -1,7 +1,7 @@ -package nu.marginalia.wmsa.memex; +package nu.marginalia.memex.memex; -import nu.marginalia.wmsa.memex.model.MemexLink; -import nu.marginalia.wmsa.memex.model.MemexNodeUrl; +import nu.marginalia.memex.memex.model.MemexLink; +import nu.marginalia.memex.memex.model.MemexNodeUrl; import java.util.*; import java.util.stream.Collectors; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/MemexLoader.java b/other/memex/src/main/java/nu/marginalia/memex/memex/MemexLoader.java similarity index 94% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/MemexLoader.java rename to other/memex/src/main/java/nu/marginalia/memex/memex/MemexLoader.java index f5f6b29b..8945cb77 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/MemexLoader.java +++ b/other/memex/src/main/java/nu/marginalia/memex/memex/MemexLoader.java @@ -1,15 +1,15 @@ -package nu.marginalia.wmsa.memex; +package nu.marginalia.memex.memex; import com.google.common.collect.Sets; import com.google.inject.Inject; import com.google.inject.name.Named; -import nu.marginalia.gemini.gmi.GemtextDatabase; -import nu.marginalia.gemini.gmi.GemtextDocument; -import nu.marginalia.wmsa.memex.model.MemexImage; -import nu.marginalia.wmsa.memex.model.MemexNode; -import nu.marginalia.wmsa.memex.model.MemexNodeUrl; -import nu.marginalia.wmsa.memex.system.MemexFileSystemModifiedTimes; -import nu.marginalia.wmsa.memex.system.MemexSourceFileSystem; +import 
nu.marginalia.memex.gemini.gmi.GemtextDatabase; +import nu.marginalia.memex.gemini.gmi.GemtextDocument; +import nu.marginalia.memex.memex.model.MemexImage; +import nu.marginalia.memex.memex.model.MemexNode; +import nu.marginalia.memex.memex.model.MemexNodeUrl; +import nu.marginalia.memex.memex.system.MemexFileSystemModifiedTimes; +import nu.marginalia.memex.memex.system.MemexSourceFileSystem; import org.slf4j.Logger; import org.slf4j.LoggerFactory; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/MemexMain.java b/other/memex/src/main/java/nu/marginalia/memex/memex/MemexMain.java similarity index 54% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/MemexMain.java rename to other/memex/src/main/java/nu/marginalia/memex/memex/MemexMain.java index f46ce4d1..c5601380 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/MemexMain.java +++ b/other/memex/src/main/java/nu/marginalia/memex/memex/MemexMain.java @@ -1,13 +1,14 @@ -package nu.marginalia.wmsa.memex; +package nu.marginalia.memex.memex; import com.google.inject.Guice; import com.google.inject.Inject; import com.google.inject.Injector; -import nu.marginalia.gemini.GeminiConfigurationModule; -import nu.marginalia.wmsa.configuration.MainClass; -import nu.marginalia.wmsa.configuration.ServiceDescriptor; -import nu.marginalia.wmsa.configuration.module.ConfigurationModule; -import nu.marginalia.wmsa.configuration.server.Initialization; +import nu.marginalia.memex.MemexServiceDescriptors; +import nu.marginalia.memex.gemini.GeminiConfigurationModule; +import nu.marginalia.service.MainClass; +import nu.marginalia.service.id.ServiceId; +import nu.marginalia.service.module.ConfigurationModule; +import nu.marginalia.service.server.Initialization; public class MemexMain extends MainClass { private final MemexService service; @@ -18,12 +19,12 @@ public class MemexMain extends MainClass { } public static void main(String... 
args) { - init(ServiceDescriptor.MEMEX, args); + MainClass.init(ServiceId.Other_Memex, args); Injector injector = Guice.createInjector( new MemexConfigurationModule(), new GeminiConfigurationModule(), - new ConfigurationModule()); + new ConfigurationModule(MemexServiceDescriptors.descriptors, ServiceId.Other_Memex)); injector.getInstance(MemexMain.class); injector.getInstance(Initialization.class).setReady(); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/MemexService.java b/other/memex/src/main/java/nu/marginalia/memex/memex/MemexService.java similarity index 93% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/MemexService.java rename to other/memex/src/main/java/nu/marginalia/memex/memex/MemexService.java index 96c553da..5e799298 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/MemexService.java +++ b/other/memex/src/main/java/nu/marginalia/memex/memex/MemexService.java @@ -1,21 +1,21 @@ -package nu.marginalia.wmsa.memex; +package nu.marginalia.memex.memex; import com.google.inject.Inject; import com.google.inject.name.Named; import lombok.SneakyThrows; -import nu.marginalia.gemini.gmi.GemtextDocument; -import nu.marginalia.gemini.gmi.renderer.GemtextRendererFactory; -import nu.marginalia.wmsa.auth.client.AuthClient; -import nu.marginalia.wmsa.configuration.server.Context; -import nu.marginalia.wmsa.configuration.server.Initialization; -import nu.marginalia.wmsa.configuration.server.MetricsServer; -import nu.marginalia.wmsa.configuration.server.Service; -import nu.marginalia.wmsa.memex.change.GemtextMutation; -import nu.marginalia.wmsa.memex.change.update.GemtextDocumentUpdateCalculator; -import nu.marginalia.wmsa.memex.model.MemexNodeHeadingId; -import nu.marginalia.wmsa.memex.model.MemexNodeUrl; -import nu.marginalia.wmsa.memex.model.render.*; -import nu.marginalia.wmsa.memex.renderer.MemexHtmlRenderer; +import nu.marginalia.client.Context; +import nu.marginalia.memex.gemini.gmi.GemtextDocument; +import 
nu.marginalia.memex.gemini.gmi.renderer.GemtextRendererFactory; +import nu.marginalia.memex.auth.client.AuthClient; +import nu.marginalia.memex.memex.model.render.*; +import nu.marginalia.memex.memex.change.GemtextMutation; +import nu.marginalia.memex.memex.change.update.GemtextDocumentUpdateCalculator; +import nu.marginalia.memex.memex.model.MemexNodeHeadingId; +import nu.marginalia.memex.memex.model.MemexNodeUrl; +import nu.marginalia.memex.memex.renderer.MemexHtmlRenderer; +import nu.marginalia.service.server.Initialization; +import nu.marginalia.service.server.MetricsServer; +import nu.marginalia.service.server.Service; import org.apache.http.HttpStatus; import org.slf4j.Logger; import org.slf4j.LoggerFactory; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/change/GemtextAppend.java b/other/memex/src/main/java/nu/marginalia/memex/memex/change/GemtextAppend.java similarity index 83% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/change/GemtextAppend.java rename to other/memex/src/main/java/nu/marginalia/memex/memex/change/GemtextAppend.java index be9c34dd..8f4e0b6e 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/change/GemtextAppend.java +++ b/other/memex/src/main/java/nu/marginalia/memex/memex/change/GemtextAppend.java @@ -1,12 +1,12 @@ -package nu.marginalia.wmsa.memex.change; +package nu.marginalia.memex.memex.change; import lombok.AllArgsConstructor; import lombok.ToString; -import nu.marginalia.wmsa.memex.Memex; -import nu.marginalia.gemini.gmi.GemtextDocument; -import nu.marginalia.gemini.gmi.renderer.GemtextRendererFactory; -import nu.marginalia.wmsa.memex.model.MemexNodeHeadingId; -import nu.marginalia.wmsa.memex.model.MemexNodeUrl; +import nu.marginalia.memex.memex.Memex; +import nu.marginalia.memex.gemini.gmi.GemtextDocument; +import nu.marginalia.memex.gemini.gmi.renderer.GemtextRendererFactory; +import nu.marginalia.memex.memex.model.MemexNodeHeadingId; +import 
nu.marginalia.memex.memex.model.MemexNodeUrl; import java.io.IOException; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/change/GemtextCreate.java b/other/memex/src/main/java/nu/marginalia/memex/memex/change/GemtextCreate.java similarity index 72% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/change/GemtextCreate.java rename to other/memex/src/main/java/nu/marginalia/memex/memex/change/GemtextCreate.java index 764d54ce..f44c3875 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/change/GemtextCreate.java +++ b/other/memex/src/main/java/nu/marginalia/memex/memex/change/GemtextCreate.java @@ -1,9 +1,9 @@ -package nu.marginalia.wmsa.memex.change; +package nu.marginalia.memex.memex.change; import lombok.AllArgsConstructor; import lombok.ToString; -import nu.marginalia.wmsa.memex.Memex; -import nu.marginalia.wmsa.memex.model.MemexNodeUrl; +import nu.marginalia.memex.memex.Memex; +import nu.marginalia.memex.memex.model.MemexNodeUrl; import java.io.IOException; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/change/GemtextCreateOrMutate.java b/other/memex/src/main/java/nu/marginalia/memex/memex/change/GemtextCreateOrMutate.java similarity index 81% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/change/GemtextCreateOrMutate.java rename to other/memex/src/main/java/nu/marginalia/memex/memex/change/GemtextCreateOrMutate.java index f115849f..7f60a759 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/change/GemtextCreateOrMutate.java +++ b/other/memex/src/main/java/nu/marginalia/memex/memex/change/GemtextCreateOrMutate.java @@ -1,9 +1,9 @@ -package nu.marginalia.wmsa.memex.change; +package nu.marginalia.memex.memex.change; import lombok.AllArgsConstructor; import lombok.ToString; -import nu.marginalia.wmsa.memex.Memex; -import nu.marginalia.wmsa.memex.model.MemexNodeUrl; +import nu.marginalia.memex.memex.Memex; +import nu.marginalia.memex.memex.model.MemexNodeUrl; import 
java.io.IOException; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/change/GemtextMutation.java b/other/memex/src/main/java/nu/marginalia/memex/memex/change/GemtextMutation.java similarity index 74% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/change/GemtextMutation.java rename to other/memex/src/main/java/nu/marginalia/memex/memex/change/GemtextMutation.java index c9d73526..7612b1a0 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/change/GemtextMutation.java +++ b/other/memex/src/main/java/nu/marginalia/memex/memex/change/GemtextMutation.java @@ -1,8 +1,8 @@ -package nu.marginalia.wmsa.memex.change; +package nu.marginalia.memex.memex.change; -import nu.marginalia.wmsa.memex.Memex; -import nu.marginalia.wmsa.memex.model.MemexNodeHeadingId; -import nu.marginalia.wmsa.memex.model.MemexNodeUrl; +import nu.marginalia.memex.memex.Memex; +import nu.marginalia.memex.memex.model.MemexNodeHeadingId; +import nu.marginalia.memex.memex.model.MemexNodeUrl; import java.io.IOException; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/change/GemtextPrepend.java b/other/memex/src/main/java/nu/marginalia/memex/memex/change/GemtextPrepend.java similarity index 84% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/change/GemtextPrepend.java rename to other/memex/src/main/java/nu/marginalia/memex/memex/change/GemtextPrepend.java index d9415e4b..84f70ca8 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/change/GemtextPrepend.java +++ b/other/memex/src/main/java/nu/marginalia/memex/memex/change/GemtextPrepend.java @@ -1,12 +1,12 @@ -package nu.marginalia.wmsa.memex.change; +package nu.marginalia.memex.memex.change; import lombok.AllArgsConstructor; import lombok.ToString; -import nu.marginalia.wmsa.memex.Memex; -import nu.marginalia.gemini.gmi.GemtextDocument; -import nu.marginalia.gemini.gmi.renderer.GemtextRendererFactory; -import nu.marginalia.wmsa.memex.model.MemexNodeHeadingId; 
-import nu.marginalia.wmsa.memex.model.MemexNodeUrl; +import nu.marginalia.memex.memex.Memex; +import nu.marginalia.memex.gemini.gmi.GemtextDocument; +import nu.marginalia.memex.gemini.gmi.renderer.GemtextRendererFactory; +import nu.marginalia.memex.memex.model.MemexNodeHeadingId; +import nu.marginalia.memex.memex.model.MemexNodeUrl; import org.slf4j.Logger; import org.slf4j.LoggerFactory; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/change/GemtextReplace.java b/other/memex/src/main/java/nu/marginalia/memex/memex/change/GemtextReplace.java similarity index 84% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/change/GemtextReplace.java rename to other/memex/src/main/java/nu/marginalia/memex/memex/change/GemtextReplace.java index 3f5d7890..6e437fa2 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/change/GemtextReplace.java +++ b/other/memex/src/main/java/nu/marginalia/memex/memex/change/GemtextReplace.java @@ -1,12 +1,12 @@ -package nu.marginalia.wmsa.memex.change; +package nu.marginalia.memex.memex.change; import lombok.AllArgsConstructor; import lombok.ToString; -import nu.marginalia.wmsa.memex.Memex; -import nu.marginalia.gemini.gmi.GemtextDocument; -import nu.marginalia.gemini.gmi.renderer.GemtextRendererFactory; -import nu.marginalia.wmsa.memex.model.MemexNodeHeadingId; -import nu.marginalia.wmsa.memex.model.MemexNodeUrl; +import nu.marginalia.memex.memex.Memex; +import nu.marginalia.memex.gemini.gmi.GemtextDocument; +import nu.marginalia.memex.gemini.gmi.renderer.GemtextRendererFactory; +import nu.marginalia.memex.memex.model.MemexNodeHeadingId; +import nu.marginalia.memex.memex.model.MemexNodeUrl; import org.slf4j.Logger; import org.slf4j.LoggerFactory; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/change/GemtextTombstoneUpdateCaclulator.java b/other/memex/src/main/java/nu/marginalia/memex/memex/change/GemtextTombstoneUpdateCaclulator.java similarity index 91% rename from 
marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/change/GemtextTombstoneUpdateCaclulator.java rename to other/memex/src/main/java/nu/marginalia/memex/memex/change/GemtextTombstoneUpdateCaclulator.java index 711e1f55..83e03b31 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/change/GemtextTombstoneUpdateCaclulator.java +++ b/other/memex/src/main/java/nu/marginalia/memex/memex/change/GemtextTombstoneUpdateCaclulator.java @@ -1,9 +1,9 @@ -package nu.marginalia.wmsa.memex.change; +package nu.marginalia.memex.memex.change; import com.google.inject.Inject; import com.google.inject.Singleton; -import nu.marginalia.wmsa.memex.model.MemexNodeHeadingId; -import nu.marginalia.wmsa.memex.model.MemexNodeUrl; +import nu.marginalia.memex.memex.model.MemexNodeHeadingId; +import nu.marginalia.memex.memex.model.MemexNodeUrl; import com.google.inject.name.Named; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/change/update/GemtextDocumentUpdateCalculator.java b/other/memex/src/main/java/nu/marginalia/memex/memex/change/update/GemtextDocumentUpdateCalculator.java similarity index 86% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/change/update/GemtextDocumentUpdateCalculator.java rename to other/memex/src/main/java/nu/marginalia/memex/memex/change/update/GemtextDocumentUpdateCalculator.java index 51142ed0..237ad448 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/change/update/GemtextDocumentUpdateCalculator.java +++ b/other/memex/src/main/java/nu/marginalia/memex/memex/change/update/GemtextDocumentUpdateCalculator.java @@ -1,17 +1,17 @@ -package nu.marginalia.wmsa.memex.change.update; +package nu.marginalia.memex.memex.change.update; import com.google.inject.Inject; import com.google.inject.Singleton; -import nu.marginalia.gemini.gmi.line.GemtextText; -import nu.marginalia.wmsa.memex.Memex; -import nu.marginalia.gemini.gmi.GemtextDocument; -import nu.marginalia.gemini.gmi.line.AbstractGemtextLine; -import 
nu.marginalia.gemini.gmi.line.GemtextHeading; -import nu.marginalia.gemini.gmi.renderer.GemtextRenderer; -import nu.marginalia.gemini.gmi.renderer.GemtextRendererFactory; -import nu.marginalia.wmsa.memex.change.*; -import nu.marginalia.wmsa.memex.model.MemexNodeHeadingId; -import nu.marginalia.wmsa.memex.model.MemexNodeUrl; +import nu.marginalia.memex.gemini.gmi.line.GemtextText; +import nu.marginalia.memex.memex.Memex; +import nu.marginalia.memex.gemini.gmi.GemtextDocument; +import nu.marginalia.memex.gemini.gmi.line.AbstractGemtextLine; +import nu.marginalia.memex.gemini.gmi.line.GemtextHeading; +import nu.marginalia.memex.gemini.gmi.renderer.GemtextRenderer; +import nu.marginalia.memex.gemini.gmi.renderer.GemtextRendererFactory; +import nu.marginalia.memex.memex.change.*; +import nu.marginalia.memex.memex.model.MemexNodeHeadingId; +import nu.marginalia.memex.memex.model.MemexNodeUrl; import org.jetbrains.annotations.NotNull; import org.slf4j.Logger; import org.slf4j.LoggerFactory; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/change/update/GemtextTaskExtractor.java b/other/memex/src/main/java/nu/marginalia/memex/memex/change/update/GemtextTaskExtractor.java similarity index 81% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/change/update/GemtextTaskExtractor.java rename to other/memex/src/main/java/nu/marginalia/memex/memex/change/update/GemtextTaskExtractor.java index 4bdc1ce4..f85902a0 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/change/update/GemtextTaskExtractor.java +++ b/other/memex/src/main/java/nu/marginalia/memex/memex/change/update/GemtextTaskExtractor.java @@ -1,7 +1,7 @@ -package nu.marginalia.wmsa.memex.change.update; +package nu.marginalia.memex.memex.change.update; -import nu.marginalia.gemini.gmi.line.AbstractGemtextLine; -import nu.marginalia.gemini.gmi.line.GemtextTask; +import nu.marginalia.memex.gemini.gmi.line.AbstractGemtextLine; +import 
nu.marginalia.memex.gemini.gmi.line.GemtextTask; import java.util.List; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/change/update/GemtextTasksRewrite.java b/other/memex/src/main/java/nu/marginalia/memex/memex/change/update/GemtextTasksRewrite.java similarity index 91% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/change/update/GemtextTasksRewrite.java rename to other/memex/src/main/java/nu/marginalia/memex/memex/change/update/GemtextTasksRewrite.java index c2266f86..0db13d26 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/change/update/GemtextTasksRewrite.java +++ b/other/memex/src/main/java/nu/marginalia/memex/memex/change/update/GemtextTasksRewrite.java @@ -1,11 +1,11 @@ -package nu.marginalia.wmsa.memex.change.update; +package nu.marginalia.memex.memex.change.update; import lombok.Getter; -import nu.marginalia.gemini.gmi.GemtextDocument; -import nu.marginalia.gemini.gmi.line.AbstractGemtextLine; -import nu.marginalia.gemini.gmi.line.GemtextTask; -import nu.marginalia.wmsa.memex.Memex; -import nu.marginalia.wmsa.memex.model.MemexNodeHeadingId; +import nu.marginalia.memex.gemini.gmi.GemtextDocument; +import nu.marginalia.memex.gemini.gmi.line.AbstractGemtextLine; +import nu.marginalia.memex.gemini.gmi.line.GemtextTask; +import nu.marginalia.memex.memex.Memex; +import nu.marginalia.memex.memex.model.MemexNodeHeadingId; import org.jetbrains.annotations.NotNull; import java.time.LocalDate; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/model/GemtextSection.java b/other/memex/src/main/java/nu/marginalia/memex/memex/model/GemtextSection.java similarity index 85% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/model/GemtextSection.java rename to other/memex/src/main/java/nu/marginalia/memex/memex/model/GemtextSection.java index 5fc3cc73..c2237841 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/model/GemtextSection.java +++ 
b/other/memex/src/main/java/nu/marginalia/memex/memex/model/GemtextSection.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.memex.model; +package nu.marginalia.memex.memex.model; import lombok.AllArgsConstructor; import lombok.Getter; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/model/GemtextSectionAction.java b/other/memex/src/main/java/nu/marginalia/memex/memex/model/GemtextSectionAction.java similarity index 60% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/model/GemtextSectionAction.java rename to other/memex/src/main/java/nu/marginalia/memex/memex/model/GemtextSectionAction.java index 16cb8158..a55e4546 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/model/GemtextSectionAction.java +++ b/other/memex/src/main/java/nu/marginalia/memex/memex/model/GemtextSectionAction.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.memex.model; +package nu.marginalia.memex.memex.model; public enum GemtextSectionAction { REPLACE, diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/model/MemexExternalUrl.java b/other/memex/src/main/java/nu/marginalia/memex/memex/model/MemexExternalUrl.java similarity index 90% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/model/MemexExternalUrl.java rename to other/memex/src/main/java/nu/marginalia/memex/memex/model/MemexExternalUrl.java index 38775753..28784a10 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/model/MemexExternalUrl.java +++ b/other/memex/src/main/java/nu/marginalia/memex/memex/model/MemexExternalUrl.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.memex.model; +package nu.marginalia.memex.memex.model; import lombok.AllArgsConstructor; import lombok.EqualsAndHashCode; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/model/MemexImage.java b/other/memex/src/main/java/nu/marginalia/memex/memex/model/MemexImage.java similarity index 83% rename from 
marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/model/MemexImage.java rename to other/memex/src/main/java/nu/marginalia/memex/memex/model/MemexImage.java index e0184d03..d27a5e65 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/model/MemexImage.java +++ b/other/memex/src/main/java/nu/marginalia/memex/memex/model/MemexImage.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.memex.model; +package nu.marginalia.memex.memex.model; import lombok.AllArgsConstructor; import lombok.Getter; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/model/MemexIndexTask.java b/other/memex/src/main/java/nu/marginalia/memex/memex/model/MemexIndexTask.java similarity index 86% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/model/MemexIndexTask.java rename to other/memex/src/main/java/nu/marginalia/memex/memex/model/MemexIndexTask.java index 38c79410..893b5854 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/model/MemexIndexTask.java +++ b/other/memex/src/main/java/nu/marginalia/memex/memex/model/MemexIndexTask.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.memex.model; +package nu.marginalia.memex.memex.model; import lombok.AllArgsConstructor; import lombok.Getter; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/model/MemexLink.java b/other/memex/src/main/java/nu/marginalia/memex/memex/model/MemexLink.java similarity index 93% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/model/MemexLink.java rename to other/memex/src/main/java/nu/marginalia/memex/memex/model/MemexLink.java index c44eaa26..c03822ac 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/model/MemexLink.java +++ b/other/memex/src/main/java/nu/marginalia/memex/memex/model/MemexLink.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.memex.model; +package nu.marginalia.memex.memex.model; import lombok.AllArgsConstructor; import lombok.Getter; diff --git 
a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/model/MemexNode.java b/other/memex/src/main/java/nu/marginalia/memex/memex/model/MemexNode.java similarity index 96% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/model/MemexNode.java rename to other/memex/src/main/java/nu/marginalia/memex/memex/model/MemexNode.java index ddd01f82..0730ebfe 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/model/MemexNode.java +++ b/other/memex/src/main/java/nu/marginalia/memex/memex/model/MemexNode.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.memex.model; +package nu.marginalia.memex.memex.model; import lombok.AllArgsConstructor; import lombok.Getter; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/model/MemexNodeHeadingId.java b/other/memex/src/main/java/nu/marginalia/memex/memex/model/MemexNodeHeadingId.java similarity index 97% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/model/MemexNodeHeadingId.java rename to other/memex/src/main/java/nu/marginalia/memex/memex/model/MemexNodeHeadingId.java index 084e22fe..d5db6ffa 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/model/MemexNodeHeadingId.java +++ b/other/memex/src/main/java/nu/marginalia/memex/memex/model/MemexNodeHeadingId.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.memex.model; +package nu.marginalia.memex.memex.model; import lombok.EqualsAndHashCode; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/model/MemexNodeTaskId.java b/other/memex/src/main/java/nu/marginalia/memex/memex/model/MemexNodeTaskId.java similarity index 97% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/model/MemexNodeTaskId.java rename to other/memex/src/main/java/nu/marginalia/memex/memex/model/MemexNodeTaskId.java index e40dccbf..4fd45946 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/model/MemexNodeTaskId.java +++ b/other/memex/src/main/java/nu/marginalia/memex/memex/model/MemexNodeTaskId.java @@ 
-1,4 +1,4 @@ -package nu.marginalia.wmsa.memex.model; +package nu.marginalia.memex.memex.model; import lombok.EqualsAndHashCode; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/model/MemexNodeType.java b/other/memex/src/main/java/nu/marginalia/memex/memex/model/MemexNodeType.java similarity index 86% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/model/MemexNodeType.java rename to other/memex/src/main/java/nu/marginalia/memex/memex/model/MemexNodeType.java index 6c645921..33bdbb5f 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/model/MemexNodeType.java +++ b/other/memex/src/main/java/nu/marginalia/memex/memex/model/MemexNodeType.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.memex.model; +package nu.marginalia.memex.memex.model; import lombok.AllArgsConstructor; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/model/MemexNodeUrl.java b/other/memex/src/main/java/nu/marginalia/memex/memex/model/MemexNodeUrl.java similarity index 98% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/model/MemexNodeUrl.java rename to other/memex/src/main/java/nu/marginalia/memex/memex/model/MemexNodeUrl.java index 96d91bd0..c919b511 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/model/MemexNodeUrl.java +++ b/other/memex/src/main/java/nu/marginalia/memex/memex/model/MemexNodeUrl.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.memex.model; +package nu.marginalia.memex.memex.model; import lombok.EqualsAndHashCode; import lombok.Getter; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/model/MemexTaskState.java b/other/memex/src/main/java/nu/marginalia/memex/memex/model/MemexTaskState.java similarity index 94% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/model/MemexTaskState.java rename to other/memex/src/main/java/nu/marginalia/memex/memex/model/MemexTaskState.java index 085ac7aa..128bb918 100644 --- 
a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/model/MemexTaskState.java +++ b/other/memex/src/main/java/nu/marginalia/memex/memex/model/MemexTaskState.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.memex.model; +package nu.marginalia.memex.memex.model; public enum MemexTaskState { DONE('/', true,"done"), diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/model/MemexTaskTags.java b/other/memex/src/main/java/nu/marginalia/memex/memex/model/MemexTaskTags.java similarity index 96% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/model/MemexTaskTags.java rename to other/memex/src/main/java/nu/marginalia/memex/memex/model/MemexTaskTags.java index e19819bc..b3906241 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/model/MemexTaskTags.java +++ b/other/memex/src/main/java/nu/marginalia/memex/memex/model/MemexTaskTags.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.memex.model; +package nu.marginalia.memex.memex.model; import lombok.Getter; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/model/MemexUrl.java b/other/memex/src/main/java/nu/marginalia/memex/memex/model/MemexUrl.java similarity index 90% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/model/MemexUrl.java rename to other/memex/src/main/java/nu/marginalia/memex/memex/model/MemexUrl.java index 14cff995..9597a997 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/model/MemexUrl.java +++ b/other/memex/src/main/java/nu/marginalia/memex/memex/model/MemexUrl.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.memex.model; +package nu.marginalia.memex.memex.model; import java.util.Optional; import java.util.function.Consumer; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/model/fs/MemexDirectory.java b/other/memex/src/main/java/nu/marginalia/memex/memex/model/fs/MemexDirectory.java similarity index 75% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/model/fs/MemexDirectory.java 
rename to other/memex/src/main/java/nu/marginalia/memex/memex/model/fs/MemexDirectory.java index deacc639..a198faf8 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/model/fs/MemexDirectory.java +++ b/other/memex/src/main/java/nu/marginalia/memex/memex/model/fs/MemexDirectory.java @@ -1,9 +1,9 @@ -package nu.marginalia.wmsa.memex.model.fs; +package nu.marginalia.memex.memex.model.fs; import lombok.Getter; -import nu.marginalia.gemini.gmi.GemtextDocument; -import nu.marginalia.wmsa.memex.model.MemexImage; -import nu.marginalia.wmsa.memex.model.MemexNodeUrl; +import nu.marginalia.memex.gemini.gmi.GemtextDocument; +import nu.marginalia.memex.memex.model.MemexImage; +import nu.marginalia.memex.memex.model.MemexNodeUrl; import java.util.HashMap; import java.util.Map; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/model/fs/MemexFileSystem.java b/other/memex/src/main/java/nu/marginalia/memex/memex/model/fs/MemexFileSystem.java similarity index 93% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/model/fs/MemexFileSystem.java rename to other/memex/src/main/java/nu/marginalia/memex/memex/model/fs/MemexFileSystem.java index ffd31508..b5452f29 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/model/fs/MemexFileSystem.java +++ b/other/memex/src/main/java/nu/marginalia/memex/memex/model/fs/MemexFileSystem.java @@ -1,8 +1,8 @@ -package nu.marginalia.wmsa.memex.model.fs; +package nu.marginalia.memex.memex.model.fs; -import nu.marginalia.gemini.gmi.GemtextDocument; -import nu.marginalia.wmsa.memex.model.MemexImage; -import nu.marginalia.wmsa.memex.model.MemexNodeUrl; +import nu.marginalia.memex.gemini.gmi.GemtextDocument; +import nu.marginalia.memex.memex.model.MemexImage; +import nu.marginalia.memex.memex.model.MemexNodeUrl; import java.util.*; import java.util.concurrent.ConcurrentHashMap; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/model/render/MemexRenderCreateFormModel.java 
b/other/memex/src/main/java/nu/marginalia/memex/memex/model/render/MemexRenderCreateFormModel.java similarity index 76% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/model/render/MemexRenderCreateFormModel.java rename to other/memex/src/main/java/nu/marginalia/memex/memex/model/render/MemexRenderCreateFormModel.java index eb59bcc1..c317ac82 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/model/render/MemexRenderCreateFormModel.java +++ b/other/memex/src/main/java/nu/marginalia/memex/memex/model/render/MemexRenderCreateFormModel.java @@ -1,10 +1,10 @@ -package nu.marginalia.wmsa.memex.model.render; +package nu.marginalia.memex.memex.model.render; import lombok.Getter; import lombok.RequiredArgsConstructor; -import nu.marginalia.gemini.gmi.GemtextDocument; -import nu.marginalia.wmsa.memex.renderer.MemexHtmlRenderer; -import nu.marginalia.wmsa.memex.model.MemexNodeUrl; +import nu.marginalia.memex.gemini.gmi.GemtextDocument; +import nu.marginalia.memex.memex.renderer.MemexHtmlRenderer; +import nu.marginalia.memex.memex.model.MemexNodeUrl; import java.util.Comparator; import java.util.List; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/model/render/MemexRenderUpdateFormModel.java b/other/memex/src/main/java/nu/marginalia/memex/memex/model/render/MemexRenderUpdateFormModel.java similarity index 71% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/model/render/MemexRenderUpdateFormModel.java rename to other/memex/src/main/java/nu/marginalia/memex/memex/model/render/MemexRenderUpdateFormModel.java index 9e32ed95..12ffefff 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/model/render/MemexRenderUpdateFormModel.java +++ b/other/memex/src/main/java/nu/marginalia/memex/memex/model/render/MemexRenderUpdateFormModel.java @@ -1,9 +1,9 @@ -package nu.marginalia.wmsa.memex.model.render; +package nu.marginalia.memex.memex.model.render; import lombok.AllArgsConstructor; import lombok.Getter; 
-import nu.marginalia.wmsa.memex.renderer.MemexHtmlRenderer; -import nu.marginalia.wmsa.memex.model.MemexNodeUrl; +import nu.marginalia.memex.memex.renderer.MemexHtmlRenderer; +import nu.marginalia.memex.memex.model.MemexNodeUrl; @AllArgsConstructor @Getter public class MemexRenderUpdateFormModel implements MemexRendererableDirect { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/model/render/MemexRenderUploadFormModel.java b/other/memex/src/main/java/nu/marginalia/memex/memex/model/render/MemexRenderUploadFormModel.java similarity index 76% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/model/render/MemexRenderUploadFormModel.java rename to other/memex/src/main/java/nu/marginalia/memex/memex/model/render/MemexRenderUploadFormModel.java index e098af52..99cd3d36 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/model/render/MemexRenderUploadFormModel.java +++ b/other/memex/src/main/java/nu/marginalia/memex/memex/model/render/MemexRenderUploadFormModel.java @@ -1,10 +1,10 @@ -package nu.marginalia.wmsa.memex.model.render; +package nu.marginalia.memex.memex.model.render; import lombok.Getter; import lombok.RequiredArgsConstructor; -import nu.marginalia.gemini.gmi.GemtextDocument; -import nu.marginalia.wmsa.memex.renderer.MemexHtmlRenderer; -import nu.marginalia.wmsa.memex.model.MemexNodeUrl; +import nu.marginalia.memex.gemini.gmi.GemtextDocument; +import nu.marginalia.memex.memex.renderer.MemexHtmlRenderer; +import nu.marginalia.memex.memex.model.MemexNodeUrl; import java.util.Comparator; import java.util.List; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/model/render/MemexRendererDeleteFormModel.java b/other/memex/src/main/java/nu/marginalia/memex/memex/model/render/MemexRendererDeleteFormModel.java similarity index 72% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/model/render/MemexRendererDeleteFormModel.java rename to 
other/memex/src/main/java/nu/marginalia/memex/memex/model/render/MemexRendererDeleteFormModel.java index 84760fc2..c0b71441 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/model/render/MemexRendererDeleteFormModel.java +++ b/other/memex/src/main/java/nu/marginalia/memex/memex/model/render/MemexRendererDeleteFormModel.java @@ -1,9 +1,9 @@ -package nu.marginalia.wmsa.memex.model.render; +package nu.marginalia.memex.memex.model.render; import lombok.AllArgsConstructor; import lombok.Getter; -import nu.marginalia.wmsa.memex.renderer.MemexHtmlRenderer; -import nu.marginalia.wmsa.memex.model.MemexNodeUrl; +import nu.marginalia.memex.memex.renderer.MemexHtmlRenderer; +import nu.marginalia.memex.memex.model.MemexNodeUrl; @AllArgsConstructor @Getter public class MemexRendererDeleteFormModel implements MemexRendererableDirect { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/model/render/MemexRendererImageModel.java b/other/memex/src/main/java/nu/marginalia/memex/memex/model/render/MemexRendererImageModel.java similarity index 77% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/model/render/MemexRendererImageModel.java rename to other/memex/src/main/java/nu/marginalia/memex/memex/model/render/MemexRendererImageModel.java index d5534117..79880110 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/model/render/MemexRendererImageModel.java +++ b/other/memex/src/main/java/nu/marginalia/memex/memex/model/render/MemexRendererImageModel.java @@ -1,11 +1,11 @@ -package nu.marginalia.wmsa.memex.model.render; +package nu.marginalia.memex.memex.model.render; import lombok.AllArgsConstructor; import lombok.Getter; import lombok.SneakyThrows; -import nu.marginalia.wmsa.memex.model.MemexLink; -import nu.marginalia.wmsa.memex.model.MemexImage; -import nu.marginalia.wmsa.memex.model.MemexNodeUrl; +import nu.marginalia.memex.memex.model.MemexLink; +import nu.marginalia.memex.memex.model.MemexImage; +import 
nu.marginalia.memex.memex.model.MemexNodeUrl; import java.nio.file.Files; import java.util.Base64; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/model/render/MemexRendererIndexModel.java b/other/memex/src/main/java/nu/marginalia/memex/memex/model/render/MemexRendererIndexModel.java similarity index 82% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/model/render/MemexRendererIndexModel.java rename to other/memex/src/main/java/nu/marginalia/memex/memex/model/render/MemexRendererIndexModel.java index 2fed0510..13630727 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/model/render/MemexRendererIndexModel.java +++ b/other/memex/src/main/java/nu/marginalia/memex/memex/model/render/MemexRendererIndexModel.java @@ -1,10 +1,13 @@ -package nu.marginalia.wmsa.memex.model.render; +package nu.marginalia.memex.memex.model.render; import lombok.Getter; import lombok.RequiredArgsConstructor; -import nu.marginalia.gemini.gmi.GemtextDocument; -import nu.marginalia.gemini.gmi.renderer.GemtextRendererFactory; -import nu.marginalia.wmsa.memex.model.*; +import nu.marginalia.memex.gemini.gmi.GemtextDocument; +import nu.marginalia.memex.gemini.gmi.renderer.GemtextRendererFactory; +import nu.marginalia.memex.memex.model.MemexImage; +import nu.marginalia.memex.memex.model.MemexIndexTask; +import nu.marginalia.memex.memex.model.MemexLink; +import nu.marginalia.memex.memex.model.MemexNodeUrl; import java.util.Comparator; import java.util.List; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/model/render/MemexRendererRenameFormModel.java b/other/memex/src/main/java/nu/marginalia/memex/memex/model/render/MemexRendererRenameFormModel.java similarity index 72% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/model/render/MemexRendererRenameFormModel.java rename to other/memex/src/main/java/nu/marginalia/memex/memex/model/render/MemexRendererRenameFormModel.java index 6dcb62c4..464bca88 100644 --- 
a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/model/render/MemexRendererRenameFormModel.java +++ b/other/memex/src/main/java/nu/marginalia/memex/memex/model/render/MemexRendererRenameFormModel.java @@ -1,9 +1,9 @@ -package nu.marginalia.wmsa.memex.model.render; +package nu.marginalia.memex.memex.model.render; import lombok.AllArgsConstructor; import lombok.Getter; -import nu.marginalia.wmsa.memex.renderer.MemexHtmlRenderer; -import nu.marginalia.wmsa.memex.model.MemexNodeUrl; +import nu.marginalia.memex.memex.renderer.MemexHtmlRenderer; +import nu.marginalia.memex.memex.model.MemexNodeUrl; @AllArgsConstructor @Getter public class MemexRendererRenameFormModel implements MemexRendererableDirect { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/model/render/MemexRendererTombstoneModel.java b/other/memex/src/main/java/nu/marginalia/memex/memex/model/render/MemexRendererTombstoneModel.java similarity index 66% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/model/render/MemexRendererTombstoneModel.java rename to other/memex/src/main/java/nu/marginalia/memex/memex/model/render/MemexRendererTombstoneModel.java index 60907952..aebcb9b1 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/model/render/MemexRendererTombstoneModel.java +++ b/other/memex/src/main/java/nu/marginalia/memex/memex/model/render/MemexRendererTombstoneModel.java @@ -1,9 +1,9 @@ -package nu.marginalia.wmsa.memex.model.render; +package nu.marginalia.memex.memex.model.render; import lombok.AllArgsConstructor; import lombok.Getter; -import nu.marginalia.wmsa.memex.model.MemexLink; -import nu.marginalia.wmsa.memex.model.MemexNodeUrl; +import nu.marginalia.memex.memex.model.MemexLink; +import nu.marginalia.memex.memex.model.MemexNodeUrl; import java.util.List; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/model/render/MemexRendererViewModel.java 
b/other/memex/src/main/java/nu/marginalia/memex/memex/model/render/MemexRendererViewModel.java similarity index 75% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/model/render/MemexRendererViewModel.java rename to other/memex/src/main/java/nu/marginalia/memex/memex/model/render/MemexRendererViewModel.java index f77e4bfd..8aac05e8 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/model/render/MemexRendererViewModel.java +++ b/other/memex/src/main/java/nu/marginalia/memex/memex/model/render/MemexRendererViewModel.java @@ -1,9 +1,9 @@ -package nu.marginalia.wmsa.memex.model.render; +package nu.marginalia.memex.memex.model.render; import lombok.AllArgsConstructor; import lombok.Getter; -import nu.marginalia.gemini.gmi.GemtextDocument; -import nu.marginalia.wmsa.memex.model.MemexLink; +import nu.marginalia.memex.gemini.gmi.GemtextDocument; +import nu.marginalia.memex.memex.model.MemexLink; import java.util.List; diff --git a/other/memex/src/main/java/nu/marginalia/memex/memex/model/render/MemexRendererableDirect.java b/other/memex/src/main/java/nu/marginalia/memex/memex/model/render/MemexRendererableDirect.java new file mode 100644 index 00000000..b0631dfa --- /dev/null +++ b/other/memex/src/main/java/nu/marginalia/memex/memex/model/render/MemexRendererableDirect.java @@ -0,0 +1,7 @@ +package nu.marginalia.memex.memex.model.render; + +import nu.marginalia.memex.memex.renderer.MemexHtmlRenderer; + +public interface MemexRendererableDirect { + String render(MemexHtmlRenderer renderer); +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/renderer/MemexGmiRenderer.java b/other/memex/src/main/java/nu/marginalia/memex/memex/renderer/MemexGmiRenderer.java similarity index 95% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/renderer/MemexGmiRenderer.java rename to other/memex/src/main/java/nu/marginalia/memex/memex/renderer/MemexGmiRenderer.java index e613d746..c74b498b 100644 --- 
a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/renderer/MemexGmiRenderer.java +++ b/other/memex/src/main/java/nu/marginalia/memex/memex/renderer/MemexGmiRenderer.java @@ -1,12 +1,12 @@ -package nu.marginalia.wmsa.memex.renderer; +package nu.marginalia.memex.memex.renderer; import com.google.inject.Inject; import com.google.inject.name.Named; -import nu.marginalia.gemini.gmi.GemtextDocument; -import nu.marginalia.gemini.gmi.renderer.GemtextRendererFactory; -import nu.marginalia.wmsa.memex.MemexData; -import nu.marginalia.wmsa.memex.model.MemexNodeUrl; -import nu.marginalia.wmsa.memex.system.MemexFileWriter; +import nu.marginalia.memex.gemini.gmi.GemtextDocument; +import nu.marginalia.memex.gemini.gmi.renderer.GemtextRendererFactory; +import nu.marginalia.memex.memex.MemexData; +import nu.marginalia.memex.memex.model.MemexNodeUrl; +import nu.marginalia.memex.memex.system.MemexFileWriter; import org.slf4j.Logger; import org.slf4j.LoggerFactory; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/renderer/MemexHtmlRenderer.java b/other/memex/src/main/java/nu/marginalia/memex/memex/renderer/MemexHtmlRenderer.java similarity index 82% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/renderer/MemexHtmlRenderer.java rename to other/memex/src/main/java/nu/marginalia/memex/memex/renderer/MemexHtmlRenderer.java index a39e5763..6cfaada1 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/renderer/MemexHtmlRenderer.java +++ b/other/memex/src/main/java/nu/marginalia/memex/memex/renderer/MemexHtmlRenderer.java @@ -1,15 +1,17 @@ -package nu.marginalia.wmsa.memex.renderer; +package nu.marginalia.memex.memex.renderer; import com.google.inject.Inject; import com.google.inject.name.Named; import lombok.SneakyThrows; -import nu.marginalia.gemini.gmi.renderer.GemtextRendererFactory; -import nu.marginalia.wmsa.memex.MemexData; -import nu.marginalia.wmsa.memex.model.*; -import nu.marginalia.wmsa.memex.model.render.*; -import 
nu.marginalia.wmsa.memex.system.MemexFileWriter; -import nu.marginalia.wmsa.renderer.mustache.MustacheRenderer; -import nu.marginalia.wmsa.renderer.mustache.RendererFactory; +import nu.marginalia.memex.gemini.gmi.renderer.GemtextRendererFactory; +import nu.marginalia.memex.memex.MemexData; +import nu.marginalia.memex.memex.model.MemexIndexTask; +import nu.marginalia.memex.memex.model.MemexLink; +import nu.marginalia.memex.memex.model.MemexNodeUrl; +import nu.marginalia.memex.memex.model.render.*; +import nu.marginalia.memex.memex.system.MemexFileWriter; +import nu.marginalia.memex.renderer.MustacheRenderer; +import nu.marginalia.memex.renderer.RendererFactory; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -55,18 +57,18 @@ public class MemexHtmlRenderer { final var rendererFactory = new RendererFactory(); - viewRenderer = rendererFactory.renderer("memex/memex-view"); - indexRenderer = rendererFactory.renderer("memex/memex-index"); - indexFeedRenderer = rendererFactory.renderer("memex/memex-index-feed"); - imageRenderer = rendererFactory.renderer("memex/memex-image"); + viewRenderer = rendererFactory.renderer("static/memex/memex-view"); + indexRenderer = rendererFactory.renderer("static/memex/memex-index"); + indexFeedRenderer = rendererFactory.renderer("static/memex/memex-index-feed"); + imageRenderer = rendererFactory.renderer("static/memex/memex-image"); - tombstoneRenderer = rendererFactory.renderer("memex/memex-tombstone"); + tombstoneRenderer = rendererFactory.renderer("static/memex/memex-tombstone"); - updateFormRenderer = rendererFactory.renderer("memex/memex-update-form"); - uploadFormRenderer = rendererFactory.renderer("memex/memex-upload-form"); - deleteFormRenderer = rendererFactory.renderer("memex/memex-delete-form"); - renameFormRenderer = rendererFactory.renderer("memex/memex-rename-form"); - createFormRenderer = rendererFactory.renderer("memex/memex-create-form"); + updateFormRenderer = 
rendererFactory.renderer("static/memex/memex-update-form"); + uploadFormRenderer = rendererFactory.renderer("static/memex/memex-upload-form"); + deleteFormRenderer = rendererFactory.renderer("static/memex/memex-delete-form"); + renameFormRenderer = rendererFactory.renderer("static/memex/memex-rename-form"); + createFormRenderer = rendererFactory.renderer("static/memex/memex-create-form"); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/renderer/MemexRendererers.java b/other/memex/src/main/java/nu/marginalia/memex/memex/renderer/MemexRendererers.java similarity index 84% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/renderer/MemexRendererers.java rename to other/memex/src/main/java/nu/marginalia/memex/memex/renderer/MemexRendererers.java index b033eabf..bffaaf26 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/renderer/MemexRendererers.java +++ b/other/memex/src/main/java/nu/marginalia/memex/memex/renderer/MemexRendererers.java @@ -1,8 +1,8 @@ -package nu.marginalia.wmsa.memex.renderer; +package nu.marginalia.memex.memex.renderer; import com.google.inject.Inject; import com.google.inject.Singleton; -import nu.marginalia.wmsa.memex.model.MemexNodeUrl; +import nu.marginalia.memex.memex.model.MemexNodeUrl; @Singleton public class MemexRendererers { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/system/MemexFileSystemModifiedTimes.java b/other/memex/src/main/java/nu/marginalia/memex/memex/system/MemexFileSystemModifiedTimes.java similarity index 93% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/system/MemexFileSystemModifiedTimes.java rename to other/memex/src/main/java/nu/marginalia/memex/memex/system/MemexFileSystemModifiedTimes.java index e0f3428e..ceb5f38f 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/system/MemexFileSystemModifiedTimes.java +++ b/other/memex/src/main/java/nu/marginalia/memex/memex/system/MemexFileSystemModifiedTimes.java @@ -1,4 +1,4 @@ 
-package nu.marginalia.wmsa.memex.system; +package nu.marginalia.memex.memex.system; import com.google.inject.Singleton; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/system/MemexFileSystemMonitor.java b/other/memex/src/main/java/nu/marginalia/memex/memex/system/MemexFileSystemMonitor.java similarity index 96% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/system/MemexFileSystemMonitor.java rename to other/memex/src/main/java/nu/marginalia/memex/memex/system/MemexFileSystemMonitor.java index bbcc2b4b..e5c1a94a 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/system/MemexFileSystemMonitor.java +++ b/other/memex/src/main/java/nu/marginalia/memex/memex/system/MemexFileSystemMonitor.java @@ -1,9 +1,9 @@ -package nu.marginalia.wmsa.memex.system; +package nu.marginalia.memex.memex.system; import com.google.inject.Inject; import com.google.inject.name.Named; import lombok.SneakyThrows; -import nu.marginalia.wmsa.memex.model.MemexNodeUrl; +import nu.marginalia.memex.memex.model.MemexNodeUrl; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -69,6 +69,7 @@ public class MemexFileSystemMonitor { @SneakyThrows + @SuppressWarnings("unchecked") private void monitorWatch() { for (;;) { var key = watchService.take(); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/system/MemexFileWriter.java b/other/memex/src/main/java/nu/marginalia/memex/memex/system/MemexFileWriter.java similarity index 98% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/system/MemexFileWriter.java rename to other/memex/src/main/java/nu/marginalia/memex/memex/system/MemexFileWriter.java index 577da3d9..285d149e 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/system/MemexFileWriter.java +++ b/other/memex/src/main/java/nu/marginalia/memex/memex/system/MemexFileWriter.java @@ -1,7 +1,7 @@ -package nu.marginalia.wmsa.memex.system; +package nu.marginalia.memex.memex.system; import 
nu.marginalia.util.FileSizeUtil; -import nu.marginalia.wmsa.memex.model.MemexNodeUrl; +import nu.marginalia.memex.memex.model.MemexNodeUrl; import org.slf4j.Logger; import org.slf4j.LoggerFactory; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/system/MemexSourceFileSystem.java b/other/memex/src/main/java/nu/marginalia/memex/memex/system/MemexSourceFileSystem.java similarity index 94% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/system/MemexSourceFileSystem.java rename to other/memex/src/main/java/nu/marginalia/memex/memex/system/MemexSourceFileSystem.java index 9d165272..305a4c42 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/system/MemexSourceFileSystem.java +++ b/other/memex/src/main/java/nu/marginalia/memex/memex/system/MemexSourceFileSystem.java @@ -1,10 +1,10 @@ -package nu.marginalia.wmsa.memex.system; +package nu.marginalia.memex.memex.system; import com.google.inject.Inject; import com.google.inject.Singleton; import com.google.inject.name.Named; -import nu.marginalia.wmsa.memex.model.MemexNodeUrl; -import nu.marginalia.wmsa.memex.system.git.MemexGitRepo; +import nu.marginalia.memex.memex.system.git.MemexGitRepo; +import nu.marginalia.memex.memex.model.MemexNodeUrl; import org.slf4j.Logger; import org.slf4j.LoggerFactory; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/system/git/MemexGitRepo.java b/other/memex/src/main/java/nu/marginalia/memex/memex/system/git/MemexGitRepo.java similarity index 68% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/system/git/MemexGitRepo.java rename to other/memex/src/main/java/nu/marginalia/memex/memex/system/git/MemexGitRepo.java index d4e55491..a26375c9 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/system/git/MemexGitRepo.java +++ b/other/memex/src/main/java/nu/marginalia/memex/memex/system/git/MemexGitRepo.java @@ -1,6 +1,6 @@ -package nu.marginalia.wmsa.memex.system.git; +package 
nu.marginalia.memex.memex.system.git; -import nu.marginalia.wmsa.memex.model.MemexNodeUrl; +import nu.marginalia.memex.memex.model.MemexNodeUrl; public interface MemexGitRepo { void pull(); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/system/git/MemexGitRepoDummy.java b/other/memex/src/main/java/nu/marginalia/memex/memex/system/git/MemexGitRepoDummy.java similarity index 89% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/system/git/MemexGitRepoDummy.java rename to other/memex/src/main/java/nu/marginalia/memex/memex/system/git/MemexGitRepoDummy.java index 4d5116ff..84cc3406 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/system/git/MemexGitRepoDummy.java +++ b/other/memex/src/main/java/nu/marginalia/memex/memex/system/git/MemexGitRepoDummy.java @@ -1,7 +1,7 @@ -package nu.marginalia.wmsa.memex.system.git; +package nu.marginalia.memex.memex.system.git; import com.google.inject.Singleton; -import nu.marginalia.wmsa.memex.model.MemexNodeUrl; +import nu.marginalia.memex.memex.model.MemexNodeUrl; import org.slf4j.Logger; import org.slf4j.LoggerFactory; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/system/git/MemexGitRepoImpl.java b/other/memex/src/main/java/nu/marginalia/memex/memex/system/git/MemexGitRepoImpl.java similarity index 97% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/system/git/MemexGitRepoImpl.java rename to other/memex/src/main/java/nu/marginalia/memex/memex/system/git/MemexGitRepoImpl.java index 10c72060..e60a9db0 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/memex/system/git/MemexGitRepoImpl.java +++ b/other/memex/src/main/java/nu/marginalia/memex/memex/system/git/MemexGitRepoImpl.java @@ -1,11 +1,11 @@ -package nu.marginalia.wmsa.memex.system.git; +package nu.marginalia.memex.memex.system.git; import com.google.inject.Inject; import com.google.inject.Singleton; import com.google.inject.name.Named; import com.jcraft.jsch.JSch; import 
com.jcraft.jsch.JSchException; -import nu.marginalia.wmsa.memex.model.MemexNodeUrl; +import nu.marginalia.memex.memex.model.MemexNodeUrl; import org.eclipse.jgit.api.Git; import org.eclipse.jgit.api.errors.GitAPIException; import org.eclipse.jgit.lib.Repository; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/renderer/mustache/MustacheRenderer.java b/other/memex/src/main/java/nu/marginalia/memex/renderer/MustacheRenderer.java similarity index 95% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/renderer/mustache/MustacheRenderer.java rename to other/memex/src/main/java/nu/marginalia/memex/renderer/MustacheRenderer.java index 2910ca74..1c6e7e92 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/renderer/mustache/MustacheRenderer.java +++ b/other/memex/src/main/java/nu/marginalia/memex/renderer/MustacheRenderer.java @@ -1,14 +1,14 @@ -package nu.marginalia.wmsa.renderer.mustache; +package nu.marginalia.memex.renderer; import com.github.jknack.handlebars.*; import com.github.jknack.handlebars.helper.ConditionalHelpers; import com.github.jknack.handlebars.io.ClassPathTemplateLoader; import com.github.jknack.handlebars.io.TemplateLoader; import lombok.SneakyThrows; -import nu.marginalia.gemini.gmi.GemtextDocument; -import nu.marginalia.wmsa.memex.model.MemexNodeUrl; -import nu.marginalia.wmsa.memex.model.render.MemexRendererIndexModel; -import nu.marginalia.wmsa.memex.model.render.MemexRendererViewModel; +import nu.marginalia.memex.gemini.gmi.GemtextDocument; +import nu.marginalia.memex.memex.model.MemexNodeUrl; +import nu.marginalia.memex.memex.model.render.MemexRendererIndexModel; +import nu.marginalia.memex.memex.model.render.MemexRendererViewModel; import org.slf4j.Logger; import org.slf4j.LoggerFactory; diff --git a/other/memex/src/main/java/nu/marginalia/memex/renderer/RendererFactory.java b/other/memex/src/main/java/nu/marginalia/memex/renderer/RendererFactory.java new file mode 100644 index 00000000..96e61038 --- /dev/null +++ 
b/other/memex/src/main/java/nu/marginalia/memex/renderer/RendererFactory.java @@ -0,0 +1,13 @@ +package nu.marginalia.memex.renderer; + +import java.io.IOException; + +public class RendererFactory { + + public RendererFactory() { + } + + public MustacheRenderer renderer(String template) throws IOException { + return new MustacheRenderer<>(template); + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/graphics/dithering/FloydSteinbergDither.java b/other/memex/src/main/java/nu/marginalia/memex/util/dithering/FloydSteinbergDither.java similarity index 99% rename from marginalia_nu/src/main/java/nu/marginalia/util/graphics/dithering/FloydSteinbergDither.java rename to other/memex/src/main/java/nu/marginalia/memex/util/dithering/FloydSteinbergDither.java index 59d0f848..942d65b0 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/graphics/dithering/FloydSteinbergDither.java +++ b/other/memex/src/main/java/nu/marginalia/memex/util/dithering/FloydSteinbergDither.java @@ -1,4 +1,4 @@ -package nu.marginalia.util.graphics.dithering; +package nu.marginalia.memex.util.dithering; import lombok.AllArgsConstructor; import net.sf.image4j.util.ConvertUtil; diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/graphics/dithering/Palettes.java b/other/memex/src/main/java/nu/marginalia/memex/util/dithering/Palettes.java similarity index 92% rename from marginalia_nu/src/main/java/nu/marginalia/util/graphics/dithering/Palettes.java rename to other/memex/src/main/java/nu/marginalia/memex/util/dithering/Palettes.java index 1318993e..a8137c9a 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/graphics/dithering/Palettes.java +++ b/other/memex/src/main/java/nu/marginalia/memex/util/dithering/Palettes.java @@ -1,4 +1,4 @@ -package nu.marginalia.util.graphics.dithering; +package nu.marginalia.memex.util.dithering; public class Palettes { diff --git a/marginalia_nu/src/main/resources/static/memex/ico/dir.png 
b/other/memex/src/main/resources/static/memex/ico/dir.png similarity index 100% rename from marginalia_nu/src/main/resources/static/memex/ico/dir.png rename to other/memex/src/main/resources/static/memex/ico/dir.png diff --git a/marginalia_nu/src/main/resources/static/memex/ico/doc32.png b/other/memex/src/main/resources/static/memex/ico/doc32.png similarity index 100% rename from marginalia_nu/src/main/resources/static/memex/ico/doc32.png rename to other/memex/src/main/resources/static/memex/ico/doc32.png diff --git a/marginalia_nu/src/main/resources/static/memex/ico/file.png b/other/memex/src/main/resources/static/memex/ico/file.png similarity index 100% rename from marginalia_nu/src/main/resources/static/memex/ico/file.png rename to other/memex/src/main/resources/static/memex/ico/file.png diff --git a/marginalia_nu/src/main/resources/static/memex/ico/folder32.png b/other/memex/src/main/resources/static/memex/ico/folder32.png similarity index 100% rename from marginalia_nu/src/main/resources/static/memex/ico/folder32.png rename to other/memex/src/main/resources/static/memex/ico/folder32.png diff --git a/marginalia_nu/src/main/resources/static/memex/ico/folderup16.png b/other/memex/src/main/resources/static/memex/ico/folderup16.png similarity index 100% rename from marginalia_nu/src/main/resources/static/memex/ico/folderup16.png rename to other/memex/src/main/resources/static/memex/ico/folderup16.png diff --git a/marginalia_nu/src/main/resources/static/memex/ico/nav16.png b/other/memex/src/main/resources/static/memex/ico/nav16.png similarity index 100% rename from marginalia_nu/src/main/resources/static/memex/ico/nav16.png rename to other/memex/src/main/resources/static/memex/ico/nav16.png diff --git a/marginalia_nu/src/main/resources/static/memex/ico/pic16.png b/other/memex/src/main/resources/static/memex/ico/pic16.png similarity index 100% rename from marginalia_nu/src/main/resources/static/memex/ico/pic16.png rename to 
other/memex/src/main/resources/static/memex/ico/pic16.png diff --git a/marginalia_nu/src/main/resources/static/memex/ico/pic32.png b/other/memex/src/main/resources/static/memex/ico/pic32.png similarity index 100% rename from marginalia_nu/src/main/resources/static/memex/ico/pic32.png rename to other/memex/src/main/resources/static/memex/ico/pic32.png diff --git a/marginalia_nu/src/main/resources/static/memex/ico/root.png b/other/memex/src/main/resources/static/memex/ico/root.png similarity index 100% rename from marginalia_nu/src/main/resources/static/memex/ico/root.png rename to other/memex/src/main/resources/static/memex/ico/root.png diff --git a/marginalia_nu/src/main/resources/static/memex/ico/shiba16.png b/other/memex/src/main/resources/static/memex/ico/shiba16.png similarity index 100% rename from marginalia_nu/src/main/resources/static/memex/ico/shiba16.png rename to other/memex/src/main/resources/static/memex/ico/shiba16.png diff --git a/marginalia_nu/src/main/resources/static/memex/ico/world16.png b/other/memex/src/main/resources/static/memex/ico/world16.png similarity index 100% rename from marginalia_nu/src/main/resources/static/memex/ico/world16.png rename to other/memex/src/main/resources/static/memex/ico/world16.png diff --git a/marginalia_nu/src/main/resources/static/memex/style-new.css b/other/memex/src/main/resources/static/memex/style-new.css similarity index 100% rename from marginalia_nu/src/main/resources/static/memex/style-new.css rename to other/memex/src/main/resources/static/memex/style-new.css diff --git a/marginalia_nu/src/main/resources/templates/auth/login.hdb b/other/memex/src/main/resources/templates/auth/login.hdb similarity index 100% rename from marginalia_nu/src/main/resources/templates/auth/login.hdb rename to other/memex/src/main/resources/templates/auth/login.hdb diff --git a/marginalia_nu/src/main/resources/templates/memex/memex-create-form.hdb b/other/memex/src/main/resources/templates/memex/memex-create-form.hdb similarity 
index 100% rename from marginalia_nu/src/main/resources/templates/memex/memex-create-form.hdb rename to other/memex/src/main/resources/templates/memex/memex-create-form.hdb diff --git a/marginalia_nu/src/main/resources/templates/memex/memex-delete-form.hdb b/other/memex/src/main/resources/templates/memex/memex-delete-form.hdb similarity index 100% rename from marginalia_nu/src/main/resources/templates/memex/memex-delete-form.hdb rename to other/memex/src/main/resources/templates/memex/memex-delete-form.hdb diff --git a/marginalia_nu/src/main/resources/templates/memex/memex-image.hdb b/other/memex/src/main/resources/templates/memex/memex-image.hdb similarity index 100% rename from marginalia_nu/src/main/resources/templates/memex/memex-image.hdb rename to other/memex/src/main/resources/templates/memex/memex-image.hdb diff --git a/marginalia_nu/src/main/resources/templates/memex/memex-index-feed.hdb b/other/memex/src/main/resources/templates/memex/memex-index-feed.hdb similarity index 100% rename from marginalia_nu/src/main/resources/templates/memex/memex-index-feed.hdb rename to other/memex/src/main/resources/templates/memex/memex-index-feed.hdb diff --git a/marginalia_nu/src/main/resources/templates/memex/memex-index.hdb b/other/memex/src/main/resources/templates/memex/memex-index.hdb similarity index 100% rename from marginalia_nu/src/main/resources/templates/memex/memex-index.hdb rename to other/memex/src/main/resources/templates/memex/memex-index.hdb diff --git a/marginalia_nu/src/main/resources/templates/memex/memex-rename-form.hdb b/other/memex/src/main/resources/templates/memex/memex-rename-form.hdb similarity index 100% rename from marginalia_nu/src/main/resources/templates/memex/memex-rename-form.hdb rename to other/memex/src/main/resources/templates/memex/memex-rename-form.hdb diff --git a/marginalia_nu/src/main/resources/templates/memex/memex-tombstone.hdb b/other/memex/src/main/resources/templates/memex/memex-tombstone.hdb similarity index 100% rename 
from marginalia_nu/src/main/resources/templates/memex/memex-tombstone.hdb rename to other/memex/src/main/resources/templates/memex/memex-tombstone.hdb diff --git a/marginalia_nu/src/main/resources/templates/memex/memex-update-form.hdb b/other/memex/src/main/resources/templates/memex/memex-update-form.hdb similarity index 100% rename from marginalia_nu/src/main/resources/templates/memex/memex-update-form.hdb rename to other/memex/src/main/resources/templates/memex/memex-update-form.hdb diff --git a/marginalia_nu/src/main/resources/templates/memex/memex-upload-form.hdb b/other/memex/src/main/resources/templates/memex/memex-upload-form.hdb similarity index 100% rename from marginalia_nu/src/main/resources/templates/memex/memex-upload-form.hdb rename to other/memex/src/main/resources/templates/memex/memex-upload-form.hdb diff --git a/marginalia_nu/src/main/resources/templates/memex/memex-view.hdb b/other/memex/src/main/resources/templates/memex/memex-view.hdb similarity index 100% rename from marginalia_nu/src/main/resources/templates/memex/memex-view.hdb rename to other/memex/src/main/resources/templates/memex/memex-view.hdb diff --git a/marginalia_nu/src/main/resources/templates/memex/partial/memex-backlinks-inline.hdb b/other/memex/src/main/resources/templates/memex/partial/memex-backlinks-inline.hdb similarity index 100% rename from marginalia_nu/src/main/resources/templates/memex/partial/memex-backlinks-inline.hdb rename to other/memex/src/main/resources/templates/memex/partial/memex-backlinks-inline.hdb diff --git a/marginalia_nu/src/main/resources/templates/memex/partial/memex-backlinks.hdb b/other/memex/src/main/resources/templates/memex/partial/memex-backlinks.hdb similarity index 100% rename from marginalia_nu/src/main/resources/templates/memex/partial/memex-backlinks.hdb rename to other/memex/src/main/resources/templates/memex/partial/memex-backlinks.hdb diff --git a/marginalia_nu/src/main/resources/templates/memex/partial/memex-directories.hdb 
b/other/memex/src/main/resources/templates/memex/partial/memex-directories.hdb similarity index 100% rename from marginalia_nu/src/main/resources/templates/memex/partial/memex-directories.hdb rename to other/memex/src/main/resources/templates/memex/partial/memex-directories.hdb diff --git a/marginalia_nu/src/main/resources/templates/memex/partial/memex-documents-inline.hdb b/other/memex/src/main/resources/templates/memex/partial/memex-documents-inline.hdb similarity index 100% rename from marginalia_nu/src/main/resources/templates/memex/partial/memex-documents-inline.hdb rename to other/memex/src/main/resources/templates/memex/partial/memex-documents-inline.hdb diff --git a/marginalia_nu/src/main/resources/templates/memex/partial/memex-documents.hdb b/other/memex/src/main/resources/templates/memex/partial/memex-documents.hdb similarity index 100% rename from marginalia_nu/src/main/resources/templates/memex/partial/memex-documents.hdb rename to other/memex/src/main/resources/templates/memex/partial/memex-documents.hdb diff --git a/marginalia_nu/src/main/resources/templates/memex/partial/memex-footer.hdb b/other/memex/src/main/resources/templates/memex/partial/memex-footer.hdb similarity index 100% rename from marginalia_nu/src/main/resources/templates/memex/partial/memex-footer.hdb rename to other/memex/src/main/resources/templates/memex/partial/memex-footer.hdb diff --git a/marginalia_nu/src/main/resources/templates/memex/partial/memex-head.hdb b/other/memex/src/main/resources/templates/memex/partial/memex-head.hdb similarity index 100% rename from marginalia_nu/src/main/resources/templates/memex/partial/memex-head.hdb rename to other/memex/src/main/resources/templates/memex/partial/memex-head.hdb diff --git a/marginalia_nu/src/main/resources/templates/memex/partial/memex-images.hdb b/other/memex/src/main/resources/templates/memex/partial/memex-images.hdb similarity index 100% rename from marginalia_nu/src/main/resources/templates/memex/partial/memex-images.hdb 
rename to other/memex/src/main/resources/templates/memex/partial/memex-images.hdb diff --git a/marginalia_nu/src/main/resources/templates/memex/partial/memex-task-listing.hdb b/other/memex/src/main/resources/templates/memex/partial/memex-task-listing.hdb similarity index 100% rename from marginalia_nu/src/main/resources/templates/memex/partial/memex-task-listing.hdb rename to other/memex/src/main/resources/templates/memex/partial/memex-task-listing.hdb diff --git a/marginalia_nu/src/main/resources/templates/memex/partial/memex-topbar.hdb b/other/memex/src/main/resources/templates/memex/partial/memex-topbar.hdb similarity index 100% rename from marginalia_nu/src/main/resources/templates/memex/partial/memex-topbar.hdb rename to other/memex/src/main/resources/templates/memex/partial/memex-topbar.hdb diff --git a/marginalia_nu/src/test/java/nu/marginalia/gemini/gmi/GemtextDatabaseTest.java b/other/memex/src/test/java/nu/marginalia/gmi/GemtextDatabaseTest.java similarity index 88% rename from marginalia_nu/src/test/java/nu/marginalia/gemini/gmi/GemtextDatabaseTest.java rename to other/memex/src/test/java/nu/marginalia/gmi/GemtextDatabaseTest.java index ba4587ed..3a9cf4dd 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/gemini/gmi/GemtextDatabaseTest.java +++ b/other/memex/src/test/java/nu/marginalia/gmi/GemtextDatabaseTest.java @@ -1,6 +1,7 @@ -package nu.marginalia.gemini.gmi; +package nu.marginalia.gmi; -import nu.marginalia.wmsa.memex.model.MemexNodeUrl; +import nu.marginalia.memex.gemini.gmi.GemtextDatabase; +import nu.marginalia.memex.memex.model.MemexNodeUrl; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Test; diff --git a/marginalia_nu/src/test/java/nu/marginalia/gemini/gmi/GemtextDocumentTest.java b/other/memex/src/test/java/nu/marginalia/gmi/GemtextDocumentTest.java similarity index 90% rename from marginalia_nu/src/test/java/nu/marginalia/gemini/gmi/GemtextDocumentTest.java rename to 
other/memex/src/test/java/nu/marginalia/gmi/GemtextDocumentTest.java index 5dd8f252..1edc5832 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/gemini/gmi/GemtextDocumentTest.java +++ b/other/memex/src/test/java/nu/marginalia/gmi/GemtextDocumentTest.java @@ -1,8 +1,10 @@ -package nu.marginalia.gemini.gmi; +package nu.marginalia.gmi; -import nu.marginalia.gemini.gmi.line.GemtextLink; -import nu.marginalia.wmsa.memex.model.MemexNodeUrl; -import nu.marginalia.wmsa.memex.model.MemexUrl; +import nu.marginalia.memex.gemini.gmi.GemtextDatabase; +import nu.marginalia.memex.gemini.gmi.GemtextDocument; +import nu.marginalia.memex.gemini.gmi.line.GemtextLink; +import nu.marginalia.memex.memex.model.MemexNodeUrl; +import nu.marginalia.memex.memex.model.MemexUrl; import org.junit.jupiter.api.Test; import java.util.Arrays; diff --git a/marginalia_nu/src/test/java/nu/marginalia/gemini/gmi/parser/GemtextTaskParserTest.java b/other/memex/src/test/java/nu/marginalia/gmi/parser/GemtextTaskParserTest.java similarity index 78% rename from marginalia_nu/src/test/java/nu/marginalia/gemini/gmi/parser/GemtextTaskParserTest.java rename to other/memex/src/test/java/nu/marginalia/gmi/parser/GemtextTaskParserTest.java index 62a62082..d63d5f4c 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/gemini/gmi/parser/GemtextTaskParserTest.java +++ b/other/memex/src/test/java/nu/marginalia/gmi/parser/GemtextTaskParserTest.java @@ -1,7 +1,8 @@ -package nu.marginalia.gemini.gmi.parser; +package nu.marginalia.gmi.parser; -import nu.marginalia.wmsa.memex.model.MemexNodeHeadingId; -import nu.marginalia.wmsa.memex.model.MemexNodeTaskId; +import nu.marginalia.memex.gemini.gmi.parser.GemtextTaskParser; +import nu.marginalia.memex.memex.model.MemexNodeHeadingId; +import nu.marginalia.memex.memex.model.MemexNodeTaskId; import org.junit.jupiter.api.Test; class GemtextTaskParserTest { diff --git a/marginalia_nu/src/test/java/nu/marginalia/util/graphics/dithering/FloydSteinbergDitherTest.java 
b/other/memex/src/test/java/nu/marginalia/memex/dithering/FloydSteinbergDitherTest.java similarity index 89% rename from marginalia_nu/src/test/java/nu/marginalia/util/graphics/dithering/FloydSteinbergDitherTest.java rename to other/memex/src/test/java/nu/marginalia/memex/dithering/FloydSteinbergDitherTest.java index 632603bd..7d18977e 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/util/graphics/dithering/FloydSteinbergDitherTest.java +++ b/other/memex/src/test/java/nu/marginalia/memex/dithering/FloydSteinbergDitherTest.java @@ -1,5 +1,7 @@ -package nu.marginalia.util.graphics.dithering; +package nu.marginalia.memex.dithering; +import nu.marginalia.memex.util.dithering.FloydSteinbergDither; +import nu.marginalia.memex.util.dithering.Palettes; import org.junit.jupiter.api.Test; import javax.imageio.ImageIO; diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/memex/MemexFileWriterTest.java b/other/memex/src/test/java/nu/marginalia/memex/memex/MemexFileWriterTest.java similarity index 88% rename from marginalia_nu/src/test/java/nu/marginalia/wmsa/memex/MemexFileWriterTest.java rename to other/memex/src/test/java/nu/marginalia/memex/memex/MemexFileWriterTest.java index fb240c25..32833242 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/memex/MemexFileWriterTest.java +++ b/other/memex/src/test/java/nu/marginalia/memex/memex/MemexFileWriterTest.java @@ -1,8 +1,8 @@ -package nu.marginalia.wmsa.memex; +package nu.marginalia.memex.memex; import nu.marginalia.util.test.TestUtil; -import nu.marginalia.wmsa.memex.model.MemexNodeUrl; -import nu.marginalia.wmsa.memex.system.MemexFileWriter; +import nu.marginalia.memex.memex.model.MemexNodeUrl; +import nu.marginalia.memex.memex.system.MemexFileWriter; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/memex/MemexTest.java 
b/other/memex/src/test/java/nu/marginalia/memex/memex/MemexTest.java similarity index 78% rename from marginalia_nu/src/test/java/nu/marginalia/wmsa/memex/MemexTest.java rename to other/memex/src/test/java/nu/marginalia/memex/memex/MemexTest.java index 9ae437a2..54d80069 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/memex/MemexTest.java +++ b/other/memex/src/test/java/nu/marginalia/memex/memex/MemexTest.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.memex; +package nu.marginalia.memex.memex; import org.junit.jupiter.api.Test; diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/memex/change/GemtextChangeTest.java b/other/memex/src/test/java/nu/marginalia/memex/memex/change/GemtextChangeTest.java similarity index 93% rename from marginalia_nu/src/test/java/nu/marginalia/wmsa/memex/change/GemtextChangeTest.java rename to other/memex/src/test/java/nu/marginalia/memex/memex/change/GemtextChangeTest.java index e3e670c7..d6548042 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/memex/change/GemtextChangeTest.java +++ b/other/memex/src/test/java/nu/marginalia/memex/memex/change/GemtextChangeTest.java @@ -1,19 +1,19 @@ -package nu.marginalia.wmsa.memex.change; +package nu.marginalia.memex.memex.change; import io.reactivex.rxjava3.plugins.RxJavaPlugins; import lombok.SneakyThrows; -import nu.marginalia.gemini.GeminiServiceImpl; +import nu.marginalia.memex.gemini.GeminiServiceImpl; +import nu.marginalia.memex.memex.Memex; +import nu.marginalia.memex.memex.MemexData; +import nu.marginalia.memex.memex.MemexLoader; +import nu.marginalia.memex.memex.model.MemexNodeHeadingId; +import nu.marginalia.memex.memex.model.MemexNodeUrl; +import nu.marginalia.memex.memex.renderer.MemexRendererers; +import nu.marginalia.memex.memex.system.MemexFileSystemModifiedTimes; +import nu.marginalia.memex.memex.system.MemexFileWriter; +import nu.marginalia.memex.memex.system.MemexSourceFileSystem; +import nu.marginalia.memex.memex.system.git.MemexGitRepoImpl; import 
nu.marginalia.util.test.TestUtil; -import nu.marginalia.wmsa.memex.Memex; -import nu.marginalia.wmsa.memex.MemexData; -import nu.marginalia.wmsa.memex.MemexLoader; -import nu.marginalia.wmsa.memex.model.MemexNodeHeadingId; -import nu.marginalia.wmsa.memex.model.MemexNodeUrl; -import nu.marginalia.wmsa.memex.renderer.MemexRendererers; -import nu.marginalia.wmsa.memex.system.MemexFileSystemModifiedTimes; -import nu.marginalia.wmsa.memex.system.MemexFileWriter; -import nu.marginalia.wmsa.memex.system.MemexSourceFileSystem; -import nu.marginalia.wmsa.memex.system.git.MemexGitRepoImpl; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.BeforeEach; diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/memex/change/GemtextTaskUpdateTest.java b/other/memex/src/test/java/nu/marginalia/memex/memex/change/GemtextTaskUpdateTest.java similarity index 90% rename from marginalia_nu/src/test/java/nu/marginalia/wmsa/memex/change/GemtextTaskUpdateTest.java rename to other/memex/src/test/java/nu/marginalia/memex/memex/change/GemtextTaskUpdateTest.java index d80d32eb..06d74be4 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/memex/change/GemtextTaskUpdateTest.java +++ b/other/memex/src/test/java/nu/marginalia/memex/memex/change/GemtextTaskUpdateTest.java @@ -1,21 +1,21 @@ -package nu.marginalia.wmsa.memex.change; +package nu.marginalia.memex.memex.change; import io.reactivex.rxjava3.plugins.RxJavaPlugins; import lombok.SneakyThrows; -import nu.marginalia.gemini.GeminiServiceImpl; -import nu.marginalia.gemini.gmi.GemtextDocument; +import nu.marginalia.memex.gemini.GeminiServiceImpl; +import nu.marginalia.memex.gemini.gmi.GemtextDocument; import nu.marginalia.util.test.TestUtil; -import nu.marginalia.wmsa.memex.Memex; -import nu.marginalia.wmsa.memex.MemexData; -import nu.marginalia.wmsa.memex.MemexLoader; -import nu.marginalia.wmsa.memex.change.update.GemtextDocumentUpdateCalculator; -import 
nu.marginalia.wmsa.memex.model.MemexNodeHeadingId; -import nu.marginalia.wmsa.memex.model.MemexNodeUrl; -import nu.marginalia.wmsa.memex.renderer.MemexRendererers; -import nu.marginalia.wmsa.memex.system.MemexFileSystemModifiedTimes; -import nu.marginalia.wmsa.memex.system.MemexFileWriter; -import nu.marginalia.wmsa.memex.system.MemexSourceFileSystem; -import nu.marginalia.wmsa.memex.system.git.MemexGitRepoImpl; +import nu.marginalia.memex.memex.Memex; +import nu.marginalia.memex.memex.MemexData; +import nu.marginalia.memex.memex.MemexLoader; +import nu.marginalia.memex.memex.change.update.GemtextDocumentUpdateCalculator; +import nu.marginalia.memex.memex.model.MemexNodeHeadingId; +import nu.marginalia.memex.memex.model.MemexNodeUrl; +import nu.marginalia.memex.memex.renderer.MemexRendererers; +import nu.marginalia.memex.memex.system.MemexFileSystemModifiedTimes; +import nu.marginalia.memex.memex.system.MemexFileWriter; +import nu.marginalia.memex.memex.system.MemexSourceFileSystem; +import nu.marginalia.memex.memex.system.git.MemexGitRepoImpl; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.BeforeEach; diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/memex/change/GemtextTombstoneUpdateCaclulatorTest.java b/other/memex/src/test/java/nu/marginalia/memex/memex/change/GemtextTombstoneUpdateCaclulatorTest.java similarity index 84% rename from marginalia_nu/src/test/java/nu/marginalia/wmsa/memex/change/GemtextTombstoneUpdateCaclulatorTest.java rename to other/memex/src/test/java/nu/marginalia/memex/memex/change/GemtextTombstoneUpdateCaclulatorTest.java index 51120654..80e0b627 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/memex/change/GemtextTombstoneUpdateCaclulatorTest.java +++ b/other/memex/src/test/java/nu/marginalia/memex/memex/change/GemtextTombstoneUpdateCaclulatorTest.java @@ -1,18 +1,18 @@ -package nu.marginalia.wmsa.memex.change; +package nu.marginalia.memex.memex.change; 
import io.reactivex.rxjava3.plugins.RxJavaPlugins; import lombok.SneakyThrows; -import nu.marginalia.gemini.GeminiServiceImpl; +import nu.marginalia.memex.gemini.GeminiServiceImpl; import nu.marginalia.util.test.TestUtil; -import nu.marginalia.wmsa.memex.Memex; -import nu.marginalia.wmsa.memex.MemexData; -import nu.marginalia.wmsa.memex.MemexLoader; -import nu.marginalia.wmsa.memex.model.MemexNodeUrl; -import nu.marginalia.wmsa.memex.renderer.MemexRendererers; -import nu.marginalia.wmsa.memex.system.MemexFileSystemModifiedTimes; -import nu.marginalia.wmsa.memex.system.MemexFileWriter; -import nu.marginalia.wmsa.memex.system.MemexSourceFileSystem; -import nu.marginalia.wmsa.memex.system.git.MemexGitRepoImpl; +import nu.marginalia.memex.memex.Memex; +import nu.marginalia.memex.memex.MemexData; +import nu.marginalia.memex.memex.MemexLoader; +import nu.marginalia.memex.memex.model.MemexNodeUrl; +import nu.marginalia.memex.memex.renderer.MemexRendererers; +import nu.marginalia.memex.memex.system.MemexFileSystemModifiedTimes; +import nu.marginalia.memex.memex.system.MemexFileWriter; +import nu.marginalia.memex.memex.system.MemexSourceFileSystem; +import nu.marginalia.memex.memex.system.git.MemexGitRepoImpl; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.BeforeEach; diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/memex/model/MemexNodeHeadingIdTest.java b/other/memex/src/test/java/nu/marginalia/memex/memex/model/MemexNodeHeadingIdTest.java similarity index 96% rename from marginalia_nu/src/test/java/nu/marginalia/wmsa/memex/model/MemexNodeHeadingIdTest.java rename to other/memex/src/test/java/nu/marginalia/memex/memex/model/MemexNodeHeadingIdTest.java index f0163b3b..73506861 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/memex/model/MemexNodeHeadingIdTest.java +++ b/other/memex/src/test/java/nu/marginalia/memex/memex/model/MemexNodeHeadingIdTest.java @@ -1,4 +1,4 @@ -package 
nu.marginalia.wmsa.memex.model; +package nu.marginalia.memex.memex.model; import org.junit.jupiter.api.Test; diff --git a/other/memex/src/test/java/nu/marginalia/util/test/TestUtil.java b/other/memex/src/test/java/nu/marginalia/util/test/TestUtil.java new file mode 100644 index 00000000..c8f3735a --- /dev/null +++ b/other/memex/src/test/java/nu/marginalia/util/test/TestUtil.java @@ -0,0 +1,50 @@ +package nu.marginalia.util.test; + + +import java.io.File; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.Arrays; + +public class TestUtil { + private static boolean isTempDir(Path dir) { + return dir.startsWith("/tmp") || dir.toString().contains("tmp"); + } + + public static void clearTempDir(Path dir) { + if (!isTempDir(dir)) { + throw new IllegalArgumentException("Refusing to recursively delete directory with that name"); + } + if (Files.isDirectory(dir)) { + for (File f : dir.toFile().listFiles()) { + File[] files = f.listFiles(); + if (files != null) { + Arrays.stream(files).map(File::toPath).forEach(TestUtil::clearTempDir); + } + System.out.println("Deleting " + f + " (" + fileSize(f.toPath()) + ")"); + f.delete(); + } + } + System.out.println("Deleting " + dir); + dir.toFile().delete(); + } + + private static String fileSize(Path path) { + try { + long sizeBytes = Files.size(path); + + if (sizeBytes > 1024 * 1024 * 1024) return round(sizeBytes / 1073741824.) + "Gb"; + if (sizeBytes > 1024 * 1024) return round(sizeBytes / 1048576.) + "Mb"; + if (sizeBytes > 1024) return round(sizeBytes / 1024.) 
+ "Kb"; + return sizeBytes + "b"; + } + catch (IOException ex) { + throw new RuntimeException(ex); + } + } + + private static String round(double d) { + return String.format("%.2f", d); + } +} diff --git a/third_party/build.gradle b/other/wmsa_old/build.gradle similarity index 53% rename from third_party/build.gradle rename to other/wmsa_old/build.gradle index cc0cb57d..0c2bfb7c 100644 --- a/third_party/build.gradle +++ b/other/wmsa_old/build.gradle @@ -1,5 +1,8 @@ plugins { id 'java' + id "io.freefair.lombok" version "5.3.3.3" + + id 'jvm-test-suite' } repositories { @@ -26,92 +29,86 @@ java { languageVersion.set(JavaLanguageVersion.of(17)) } } - dependencies { - implementation 'junit:junit:4.13.2' - testImplementation 'org.junit.jupiter:junit-jupiter-api:5.8.2' - testRuntimeOnly 'org.junit.jupiter:junit-jupiter-engine' + implementation project(':third-party') + implementation project(':protocol') + implementation project(':common:service') + implementation project(':common:service-discovery') + implementation project(':common:service-client') - implementation 'org.projectlombok:lombok:1.18.22' - annotationProcessor 'org.projectlombok:lombok:1.18.22' + implementation 'org.jetbrains:annotations:24.0.0' - testCompileOnly 'org.projectlombok:lombok:1.18.22' - testImplementation 'org.projectlombok:lombok:1.18.22' - testAnnotationProcessor 'org.projectlombok:lombok:1.18.22' - - implementation 'com.github.jknack:handlebars:4.3.0' + implementation 'org.projectlombok:lombok:1.18.24' + annotationProcessor 'org.projectlombok:lombok:1.18.24' + implementation 'com.github.jknack:handlebars:4.3.1' implementation 'com.github.jknack:handlebars-markdown:4.2.1' implementation group: 'com.google.code.gson', name: 'gson', version: '2.9.0' - implementation 'io.reactivex.rxjava3:rxjava:3.1.4' + implementation 'io.reactivex.rxjava3:rxjava:3.1.5' implementation "com.sparkjava:spark-core:2.9.3" - implementation 'com.opencsv:opencsv:5.6' - implementation group: 'org.apache.logging.log4j', 
name: 'log4j-api', version: '2.17.1' - implementation group: 'org.apache.logging.log4j', name: 'log4j-core', version: '2.17.1' - implementation group: 'org.apache.logging.log4j', name: 'log4j-slf4j-impl', version: '2.17.1' - implementation group: 'org.apache.logging.log4j', name: 'log4j-api', version: '2.17.1' - implementation group: 'org.apache.logging.log4j', name: 'log4j-core', version: '2.17.1' - implementation group: 'org.apache.logging.log4j', name: 'log4j-slf4j-impl', version: '2.17.1' + implementation group: 'org.apache.logging.log4j', name: 'log4j-api', version: '2.17.2' + implementation group: 'org.apache.logging.log4j', name: 'log4j-core', version: '2.17.2' + implementation group: 'org.apache.logging.log4j', name: 'log4j-slf4j-impl', version: '2.17.2' implementation 'org.slf4j:slf4j-api:1.7.36' + testImplementation 'org.slf4j:slf4j-jdk14:2.0.3' implementation 'com.google.guava:guava:31.1-jre' implementation 'com.google.inject:guice:5.1.0' - implementation 'com.github.jnr:jnr-ffi:2.1.1' + implementation 'com.github.jnr:jnr-ffi:2.2.12' implementation 'org.apache.httpcomponents:httpcore:4.4.15' implementation 'org.apache.httpcomponents:httpclient:4.5.13' - implementation 'com.github.ThatJavaNerd:JRAW:1.1.0' - implementation group: 'com.h2database', name: 'h2', version: '2.1.210' - testImplementation group: 'org.mockito', name: 'mockito-core', version: '4.3.1' - - implementation 'org.jsoup:jsoup:1.14.3' + implementation 'org.jsoup:jsoup:1.15.3' implementation group: 'com.github.crawler-commons', name: 'crawler-commons', version: '1.2' - implementation 'org.mariadb.jdbc:mariadb-java-client:3.0.3' + implementation 'org.mariadb.jdbc:mariadb-java-client:3.0.6' implementation group: 'net.sf.trove4j', name: 'trove4j', version: '3.0.3' implementation 'com.zaxxer:HikariCP:5.0.1' implementation 'org.apache.opennlp:opennlp-tools:1.9.4' - implementation 'io.prometheus:simpleclient:0.15.0' - implementation 'io.prometheus:simpleclient_servlet:0.15.0' - implementation 
'io.prometheus:simpleclient_httpserver:0.15.0' - implementation 'io.prometheus:simpleclient_hotspot:0.15.0' - implementation 'com.fasterxml.jackson.core:jackson-databind:2.13.2.1' - implementation 'org.apache.opennlp:opennlp-tools:1.9.4' - implementation 'io.prometheus:simpleclient:0.15.0' - implementation 'io.prometheus:simpleclient_servlet:0.15.0' - implementation 'io.prometheus:simpleclient_httpserver:0.15.0' - implementation 'io.prometheus:simpleclient_hotspot:0.15.0' - implementation 'com.fasterxml.jackson.core:jackson-databind:2.13.2.1' + implementation 'io.prometheus:simpleclient:0.16.0' + implementation 'io.prometheus:simpleclient_servlet:0.16.0' + implementation 'io.prometheus:simpleclient_httpserver:0.16.0' + implementation 'io.prometheus:simpleclient_hotspot:0.16.0' + implementation 'com.fasterxml.jackson.core:jackson-databind:2.13.3' implementation group: 'org.yaml', name: 'snakeyaml', version: '1.30' - implementation 'com.syncthemall:boilerpipe:1.2.2' - implementation 'com.github.luben:zstd-jni:1.5.2-2' - implementation 'com.github.vladimir-bukhtoyarov:bucket4j-core:7.3.0' - implementation 'de.rototor.jeuclid:jeuclid-core:3.1.14' - - implementation 'org.imgscalr:imgscalr-lib:4.2' - implementation 'org.jclarion:image4j:0.7' - - implementation 'commons-net:commons-net:3.6' + implementation 'commons-net:commons-net:3.8.0' implementation 'org.eclipse.jgit:org.eclipse.jgit:5.12.0.202106070339-r' implementation 'org.eclipse.jgit:org.eclipse.jgit.ssh.jsch:5.12.0.202106070339-r' - implementation 'com.jcraft:jsch:0.1.55' implementation group: 'org.apache.commons', name: 'commons-compress', version: '1.21' implementation 'edu.stanford.nlp:stanford-corenlp:4.4.0' implementation group: 'it.unimi.dsi', name: 'fastutil', version: '8.5.8' - implementation 'org.roaringbitmap:RoaringBitmap:[0.6,)' - implementation group: 'mysql', name: 'mysql-connector-java', version: '8.0.29' + implementation 'org.roaringbitmap:RoaringBitmap:0.9.32' + implementation group: 'mysql', 
name: 'mysql-connector-java', version: '8.0.29' implementation 'com.github.Marcono1234:gson-record-type-adapter-factory:0.2.0' + + testImplementation 'org.junit.jupiter:junit-jupiter-api:5.8.2' + testImplementation 'org.mockito:mockito-junit-jupiter:4.5.1' + testRuntimeOnly 'org.junit.jupiter:junit-jupiter-engine' + testCompileOnly 'org.projectlombok:lombok:1.18.24' + testImplementation 'org.projectlombok:lombok:1.18.24' + testAnnotationProcessor 'org.projectlombok:lombok:1.18.24' + + testImplementation group: 'org.mockito', name: 'mockito-core', version: '4.5.1' + + implementation 'net.agkn:hll:1.6.0' + } test { useJUnitPlatform() } + +task fastTests(type: Test) { + useJUnitPlatform { + excludeTags "slow" + } +} diff --git a/other/wmsa_old/src/main/java/nu/marginalia/wmsa/WmsaHome.java b/other/wmsa_old/src/main/java/nu/marginalia/wmsa/WmsaHome.java new file mode 100644 index 00000000..acf53e8f --- /dev/null +++ b/other/wmsa_old/src/main/java/nu/marginalia/wmsa/WmsaHome.java @@ -0,0 +1,56 @@ +package nu.marginalia.wmsa; + + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.Optional; +import java.util.Properties; + +public class WmsaHome { + private static final String DEFAULT = "/var/lib/wmsa"; + + public static Path getHomePath() { + var retStr = Optional.ofNullable(System.getenv("WMSA_HOME")).orElse(DEFAULT); + + var ret = Path.of(retStr); + if (!Files.isDirectory(ret)) { + throw new IllegalStateException("Could not find WMSA_HOME, either set environment variable or ensure " + DEFAULT + " exists"); + } + return ret; + } + + public static Path getDisk(String name) { + var pathStr = getDiskProperties().getProperty(name); + if (null == pathStr) { + throw new RuntimeException("Disk " + name + " was not configured"); + } + Path p = Path.of(pathStr); + if (!Files.isDirectory(p)) { + throw new RuntimeException("Disk " + name + " does not exist or is not a directory!"); + } + return p; + } + + public static Properties 
getDiskProperties() { + Path settingsFile = getHomePath().resolve("conf/disks.properties"); + + if (!Files.isRegularFile(settingsFile)) { + throw new RuntimeException("Could not find disk settings " + settingsFile); + } + + try (var is = Files.newInputStream(settingsFile)) { + var props = new Properties(); + props.load(is); + return props; + } + catch (IOException ex) { + throw new RuntimeException(ex); + } + } + + private static final boolean debugMode = Boolean.getBoolean("wmsa-debug"); + public static boolean isDebug() { + return debugMode; + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/podcasts/PodcastFetcher.java b/other/wmsa_old/src/main/java/nu/marginalia/wmsa/podcasts/PodcastFetcher.java similarity index 100% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/podcasts/PodcastFetcher.java rename to other/wmsa_old/src/main/java/nu/marginalia/wmsa/podcasts/PodcastFetcher.java diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/podcasts/PodcastScraperMain.java b/other/wmsa_old/src/main/java/nu/marginalia/wmsa/podcasts/PodcastScraperMain.java similarity index 58% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/podcasts/PodcastScraperMain.java rename to other/wmsa_old/src/main/java/nu/marginalia/wmsa/podcasts/PodcastScraperMain.java index 22dfe1f3..8e006d96 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/podcasts/PodcastScraperMain.java +++ b/other/wmsa_old/src/main/java/nu/marginalia/wmsa/podcasts/PodcastScraperMain.java @@ -3,12 +3,11 @@ package nu.marginalia.wmsa.podcasts; import com.google.inject.Guice; import com.google.inject.Inject; import com.google.inject.Injector; -import nu.marginalia.wmsa.configuration.MainClass; -import nu.marginalia.wmsa.configuration.ServiceDescriptor; -import nu.marginalia.wmsa.configuration.module.ConfigurationModule; -import nu.marginalia.wmsa.configuration.server.Initialization; - -import java.io.IOException; +import nu.marginalia.service.MainClass; +import 
nu.marginalia.service.id.ServiceId; +import nu.marginalia.service.module.ConfigurationModule; +import nu.marginalia.service.server.Initialization; +import nu.marginalia.wmsa.renderer.WmsaServiceDescriptors; public class PodcastScraperMain extends MainClass { @@ -20,10 +19,11 @@ public class PodcastScraperMain extends MainClass { } public static void main(String... args) { - init(ServiceDescriptor.PODCST_SCRAPER, args); + + init(ServiceId.Other_PodcastScraper, args); Injector injector = Guice.createInjector( - new ConfigurationModule()); + new ConfigurationModule(WmsaServiceDescriptors.descriptors, ServiceId.Other_PodcastScraper)); injector.getInstance(PodcastScraperMain.class); injector.getInstance(Initialization.class).setReady(); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/podcasts/PodcastScraperService.java b/other/wmsa_old/src/main/java/nu/marginalia/wmsa/podcasts/PodcastScraperService.java similarity index 92% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/podcasts/PodcastScraperService.java rename to other/wmsa_old/src/main/java/nu/marginalia/wmsa/podcasts/PodcastScraperService.java index a36ec3ce..8c60c8d8 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/podcasts/PodcastScraperService.java +++ b/other/wmsa_old/src/main/java/nu/marginalia/wmsa/podcasts/PodcastScraperService.java @@ -3,10 +3,10 @@ package nu.marginalia.wmsa.podcasts; import com.google.inject.Inject; import com.google.inject.name.Named; import io.reactivex.rxjava3.schedulers.Schedulers; -import nu.marginalia.wmsa.configuration.server.Context; -import nu.marginalia.wmsa.configuration.server.Initialization; -import nu.marginalia.wmsa.configuration.server.MetricsServer; -import nu.marginalia.wmsa.configuration.server.Service; +import nu.marginalia.client.Context; +import nu.marginalia.service.server.Initialization; +import nu.marginalia.service.server.MetricsServer; +import nu.marginalia.service.server.Service; import 
nu.marginalia.wmsa.podcasts.model.Podcast; import nu.marginalia.wmsa.podcasts.model.PodcastEpisode; import nu.marginalia.wmsa.renderer.client.RendererClient; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/podcasts/model/Podcast.java b/other/wmsa_old/src/main/java/nu/marginalia/wmsa/podcasts/model/Podcast.java similarity index 100% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/podcasts/model/Podcast.java rename to other/wmsa_old/src/main/java/nu/marginalia/wmsa/podcasts/model/Podcast.java diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/podcasts/model/PodcastEpisode.java b/other/wmsa_old/src/main/java/nu/marginalia/wmsa/podcasts/model/PodcastEpisode.java similarity index 100% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/podcasts/model/PodcastEpisode.java rename to other/wmsa_old/src/main/java/nu/marginalia/wmsa/podcasts/model/PodcastEpisode.java diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/podcasts/model/PodcastListing.java b/other/wmsa_old/src/main/java/nu/marginalia/wmsa/podcasts/model/PodcastListing.java similarity index 100% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/podcasts/model/PodcastListing.java rename to other/wmsa_old/src/main/java/nu/marginalia/wmsa/podcasts/model/PodcastListing.java diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/podcasts/model/PodcastMetadata.java b/other/wmsa_old/src/main/java/nu/marginalia/wmsa/podcasts/model/PodcastMetadata.java similarity index 100% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/podcasts/model/PodcastMetadata.java rename to other/wmsa_old/src/main/java/nu/marginalia/wmsa/podcasts/model/PodcastMetadata.java diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/podcasts/model/PodcastNewEpisodes.java b/other/wmsa_old/src/main/java/nu/marginalia/wmsa/podcasts/model/PodcastNewEpisodes.java similarity index 100% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/podcasts/model/PodcastNewEpisodes.java 
rename to other/wmsa_old/src/main/java/nu/marginalia/wmsa/podcasts/model/PodcastNewEpisodes.java diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/renderer/PodcastRendererService.java b/other/wmsa_old/src/main/java/nu/marginalia/wmsa/renderer/PodcastRendererService.java similarity index 96% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/renderer/PodcastRendererService.java rename to other/wmsa_old/src/main/java/nu/marginalia/wmsa/renderer/PodcastRendererService.java index cad1ad3e..3398a2ff 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/renderer/PodcastRendererService.java +++ b/other/wmsa_old/src/main/java/nu/marginalia/wmsa/renderer/PodcastRendererService.java @@ -3,12 +3,11 @@ package nu.marginalia.wmsa.renderer; import com.google.gson.Gson; import com.google.inject.Inject; import lombok.SneakyThrows; -import nu.marginalia.wmsa.client.GsonFactory; -import nu.marginalia.wmsa.configuration.server.Context; +import nu.marginalia.client.Context; import nu.marginalia.wmsa.podcasts.model.Podcast; -import nu.marginalia.wmsa.podcasts.model.PodcastEpisode; import nu.marginalia.wmsa.podcasts.model.PodcastListing; import nu.marginalia.wmsa.podcasts.model.PodcastNewEpisodes; +import nu.marginalia.wmsa.podcasts.model.PodcastEpisode; import nu.marginalia.wmsa.renderer.mustache.MustacheRenderer; import nu.marginalia.wmsa.renderer.mustache.RendererFactory; import nu.marginalia.wmsa.resource_store.ResourceStoreClient; @@ -25,7 +24,7 @@ import java.util.concurrent.TimeUnit; public class PodcastRendererService { private final Logger logger = LoggerFactory.getLogger(getClass()); - private final Gson gson = GsonFactory.get(); + private final Gson gson = new Gson(); private final RendererFactory rendererFactory = new RendererFactory(); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/renderer/RendererMain.java b/other/wmsa_old/src/main/java/nu/marginalia/wmsa/renderer/RendererMain.java similarity index 63% rename from 
marginalia_nu/src/main/java/nu/marginalia/wmsa/renderer/RendererMain.java rename to other/wmsa_old/src/main/java/nu/marginalia/wmsa/renderer/RendererMain.java index f3802a4f..98d7fd83 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/renderer/RendererMain.java +++ b/other/wmsa_old/src/main/java/nu/marginalia/wmsa/renderer/RendererMain.java @@ -3,12 +3,10 @@ package nu.marginalia.wmsa.renderer; import com.google.inject.Guice; import com.google.inject.Inject; import com.google.inject.Injector; -import nu.marginalia.wmsa.configuration.MainClass; -import nu.marginalia.wmsa.configuration.ServiceDescriptor; -import nu.marginalia.wmsa.configuration.module.ConfigurationModule; -import nu.marginalia.wmsa.configuration.server.Initialization; - -import java.io.IOException; +import nu.marginalia.service.MainClass; +import nu.marginalia.service.id.ServiceId; +import nu.marginalia.service.module.ConfigurationModule; +import nu.marginalia.service.server.Initialization; public class RendererMain extends MainClass { private final RendererService service; @@ -20,11 +18,11 @@ public class RendererMain extends MainClass { } public static void main(String... 
args) { - init(ServiceDescriptor.RENDERER, args); + init(ServiceId.Other_Renderer, args); Injector injector = Guice.createInjector( new RendererModule(), - new ConfigurationModule()); + new ConfigurationModule(WmsaServiceDescriptors.descriptors, ServiceId.Other_Renderer)); injector.getInstance(RendererMain.class); injector.getInstance(Initialization.class).setReady(); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/renderer/RendererModule.java b/other/wmsa_old/src/main/java/nu/marginalia/wmsa/renderer/RendererModule.java similarity index 100% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/renderer/RendererModule.java rename to other/wmsa_old/src/main/java/nu/marginalia/wmsa/renderer/RendererModule.java diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/renderer/RendererService.java b/other/wmsa_old/src/main/java/nu/marginalia/wmsa/renderer/RendererService.java similarity index 76% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/renderer/RendererService.java rename to other/wmsa_old/src/main/java/nu/marginalia/wmsa/renderer/RendererService.java index fb3d0e9e..86dfcc9a 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/renderer/RendererService.java +++ b/other/wmsa_old/src/main/java/nu/marginalia/wmsa/renderer/RendererService.java @@ -3,9 +3,9 @@ package nu.marginalia.wmsa.renderer; import com.google.inject.Inject; import com.google.inject.name.Named; -import nu.marginalia.wmsa.configuration.server.Initialization; -import nu.marginalia.wmsa.configuration.server.MetricsServer; -import nu.marginalia.wmsa.configuration.server.Service; +import nu.marginalia.service.server.Initialization; +import nu.marginalia.service.server.MetricsServer; +import nu.marginalia.service.server.Service; import nu.marginalia.wmsa.resource_store.ResourceStoreClient; @@ -18,7 +18,6 @@ public class RendererService extends Service { @Named("service-host") String ip, @Named("service-port") Integer port, PodcastRendererService 
podcastRendererService, - StatusRendererService statusRendererService, Initialization initialization, MetricsServer metricsServer ) { @@ -27,7 +26,6 @@ public class RendererService extends Service { this.resourceStoreClient = resourceStoreClient; podcastRendererService.start(); - statusRendererService.start(); } public boolean isReady() { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/renderer/ServerStatusModel.java b/other/wmsa_old/src/main/java/nu/marginalia/wmsa/renderer/ServerStatusModel.java similarity index 100% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/renderer/ServerStatusModel.java rename to other/wmsa_old/src/main/java/nu/marginalia/wmsa/renderer/ServerStatusModel.java diff --git a/other/wmsa_old/src/main/java/nu/marginalia/wmsa/renderer/WmsaServiceDescriptors.java b/other/wmsa_old/src/main/java/nu/marginalia/wmsa/renderer/WmsaServiceDescriptors.java new file mode 100644 index 00000000..ca25440f --- /dev/null +++ b/other/wmsa_old/src/main/java/nu/marginalia/wmsa/renderer/WmsaServiceDescriptors.java @@ -0,0 +1,17 @@ +package nu.marginalia.wmsa.renderer; + +import nu.marginalia.service.descriptor.ServiceDescriptor; +import nu.marginalia.service.descriptor.ServiceDescriptors; +import nu.marginalia.service.id.ServiceId; + +import java.util.List; + +public class WmsaServiceDescriptors { + public static ServiceDescriptors descriptors = new ServiceDescriptors( + List.of( + new ServiceDescriptor(ServiceId.Other_ResourceStore, 5000), + new ServiceDescriptor(ServiceId.Other_Renderer, 5002), + new ServiceDescriptor(ServiceId.Other_PodcastScraper, 5013))); + + +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/renderer/client/RendererClient.java b/other/wmsa_old/src/main/java/nu/marginalia/wmsa/renderer/client/RendererClient.java similarity index 73% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/renderer/client/RendererClient.java rename to 
other/wmsa_old/src/main/java/nu/marginalia/wmsa/renderer/client/RendererClient.java index 63537e3b..809a7cfa 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/renderer/client/RendererClient.java +++ b/other/wmsa_old/src/main/java/nu/marginalia/wmsa/renderer/client/RendererClient.java @@ -1,25 +1,28 @@ package nu.marginalia.wmsa.renderer.client; +import com.google.gson.Gson; import io.reactivex.rxjava3.core.Observable; import lombok.SneakyThrows; -import nu.marginalia.wmsa.client.AbstractDynamicClient; -import nu.marginalia.wmsa.client.HttpStatusCode; -import nu.marginalia.wmsa.client.exception.TimeoutException; -import nu.marginalia.wmsa.configuration.ServiceDescriptor; -import nu.marginalia.wmsa.configuration.server.Context; +import nu.marginalia.client.AbstractDynamicClient; +import nu.marginalia.client.Context; +import nu.marginalia.client.HttpStatusCode; +import nu.marginalia.service.descriptor.HostsFile; +import nu.marginalia.service.descriptor.ServiceDescriptors; +import nu.marginalia.service.id.ServiceId; import nu.marginalia.wmsa.podcasts.model.Podcast; import nu.marginalia.wmsa.podcasts.model.PodcastEpisode; import nu.marginalia.wmsa.podcasts.model.PodcastListing; import nu.marginalia.wmsa.podcasts.model.PodcastNewEpisodes; +import nu.marginalia.client.exception.TimeoutException; import javax.inject.Inject; import java.util.concurrent.TimeUnit; -public class RendererClient extends AbstractDynamicClient{ +public class RendererClient extends AbstractDynamicClient { @Inject - public RendererClient() { - super(ServiceDescriptor.RENDERER); + public RendererClient(ServiceDescriptors descriptors) { + super(descriptors.forId(ServiceId.Index), new HostsFile(), Gson::new); } @SneakyThrows diff --git a/other/wmsa_old/src/main/java/nu/marginalia/wmsa/renderer/mustache/MustacheRenderer.java b/other/wmsa_old/src/main/java/nu/marginalia/wmsa/renderer/mustache/MustacheRenderer.java new file mode 100644 index 00000000..90e50754 --- /dev/null +++ 
b/other/wmsa_old/src/main/java/nu/marginalia/wmsa/renderer/mustache/MustacheRenderer.java @@ -0,0 +1,61 @@ +package nu.marginalia.wmsa.renderer.mustache; + +import com.github.jknack.handlebars.*; +import com.github.jknack.handlebars.helper.ConditionalHelpers; +import com.github.jknack.handlebars.io.ClassPathTemplateLoader; +import com.github.jknack.handlebars.io.TemplateLoader; +import lombok.SneakyThrows; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.FileNotFoundException; +import java.io.IOException; +import java.util.List; +import java.util.Map; + +public class MustacheRenderer { + Template template; + private final Logger logger = LoggerFactory.getLogger(getClass()); + + MustacheRenderer(String templateFile) throws IOException { + + TemplateLoader loader = new ClassPathTemplateLoader(); + loader.setPrefix("/templates"); + loader.setSuffix(".hdb"); + + var handlebars = new Handlebars(loader); + handlebars.registerHelpers(ConditionalHelpers.class); + handlebars.registerHelper("md", new MarkdownHelper()); + + try { + template = handlebars.compile(templateFile); + } + catch (FileNotFoundException ex) { + logger.error("Kunde inte ladda template " + templateFile, ex); + System.exit(2); + } + catch (HandlebarsException ex) { + logger.error("Kunde inte instantiera mall " + templateFile, ex); + System.exit(2); + } + } + + @SneakyThrows + public String render(T model) { + return template.apply(model); + } + + @SneakyThrows + public String render(T model, String name, List children) { + Context ctx = Context.newBuilder(model).combine(name, children).build(); + + return template.apply(ctx); + } + + @SneakyThrows + public String render(T model, Map children) { + Context ctx = Context.newBuilder(model).combine(children).build(); + return template.apply(ctx); + } + +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/renderer/mustache/RendererFactory.java 
b/other/wmsa_old/src/main/java/nu/marginalia/wmsa/renderer/mustache/RendererFactory.java similarity index 100% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/renderer/mustache/RendererFactory.java rename to other/wmsa_old/src/main/java/nu/marginalia/wmsa/renderer/mustache/RendererFactory.java diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/resource_store/ResourceEntityStore.java b/other/wmsa_old/src/main/java/nu/marginalia/wmsa/resource_store/ResourceEntityStore.java similarity index 98% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/resource_store/ResourceEntityStore.java rename to other/wmsa_old/src/main/java/nu/marginalia/wmsa/resource_store/ResourceEntityStore.java index 5ca3d60b..ebfda218 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/resource_store/ResourceEntityStore.java +++ b/other/wmsa_old/src/main/java/nu/marginalia/wmsa/resource_store/ResourceEntityStore.java @@ -4,7 +4,6 @@ import com.google.gson.Gson; import com.google.inject.name.Named; import io.prometheus.client.Counter; import io.reactivex.rxjava3.schedulers.Schedulers; -import nu.marginalia.wmsa.client.GsonFactory; import nu.marginalia.wmsa.resource_store.model.RenderedResource; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -33,7 +32,7 @@ public class ResourceEntityStore { private final Logger logger = LoggerFactory.getLogger(getClass()); private final Path dataPath; - private final Gson gson = GsonFactory.get(); + private final Gson gson = new Gson(); private final Base64.Encoder b64encoder = Base64.getEncoder(); private final static Counter wmsa_resource_store_count diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/resource_store/ResourceStoreClient.java b/other/wmsa_old/src/main/java/nu/marginalia/wmsa/resource_store/ResourceStoreClient.java similarity index 71% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/resource_store/ResourceStoreClient.java rename to 
other/wmsa_old/src/main/java/nu/marginalia/wmsa/resource_store/ResourceStoreClient.java index b057d450..d9333638 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/resource_store/ResourceStoreClient.java +++ b/other/wmsa_old/src/main/java/nu/marginalia/wmsa/resource_store/ResourceStoreClient.java @@ -1,13 +1,16 @@ package nu.marginalia.wmsa.resource_store; +import com.google.gson.Gson; import io.reactivex.rxjava3.core.Observable; import io.reactivex.rxjava3.schedulers.Schedulers; -import nu.marginalia.wmsa.client.AbstractDynamicClient; -import nu.marginalia.wmsa.client.HttpStatusCode; -import nu.marginalia.wmsa.client.exception.TimeoutException; -import nu.marginalia.wmsa.configuration.ServiceDescriptor; -import nu.marginalia.wmsa.configuration.server.Context; +import nu.marginalia.client.AbstractDynamicClient; +import nu.marginalia.client.Context; +import nu.marginalia.client.HttpStatusCode; +import nu.marginalia.service.descriptor.HostsFile; +import nu.marginalia.service.descriptor.ServiceDescriptors; +import nu.marginalia.service.id.ServiceId; import nu.marginalia.wmsa.resource_store.model.RenderedResource; +import nu.marginalia.client.exception.TimeoutException; import javax.inject.Inject; import javax.inject.Singleton; @@ -16,11 +19,11 @@ import java.util.concurrent.TimeUnit; import java.util.function.Supplier; @Singleton -public class ResourceStoreClient extends AbstractDynamicClient{ +public class ResourceStoreClient extends AbstractDynamicClient { @Inject - public ResourceStoreClient() { - super(ServiceDescriptor.RESOURCE_STORE); + public ResourceStoreClient(ServiceDescriptors descriptors) { + super(descriptors.forId(ServiceId.Other_ResourceStore), new HostsFile(), Gson::new); } public Observable getResource(Context ctx, String domain, String resource) { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/resource_store/ResourceStoreMain.java b/other/wmsa_old/src/main/java/nu/marginalia/wmsa/resource_store/ResourceStoreMain.java similarity 
index 60% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/resource_store/ResourceStoreMain.java rename to other/wmsa_old/src/main/java/nu/marginalia/wmsa/resource_store/ResourceStoreMain.java index ad903fc5..f3e84dee 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/resource_store/ResourceStoreMain.java +++ b/other/wmsa_old/src/main/java/nu/marginalia/wmsa/resource_store/ResourceStoreMain.java @@ -3,12 +3,11 @@ package nu.marginalia.wmsa.resource_store; import com.google.inject.Guice; import com.google.inject.Inject; import com.google.inject.Injector; -import nu.marginalia.wmsa.configuration.MainClass; -import nu.marginalia.wmsa.configuration.ServiceDescriptor; -import nu.marginalia.wmsa.configuration.module.ConfigurationModule; -import nu.marginalia.wmsa.configuration.server.Initialization; - -import java.io.IOException; +import nu.marginalia.service.MainClass; +import nu.marginalia.service.id.ServiceId; +import nu.marginalia.service.module.ConfigurationModule; +import nu.marginalia.service.server.Initialization; +import nu.marginalia.wmsa.renderer.WmsaServiceDescriptors; public class ResourceStoreMain extends MainClass { private final ResourceStoreService service; @@ -16,15 +15,14 @@ public class ResourceStoreMain extends MainClass { @Inject public ResourceStoreMain(ResourceStoreService service) { this.service = service; - } public static void main(String... 
args) { - init(ServiceDescriptor.RESOURCE_STORE, args); + init(ServiceId.Other_ResourceStore, args); Injector injector = Guice.createInjector( new ResourceStoreModule(), - new ConfigurationModule() + new ConfigurationModule(WmsaServiceDescriptors.descriptors, ServiceId.Other_ResourceStore) ); injector.getInstance(ResourceStoreMain.class); injector.getInstance(Initialization.class).setReady(); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/resource_store/ResourceStoreModule.java b/other/wmsa_old/src/main/java/nu/marginalia/wmsa/resource_store/ResourceStoreModule.java similarity index 87% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/resource_store/ResourceStoreModule.java rename to other/wmsa_old/src/main/java/nu/marginalia/wmsa/resource_store/ResourceStoreModule.java index 06443dc8..70db9271 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/resource_store/ResourceStoreModule.java +++ b/other/wmsa_old/src/main/java/nu/marginalia/wmsa/resource_store/ResourceStoreModule.java @@ -2,7 +2,7 @@ package nu.marginalia.wmsa.resource_store; import com.google.inject.AbstractModule; import com.google.inject.name.Names; -import nu.marginalia.wmsa.configuration.WmsaHome; +import nu.marginalia.wmsa.WmsaHome; import java.nio.file.Path; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/resource_store/ResourceStoreService.java b/other/wmsa_old/src/main/java/nu/marginalia/wmsa/resource_store/ResourceStoreService.java similarity index 80% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/resource_store/ResourceStoreService.java rename to other/wmsa_old/src/main/java/nu/marginalia/wmsa/resource_store/ResourceStoreService.java index 7db4ffe7..45415275 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/resource_store/ResourceStoreService.java +++ b/other/wmsa_old/src/main/java/nu/marginalia/wmsa/resource_store/ResourceStoreService.java @@ -4,14 +4,12 @@ import com.google.gson.Gson; import com.google.inject.Inject; import 
com.google.inject.name.Named; import io.reactivex.rxjava3.schedulers.Schedulers; -import kotlin.text.Charsets; import lombok.SneakyThrows; -import nu.marginalia.wmsa.auth.client.AuthClient; -import nu.marginalia.wmsa.client.GsonFactory; -import nu.marginalia.wmsa.configuration.server.Context; -import nu.marginalia.wmsa.configuration.server.Initialization; -import nu.marginalia.wmsa.configuration.server.MetricsServer; -import nu.marginalia.wmsa.configuration.server.Service; +import nu.marginalia.client.Context; +import nu.marginalia.service.server.Initialization; +import nu.marginalia.service.server.MetricsServer; +import nu.marginalia.service.server.Service; +import nu.marginalia.service.server.StaticResources; import nu.marginalia.wmsa.resource_store.model.RenderedResource; import org.apache.http.HttpStatus; import org.slf4j.Logger; @@ -22,31 +20,27 @@ import spark.Spark; import spark.resource.ClassPathResource; import spark.staticfiles.MimeType; -import java.net.URLEncoder; import java.time.LocalDateTime; import java.time.ZoneOffset; import java.util.concurrent.TimeUnit; public class ResourceStoreService extends Service { - private final Gson gson = GsonFactory.get(); + private final Gson gson = new Gson(); private final Logger logger = LoggerFactory.getLogger(getClass()); private final long startTime = LocalDateTime.now().toEpochSecond(ZoneOffset.UTC); - private final AuthClient authClient; private final ResourceEntityStore resourceStore; private StaticResources staticResources; @Inject public ResourceStoreService(@Named("service-host") String ip, @Named("service-port") Integer port, - AuthClient authClient, ResourceEntityStore resourceStore, Initialization initialization, MetricsServer metricsServer, StaticResources staticResources ) { super(ip, port, initialization, metricsServer); - this.authClient = authClient; this.resourceStore = resourceStore; this.staticResources = staticResources; @@ -107,7 +101,6 @@ public class ResourceStoreService extends Service { 
if (data != null) { logger.info("getResource({}/{}, {})", domain, resource, data.etag()); - validatePermission(Context.fromRequest(request), request, response, domain, data); return serveDynamic(data, request, response); } @@ -118,19 +111,6 @@ public class ResourceStoreService extends Service { return ""; } - - private void validatePermission(Context ctx, Request req, Response rsp, String domain, RenderedResource resource) { - if ("memex".equals(domain)) { - if (resource.requireLogin && !memexIsLoggedIn(ctx)) { - rsp.redirect("https://www.marginalia.nu/auth/login?service=MEMEX&redirect="+ URLEncoder.encode(req.headers("X-Extern-Url"), Charsets.UTF_8)); - Spark.halt(); - } - } - } - - private boolean memexIsLoggedIn(Context ctx) { - return authClient.isLoggedIn(ctx).timeout(1, TimeUnit.SECONDS).blockingFirst(); - } private String serveDynamic(RenderedResource data, Request request, Response response) { handleEtag(data, request, response); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/resource_store/model/RenderedResource.java b/other/wmsa_old/src/main/java/nu/marginalia/wmsa/resource_store/model/RenderedResource.java similarity index 100% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/resource_store/model/RenderedResource.java rename to other/wmsa_old/src/main/java/nu/marginalia/wmsa/resource_store/model/RenderedResource.java diff --git a/marginalia_nu/src/main/resources/static/podcast/style.css b/other/wmsa_old/src/main/resources/static/podcast/style.css similarity index 100% rename from marginalia_nu/src/main/resources/static/podcast/style.css rename to other/wmsa_old/src/main/resources/static/podcast/style.css diff --git a/marginalia_nu/src/main/resources/templates/podcast/episode.hdb b/other/wmsa_old/src/main/resources/templates/podcast/episode.hdb similarity index 100% rename from marginalia_nu/src/main/resources/templates/podcast/episode.hdb rename to other/wmsa_old/src/main/resources/templates/podcast/episode.hdb diff --git 
a/marginalia_nu/src/main/resources/templates/podcast/listing.hdb b/other/wmsa_old/src/main/resources/templates/podcast/listing.hdb similarity index 100% rename from marginalia_nu/src/main/resources/templates/podcast/listing.hdb rename to other/wmsa_old/src/main/resources/templates/podcast/listing.hdb diff --git a/marginalia_nu/src/main/resources/templates/podcast/new.hdb b/other/wmsa_old/src/main/resources/templates/podcast/new.hdb similarity index 100% rename from marginalia_nu/src/main/resources/templates/podcast/new.hdb rename to other/wmsa_old/src/main/resources/templates/podcast/new.hdb diff --git a/marginalia_nu/src/main/resources/templates/podcast/podcast.hdb b/other/wmsa_old/src/main/resources/templates/podcast/podcast.hdb similarity index 100% rename from marginalia_nu/src/main/resources/templates/podcast/podcast.hdb rename to other/wmsa_old/src/main/resources/templates/podcast/podcast.hdb diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/resource_store/ResourceStoreServiceTest.java b/other/wmsa_old/src/test/java/nu/marginalia/resource_store/ResourceStoreServiceTest.java similarity index 87% rename from marginalia_nu/src/test/java/nu/marginalia/wmsa/resource_store/ResourceStoreServiceTest.java rename to other/wmsa_old/src/test/java/nu/marginalia/resource_store/ResourceStoreServiceTest.java index 4325cff6..b1922bc1 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/resource_store/ResourceStoreServiceTest.java +++ b/other/wmsa_old/src/test/java/nu/marginalia/resource_store/ResourceStoreServiceTest.java @@ -1,9 +1,13 @@ -package nu.marginalia.wmsa.resource_store; +package nu.marginalia.resource_store; import lombok.SneakyThrows; -import nu.marginalia.util.TestUtil; -import nu.marginalia.wmsa.configuration.server.Context; -import nu.marginalia.wmsa.configuration.server.Initialization; +import nu.marginalia.client.Context; +import nu.marginalia.service.server.Initialization; +import nu.marginalia.service.server.StaticResources; +import 
nu.marginalia.wmsa.renderer.WmsaServiceDescriptors; +import nu.marginalia.wmsa.resource_store.ResourceEntityStore; +import nu.marginalia.wmsa.resource_store.ResourceStoreClient; +import nu.marginalia.wmsa.resource_store.ResourceStoreService; import nu.marginalia.wmsa.resource_store.model.RenderedResource; import org.junit.jupiter.api.AfterAll; import org.junit.jupiter.api.AfterEach; @@ -17,6 +21,7 @@ import java.io.File; import java.nio.file.Files; import java.nio.file.Path; import java.time.LocalDateTime; +import java.util.Random; import static org.junit.jupiter.api.Assertions.*; @@ -24,7 +29,7 @@ class ResourceStoreServiceTest { static ResourceStoreService service; static ResourceStoreClient client; - static final int testPort = TestUtil.getPort(); + static final int testPort = new Random().nextInt(4000, 10000); static ResourceEntityStore resourceStore; static Path tempDir; private static final Logger logger = LoggerFactory.getLogger(ResourceStoreServiceTest.class); @@ -35,11 +40,11 @@ class ResourceStoreServiceTest { Spark.port(testPort); System.setProperty("service-name", "renderer"); - client = new ResourceStoreClient(); + client = new ResourceStoreClient(WmsaServiceDescriptors.descriptors); client.setServiceRoute("127.0.0.1", testPort); tempDir = Files.createTempDirectory("ResourceStoreServiceTest"); resourceStore = new ResourceEntityStore(tempDir); - service = new ResourceStoreService("127.0.0.1", testPort, null, + service = new ResourceStoreService("127.0.0.1", testPort, resourceStore, new Initialization(), null, new StaticResources()); Spark.awaitInitialization(); diff --git a/run/.gitignore b/run/.gitignore new file mode 100644 index 00000000..72e8ffc0 --- /dev/null +++ b/run/.gitignore @@ -0,0 +1 @@ +* diff --git a/run/reconvert.sh b/run/reconvert.sh new file mode 100755 index 00000000..7f17f447 --- /dev/null +++ b/run/reconvert.sh @@ -0,0 +1,58 @@ +#!/bin/bash + +set -e + +## Configuration + +SAMPLE_DIR="samples/crawl-l/" + +CONVERTER_PROCESS_OPTS=" 
+-Xmx16G +-XX:-CompactStrings +-XX:+UseParallelGC +-XX:GCTimeRatio=14 +-XX:ParallelGCThreads=15 +" + +LOADER_PROCESS_OPTS=" +-Dsmall-ram=TRUE +-Dlocal-index-path=vol/iw +" + +JAVA_OPTS=" +-Dcrawl.rootDirRewrite=/crawl:${SAMPLE_DIR} +-Ddb.overrideJdbc=jdbc:mariadb://localhost:3306/WMSA_prod?rewriteBatchedStatements=true +" + +## Configuration ends + +pushd $(dirname $0) + +## Wipe the old index data + +rm -f ${SAMPLE_DIR}/process/process.log +rm -f vol/iw/dictionary.dat +rm -f vol/iw/index.dat + +## Upgrade the tools + +rm -rf install/loader-process install/converter-process +tar xf ../crawl/loading-process/build/distributions/loader-process.tar -C install/ +tar xf ../crawl/converting-process/build/distributions/converter-process.tar -C install/ + +PATH+=":install/converter-process/bin" +PATH+=":install/loader-process/bin" + +export WMSA_HOME=. +export PATH + +export JAVA_OPTS +export CONVERTER_PROCESS_OPTS +export LOADER_PROCESS_OPTS + +converter-process ${SAMPLE_DIR}/plan.yaml +loader-process ${SAMPLE_DIR}/plan.yaml + +mv vol/iw/index.dat vol/iw/0/page-index.dat + +popd diff --git a/services-core/assistant-service/build.gradle b/services-core/assistant-service/build.gradle new file mode 100644 index 00000000..a3aafc2e --- /dev/null +++ b/services-core/assistant-service/build.gradle @@ -0,0 +1,67 @@ +plugins { + id 'java' + id "io.freefair.lombok" version "5.3.3.3" + id 'application' + id 'jvm-test-suite' + id 'com.palantir.docker' version '0.34.0' +} + +application { + mainClass = 'nu.marginalia.assistant.AssistantMain' + applicationName = 'assistant-service' +} + +apply from: "$rootProject.projectDir/docker-service.gradle" + +java { + toolchain { + languageVersion.set(JavaLanguageVersion.of(17)) + } +} + +dependencies { + implementation project(':third-party') + implementation project(':protocol') + implementation project(':api:assistant-api') + implementation project(':common:config') + implementation project(':common:service') + implementation 
project(':common:model') + implementation project(':common:service-discovery') + implementation project(':common:service-client') + + implementation project(':features:screenshots') + implementation project(':libraries:language-processing') + + + implementation libs.lombok + annotationProcessor libs.lombok + implementation libs.bundles.slf4j + + implementation libs.prometheus + implementation libs.notnull + implementation libs.guice + implementation libs.rxjava + implementation libs.spark + implementation libs.opencsv + implementation libs.trove + implementation libs.fastutil + implementation libs.bundles.gson + implementation libs.bundles.mariadb + + testImplementation libs.bundles.slf4j.test + testImplementation libs.bundles.junit + testImplementation libs.mockito + + +} + + +test { + useJUnitPlatform() +} + +task fastTests(type: Test) { + useJUnitPlatform { + excludeTags "slow" + } +} diff --git a/services-core/assistant-service/src/main/java/nu/marginalia/assistant/AssistantMain.java b/services-core/assistant-service/src/main/java/nu/marginalia/assistant/AssistantMain.java new file mode 100644 index 00000000..4ecb82c0 --- /dev/null +++ b/services-core/assistant-service/src/main/java/nu/marginalia/assistant/AssistantMain.java @@ -0,0 +1,34 @@ +package nu.marginalia.assistant; + +import com.google.inject.Guice; +import com.google.inject.Inject; +import com.google.inject.Injector; +import nu.marginalia.service.MainClass; +import nu.marginalia.service.SearchServiceDescriptors; +import nu.marginalia.service.id.ServiceId; +import nu.marginalia.service.module.ConfigurationModule; +import nu.marginalia.service.module.DatabaseModule; +import nu.marginalia.service.server.Initialization; + +public class AssistantMain extends MainClass { + private final AssistantService service; + + @Inject + public AssistantMain(AssistantService service) { + this.service = service; + } + + public static void main(String... 
args) { + init(ServiceId.Assistant, args); + + Injector injector = Guice.createInjector( + new AssistantModule(), + new ConfigurationModule(SearchServiceDescriptors.descriptors, ServiceId.Assistant), + new DatabaseModule() + ); + + injector.getInstance(AssistantMain.class); + injector.getInstance(Initialization.class).setReady(); + + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/EdgeAssistantModule.java b/services-core/assistant-service/src/main/java/nu/marginalia/assistant/AssistantModule.java similarity index 63% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/EdgeAssistantModule.java rename to services-core/assistant-service/src/main/java/nu/marginalia/assistant/AssistantModule.java index dcc8d90d..0670b103 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/EdgeAssistantModule.java +++ b/services-core/assistant-service/src/main/java/nu/marginalia/assistant/AssistantModule.java @@ -1,14 +1,14 @@ -package nu.marginalia.wmsa.edge.assistant; +package nu.marginalia.assistant; import com.google.inject.AbstractModule; -import nu.marginalia.util.language.conf.LanguageModels; -import nu.marginalia.wmsa.configuration.WmsaHome; +import nu.marginalia.LanguageModels; +import nu.marginalia.WmsaHome; import java.nio.file.Path; import static com.google.inject.name.Names.named; -public class EdgeAssistantModule extends AbstractModule { +public class AssistantModule extends AbstractModule { public void configure() { bind(Path.class).annotatedWith(named("suggestions-file")).toInstance(WmsaHome.getHomePath().resolve("suggestions.txt")); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/EdgeAssistantService.java b/services-core/assistant-service/src/main/java/nu/marginalia/assistant/AssistantService.java similarity index 70% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/EdgeAssistantService.java rename to 
services-core/assistant-service/src/main/java/nu/marginalia/assistant/AssistantService.java index 975f11cb..c0d908fd 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/EdgeAssistantService.java +++ b/services-core/assistant-service/src/main/java/nu/marginalia/assistant/AssistantService.java @@ -1,26 +1,25 @@ -package nu.marginalia.wmsa.edge.assistant; +package nu.marginalia.assistant; import com.google.gson.Gson; import com.google.inject.Inject; import com.google.inject.name.Named; import lombok.SneakyThrows; -import nu.marginalia.wmsa.client.GsonFactory; -import nu.marginalia.wmsa.configuration.server.Initialization; -import nu.marginalia.wmsa.configuration.server.MetricsServer; -import nu.marginalia.wmsa.configuration.server.Service; -import nu.marginalia.wmsa.edge.assistant.dict.DictionaryService; -import nu.marginalia.wmsa.edge.assistant.eval.MathParser; -import nu.marginalia.wmsa.edge.assistant.eval.Units; -import nu.marginalia.wmsa.edge.assistant.screenshot.ScreenshotService; -import nu.marginalia.wmsa.edge.assistant.suggest.Suggestions; +import nu.marginalia.assistant.eval.Units; +import nu.marginalia.assistant.suggest.Suggestions; +import nu.marginalia.assistant.eval.MathParser; +import nu.marginalia.model.gson.GsonFactory; +import nu.marginalia.screenshot.ScreenshotService; +import nu.marginalia.assistant.dict.DictionaryService; +import nu.marginalia.service.server.Initialization; +import nu.marginalia.service.server.MetricsServer; +import nu.marginalia.service.server.Service; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import spark.Request; import spark.Response; import spark.Spark; -public class EdgeAssistantService extends Service { - +public class AssistantService extends Service { private final Logger logger = LoggerFactory.getLogger(getClass()); private final Gson gson = GsonFactory.get(); private final Units units; @@ -29,15 +28,15 @@ public class EdgeAssistantService extends Service { @SneakyThrows @Inject - 
public EdgeAssistantService(@Named("service-host") String ip, - @Named("service-port") Integer port, - Initialization initialization, - MetricsServer metricsServer, - DictionaryService dictionaryService, - MathParser mathParser, - Units units, - ScreenshotService screenshotService, - Suggestions suggestions + public AssistantService(@Named("service-host") String ip, + @Named("service-port") Integer port, + Initialization initialization, + MetricsServer metricsServer, + DictionaryService dictionaryService, + MathParser mathParser, + Units units, + ScreenshotService screenshotService, + Suggestions suggestions ) { super(ip, port, initialization, metricsServer); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/dict/DictionaryService.java b/services-core/assistant-service/src/main/java/nu/marginalia/assistant/dict/DictionaryService.java similarity index 89% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/dict/DictionaryService.java rename to services-core/assistant-service/src/main/java/nu/marginalia/assistant/dict/DictionaryService.java index 572c96fa..40686f74 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/dict/DictionaryService.java +++ b/services-core/assistant-service/src/main/java/nu/marginalia/assistant/dict/DictionaryService.java @@ -1,13 +1,14 @@ -package nu.marginalia.wmsa.edge.assistant.dict; +package nu.marginalia.assistant.dict; import com.google.inject.Inject; import com.google.inject.Singleton; import com.zaxxer.hikari.HikariDataSource; +import nu.marginalia.assistant.client.model.DictionaryEntry; +import nu.marginalia.assistant.client.model.DictionaryResponse; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.util.*; -import java.util.stream.Collectors; @Singleton public class DictionaryService { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/dict/SpellChecker.java 
b/services-core/assistant-service/src/main/java/nu/marginalia/assistant/dict/SpellChecker.java similarity index 87% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/dict/SpellChecker.java rename to services-core/assistant-service/src/main/java/nu/marginalia/assistant/dict/SpellChecker.java index 700a15ec..d1710122 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/dict/SpellChecker.java +++ b/services-core/assistant-service/src/main/java/nu/marginalia/assistant/dict/SpellChecker.java @@ -1,6 +1,7 @@ -package nu.marginalia.wmsa.edge.assistant.dict; +package nu.marginalia.assistant.dict; import com.google.inject.Singleton; +import symspell.SymSpell; import java.util.Comparator; import java.util.List; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/eval/MathParser.java b/services-core/assistant-service/src/main/java/nu/marginalia/assistant/eval/MathParser.java similarity index 99% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/eval/MathParser.java rename to services-core/assistant-service/src/main/java/nu/marginalia/assistant/eval/MathParser.java index 37388a8f..247963c8 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/eval/MathParser.java +++ b/services-core/assistant-service/src/main/java/nu/marginalia/assistant/eval/MathParser.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.assistant.eval; +package nu.marginalia.assistant.eval; import lombok.AllArgsConstructor; import lombok.SneakyThrows; @@ -46,6 +46,8 @@ public class MathParser { List tokens = tokenize(inputExpression); + // recursive descent + tokens = parenthesize(tokens); tokens = negate(tokens); tokens = functions(tokens); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/eval/Unit.java b/services-core/assistant-service/src/main/java/nu/marginalia/assistant/eval/Unit.java similarity index 84% rename from 
marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/eval/Unit.java rename to services-core/assistant-service/src/main/java/nu/marginalia/assistant/eval/Unit.java index e8da905e..cdc352c8 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/eval/Unit.java +++ b/services-core/assistant-service/src/main/java/nu/marginalia/assistant/eval/Unit.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.assistant.eval; +package nu.marginalia.assistant.eval; public class Unit { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/eval/Units.java b/services-core/assistant-service/src/main/java/nu/marginalia/assistant/eval/Units.java similarity index 98% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/eval/Units.java rename to services-core/assistant-service/src/main/java/nu/marginalia/assistant/eval/Units.java index 6a0d4be8..b73eb7ac 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/eval/Units.java +++ b/services-core/assistant-service/src/main/java/nu/marginalia/assistant/eval/Units.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.assistant.eval; +package nu.marginalia.assistant.eval; import com.opencsv.CSVReader; import lombok.SneakyThrows; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/suggest/Suggestions.java b/services-core/assistant-service/src/main/java/nu/marginalia/assistant/suggest/Suggestions.java similarity index 93% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/suggest/Suggestions.java rename to services-core/assistant-service/src/main/java/nu/marginalia/assistant/suggest/Suggestions.java index ff793015..cb59ffd4 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/suggest/Suggestions.java +++ b/services-core/assistant-service/src/main/java/nu/marginalia/assistant/suggest/Suggestions.java @@ -1,10 +1,10 @@ -package nu.marginalia.wmsa.edge.assistant.suggest; +package 
nu.marginalia.assistant.suggest; import com.google.inject.Inject; import com.google.inject.name.Named; -import nu.marginalia.wmsa.edge.assistant.dict.SpellChecker; -import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict; -import nu.marginalia.wmsa.edge.converting.processor.logic.HtmlFeature; +import nu.marginalia.language.statistics.TermFrequencyDict; +import nu.marginalia.model.crawl.HtmlFeature; +import nu.marginalia.assistant.dict.SpellChecker; import org.apache.commons.collections4.trie.PatriciaTrie; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -42,6 +42,10 @@ public class Suggestions { } private static PatriciaTrie loadSuggestions(Path file) { + if (!Files.exists(file)) { + logger.error("Suggestions file {} absent, loading empty suggestions db", file); + return new PatriciaTrie<>(); + } try (var lines = Files.lines(file)) { var ret = new PatriciaTrie(); diff --git a/marginalia_nu/src/main/resources/units.csv b/services-core/assistant-service/src/main/resources/units.csv similarity index 100% rename from marginalia_nu/src/main/resources/units.csv rename to services-core/assistant-service/src/main/resources/units.csv diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/assistant/dict/WikiCleanerTest.java b/services-core/assistant-service/src/test/java/nu/marginalia/assistant/dict/WikiCleanerTest.java similarity index 98% rename from marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/assistant/dict/WikiCleanerTest.java rename to services-core/assistant-service/src/test/java/nu/marginalia/assistant/dict/WikiCleanerTest.java index 2535f206..3c2bda7a 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/assistant/dict/WikiCleanerTest.java +++ b/services-core/assistant-service/src/test/java/nu/marginalia/assistant/dict/WikiCleanerTest.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.assistant.dict; +package nu.marginalia.assistant.dict; import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; diff 
--git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/assistant/eval/MathParserTest.java b/services-core/assistant-service/src/test/java/nu/marginalia/assistant/eval/MathParserTest.java similarity index 96% rename from marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/assistant/eval/MathParserTest.java rename to services-core/assistant-service/src/test/java/nu/marginalia/assistant/eval/MathParserTest.java index 45827fad..4fdfdc71 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/assistant/eval/MathParserTest.java +++ b/services-core/assistant-service/src/test/java/nu/marginalia/assistant/eval/MathParserTest.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.assistant.eval; +package nu.marginalia.assistant.eval; import org.junit.jupiter.api.Test; import org.slf4j.Logger; diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/assistant/eval/UnitsTest.java b/services-core/assistant-service/src/test/java/nu/marginalia/assistant/eval/UnitsTest.java similarity index 96% rename from marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/assistant/eval/UnitsTest.java rename to services-core/assistant-service/src/test/java/nu/marginalia/assistant/eval/UnitsTest.java index 93d1efd2..de2b709b 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/assistant/eval/UnitsTest.java +++ b/services-core/assistant-service/src/test/java/nu/marginalia/assistant/eval/UnitsTest.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.assistant.eval; +package nu.marginalia.assistant.eval; import org.junit.jupiter.api.Test; diff --git a/services-core/index-service/build.gradle b/services-core/index-service/build.gradle new file mode 100644 index 00000000..a1e04b73 --- /dev/null +++ b/services-core/index-service/build.gradle @@ -0,0 +1,76 @@ +plugins { + id 'java' + id "io.freefair.lombok" version "5.3.3.3" + id 'com.palantir.docker' version '0.34.0' + id 'application' + id 'jvm-test-suite' +} + +application { + mainClass = 
'nu.marginalia.index.IndexMain' + applicationName = 'index-service' +} + +apply from: "$rootProject.projectDir/docker-service.gradle" + +java { + toolchain { + languageVersion.set(JavaLanguageVersion.of(17)) + } +} +dependencies { + implementation project(':third-party') + implementation project(':protocol') + implementation project(':common:config') + implementation project(':common:model') + implementation project(':common:service') + implementation project(':api:index-api') + implementation project(':common:service-discovery') + + implementation project(':libraries:array') + implementation project(':libraries:btree') + implementation project(':libraries:misc') + + implementation project(':index:index-journal') + implementation project(':index:index-query') + implementation project(':index:index-forward') + implementation project(':index:index-reverse') + implementation project(':index:lexicon') + + implementation project(':features:domain-ranking') + + + implementation libs.lombok + annotationProcessor libs.lombok + implementation libs.bundles.slf4j + + implementation libs.prometheus + implementation libs.notnull + implementation libs.guice + implementation libs.protobuf + implementation libs.bundles.httpcomponents + implementation libs.roaringbitmap + implementation libs.snakeyaml + implementation libs.rxjava + implementation libs.spark + implementation libs.opencsv + implementation libs.trove + implementation libs.fastutil + implementation libs.bundles.gson + implementation libs.bundles.mariadb + + testImplementation libs.bundles.slf4j.test + testImplementation libs.bundles.junit + testImplementation libs.mockito + +} + +test { + useJUnitPlatform() +} + +task fastTests(type: Test) { + useJUnitPlatform { + excludeTags "slow" + } +} diff --git a/services-core/index-service/src/main/java/nu/marginalia/index/IndexMain.java b/services-core/index-service/src/main/java/nu/marginalia/index/IndexMain.java new file mode 100644 index 00000000..35e3ae5e --- /dev/null +++ 
b/services-core/index-service/src/main/java/nu/marginalia/index/IndexMain.java @@ -0,0 +1,35 @@ +package nu.marginalia.index; + +import com.google.inject.Guice; +import com.google.inject.Inject; +import com.google.inject.Injector; +import nu.marginalia.service.MainClass; +import nu.marginalia.service.SearchServiceDescriptors; +import nu.marginalia.service.id.ServiceId; +import nu.marginalia.service.module.ConfigurationModule; +import nu.marginalia.service.module.DatabaseModule; +import nu.marginalia.service.server.Initialization; + +public class IndexMain extends MainClass { + private final IndexService service; + + @Inject + public IndexMain(IndexService service) { + this.service = service; + } + + public static void main(String... args) { + init(ServiceId.Index, args); + + Injector injector = Guice.createInjector( + new IndexTablesModule(), + new IndexModule(), + new DatabaseModule(), + new ConfigurationModule(SearchServiceDescriptors.descriptors, ServiceId.Index) + ); + + injector.getInstance(IndexMain.class); + injector.getInstance(Initialization.class).setReady(); + + } +} diff --git a/services-core/index-service/src/main/java/nu/marginalia/index/IndexModule.java b/services-core/index-service/src/main/java/nu/marginalia/index/IndexModule.java new file mode 100644 index 00000000..1e674d01 --- /dev/null +++ b/services-core/index-service/src/main/java/nu/marginalia/index/IndexModule.java @@ -0,0 +1,38 @@ +package nu.marginalia.index; + +import com.google.inject.AbstractModule; +import com.google.inject.Provides; +import lombok.SneakyThrows; +import nu.marginalia.index.config.RankingSettings; +import nu.marginalia.WmsaHome; +import nu.marginalia.lexicon.KeywordLexicon; +import nu.marginalia.lexicon.KeywordLexiconReadOnlyView; +import nu.marginalia.lexicon.journal.KeywordLexiconJournal; + +import java.nio.file.Path; + +public class IndexModule extends AbstractModule { + + + + public void configure() { + } + + @Provides + @SneakyThrows + private 
KeywordLexiconReadOnlyView createLexicon() { + return new KeywordLexiconReadOnlyView( + new KeywordLexicon( + new KeywordLexiconJournal(WmsaHome.getDisk("index-write").resolve("dictionary.dat").toFile() + ) + ) + ); + } + + @Provides + public RankingSettings rankingSettings() { + Path dir = WmsaHome.getHomePath().resolve("conf/ranking-settings.yaml"); + return RankingSettings.from(dir); + } + +} diff --git a/services-core/index-service/src/main/java/nu/marginalia/index/IndexService.java b/services-core/index-service/src/main/java/nu/marginalia/index/IndexService.java new file mode 100644 index 00000000..c6552c18 --- /dev/null +++ b/services-core/index-service/src/main/java/nu/marginalia/index/IndexService.java @@ -0,0 +1,76 @@ +package nu.marginalia.index; + +import com.google.gson.Gson; +import com.google.inject.Inject; +import com.google.inject.name.Named; +import io.reactivex.rxjava3.schedulers.Schedulers; +import nu.marginalia.index.index.SearchIndex; +import nu.marginalia.index.svc.IndexOpsService; +import nu.marginalia.index.svc.IndexQueryService; +import nu.marginalia.model.gson.GsonFactory; +import nu.marginalia.service.server.Initialization; +import nu.marginalia.service.server.MetricsServer; +import nu.marginalia.service.server.Service; +import org.jetbrains.annotations.NotNull; +import spark.Request; +import spark.Response; +import spark.Spark; + +import java.util.concurrent.TimeUnit; + +import static spark.Spark.get; + +public class IndexService extends Service { + + @NotNull + private final Initialization init; + private final IndexOpsService opsService; + private final SearchIndex searchIndex; + + + @Inject + public IndexService(@Named("service-host") String ip, + @Named("service-port") Integer port, + Initialization init, + MetricsServer metricsServer, + IndexOpsService opsService, + IndexQueryService indexQueryService, + SearchIndex searchIndex + ) + { + super(ip, port, init, metricsServer); + this.opsService = opsService; + this.searchIndex = 
searchIndex; + + final Gson gson = GsonFactory.get(); + + this.init = init; + + Spark.post("/search/", indexQueryService::search, gson::toJson); + + Spark.post("/ops/repartition", opsService::repartitionEndpoint); + Spark.post("/ops/reindex", opsService::reindexEndpoint); + + get("/is-blocked", this::isBlocked, gson::toJson); + + Schedulers.newThread().scheduleDirect(this::initialize, 1, TimeUnit.MICROSECONDS); + } + + private Object isBlocked(Request request, Response response) { + return !initialized || opsService.isBusy(); + } + + volatile boolean initialized = false; + public void initialize() { + if (!initialized) { + init.waitReady(); + searchIndex.init(); + initialized = true; + } + + } + + +} + + diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/IndexServicesFactory.java b/services-core/index-service/src/main/java/nu/marginalia/index/IndexServicesFactory.java similarity index 68% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/IndexServicesFactory.java rename to services-core/index-service/src/main/java/nu/marginalia/index/IndexServicesFactory.java index dcc00e34..9b33cb8a 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/IndexServicesFactory.java +++ b/services-core/index-service/src/main/java/nu/marginalia/index/IndexServicesFactory.java @@ -1,26 +1,17 @@ -package nu.marginalia.wmsa.edge.index; +package nu.marginalia.index; import com.google.inject.Inject; import com.google.inject.Singleton; import com.google.inject.name.Named; -import lombok.SneakyThrows; -import nu.marginalia.util.array.LongArray; -import nu.marginalia.util.dict.DictionaryMap; -import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexicon; -import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexiconReadOnlyView; -import nu.marginalia.wmsa.edge.index.lexicon.journal.KeywordLexiconJournal; -import nu.marginalia.wmsa.edge.index.postings.DomainRankings; -import nu.marginalia.wmsa.edge.index.postings.SearchIndex; -import 
nu.marginalia.wmsa.edge.index.postings.SearchIndexReader; -import nu.marginalia.wmsa.edge.index.postings.forward.ForwardIndexConverter; -import nu.marginalia.wmsa.edge.index.postings.forward.ForwardIndexReader; -import nu.marginalia.wmsa.edge.index.postings.journal.reader.SearchIndexJournalReaderSingleFile; -import nu.marginalia.wmsa.edge.index.postings.journal.writer.SearchIndexJournalWriterImpl; -import nu.marginalia.wmsa.edge.index.postings.reverse.ReverseIndexConverter; -import nu.marginalia.wmsa.edge.index.postings.reverse.ReverseIndexPrioReader; -import nu.marginalia.wmsa.edge.index.postings.reverse.ReverseIndexPriorityParameters; -import nu.marginalia.wmsa.edge.index.postings.reverse.ReverseIndexReader; -import nu.marginalia.wmsa.edge.index.svc.EdgeIndexSearchSetsService; +import nu.marginalia.index.forward.ForwardIndexConverter; +import nu.marginalia.index.forward.ForwardIndexReader; +import nu.marginalia.index.journal.reader.IndexJournalReaderSingleCompressedFile; +import nu.marginalia.index.reverse.ReverseIndexConverter; +import nu.marginalia.index.reverse.ReverseIndexPrioReader; +import nu.marginalia.index.reverse.ReverseIndexPriorityParameters; +import nu.marginalia.index.reverse.ReverseIndexReader; +import nu.marginalia.ranking.DomainRankings; +import nu.marginalia.index.index.SearchIndexReader; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -38,7 +29,6 @@ public class IndexServicesFactory { private final Logger logger = LoggerFactory.getLogger(getClass()); private final PartitionedDataFile writerIndexFile; - private final RootDataFile keywordLexiconFile; private final PartitionedDataFile fwdIndexDocId; private final PartitionedDataFile fwdIndexDocData; private final PartitionedDataFile revIndexDoc; @@ -47,8 +37,6 @@ public class IndexServicesFactory { private final PartitionedDataFile revPrioIndexDoc; private final PartitionedDataFile revPrioIndexWords; - private volatile static KeywordLexicon keywordLexicon; - private final Path 
searchSetsBase; int LIVE_PART = 0; @@ -64,7 +52,6 @@ public class IndexServicesFactory { this.tmpFileDir = tmpFileDir; this.writerIndexFile = new PartitionedDataFile(partitionRootSlow, "page-index.dat"); - this.keywordLexiconFile = new RootDataFile(partitionRootSlow, "dictionary.dat"); fwdIndexDocId = new PartitionedDataFile(partitionRootFast, "fwd-doc-id.dat"); fwdIndexDocData = new PartitionedDataFile(partitionRootFast, "fwd-doc-data.dat"); @@ -85,39 +72,19 @@ public class IndexServicesFactory { return searchSetsBase; } - public SearchIndexJournalWriterImpl getIndexWriter(int idx) { - return new SearchIndexJournalWriterImpl(getKeywordLexicon(), writerIndexFile.get(idx)); - } - - @SneakyThrows - public KeywordLexicon getKeywordLexicon() { - if (keywordLexicon == null) { - final var journal = new KeywordLexiconJournal(keywordLexiconFile.get()); - keywordLexicon = new KeywordLexicon(journal, DictionaryMap.create()); - } - return keywordLexicon; - } - - @SneakyThrows - public KeywordLexiconReadOnlyView getDictionaryReader() { - return new KeywordLexiconReadOnlyView(getKeywordLexicon()); - - } - public void convertIndex(DomainRankings domainRankings) throws IOException { convertForwardIndex(domainRankings); convertFullReverseIndex(domainRankings); convertPriorityReverseIndex(domainRankings); - - } private void convertFullReverseIndex(DomainRankings domainRankings) throws IOException { - logger.info("Converting full reverse index"); + var source = writerIndexFile.get(0).toPath(); - var longArray = LongArray.mmapRead(writerIndexFile.get(0).toPath()); - var journalReader = new SearchIndexJournalReaderSingleFile(longArray); + logger.info("Converting full reverse index {}", source); + + var journalReader = new IndexJournalReaderSingleCompressedFile(source); var converter = new ReverseIndexConverter(tmpFileDir, journalReader, domainRankings, @@ -131,11 +98,11 @@ public class IndexServicesFactory { private void convertPriorityReverseIndex(DomainRankings domainRankings) 
throws IOException { - logger.info("Converting priority reverse index"); + var source = writerIndexFile.get(0).toPath(); - var longArray = LongArray.mmapRead(writerIndexFile.get(0).toPath()); + logger.info("Converting priority reverse index {}", source); - var journalReader = new SearchIndexJournalReaderSingleFile(longArray, null, ReverseIndexPriorityParameters::filterPriorityRecord); + var journalReader = new IndexJournalReaderSingleCompressedFile(source, null, ReverseIndexPriorityParameters::filterPriorityRecord); var converter = new ReverseIndexConverter(tmpFileDir, journalReader, @@ -149,10 +116,12 @@ public class IndexServicesFactory { } private void convertForwardIndex(DomainRankings domainRankings) throws IOException { - logger.info("Converting forward index data"); - new ForwardIndexConverter( - writerIndexFile.get(0), + var source = writerIndexFile.get(0); + + logger.info("Converting forward index data {}", source); + + new ForwardIndexConverter(source, fwdIndexDocId.get(NEXT_PART).toPath(), fwdIndexDocData.get(NEXT_PART).toPath(), domainRankings) @@ -213,10 +182,6 @@ public class IndexServicesFactory { } } - public SearchIndex createIndexBucket(EdgeIndexSearchSetsService searchSetsService) { - return new SearchIndex(this, new EdgeIndexControl(this, searchSetsService)); - } - public SearchIndexReader getSearchIndexReader() throws IOException { return new SearchIndexReader( getForwardIndexReader(), diff --git a/services-core/index-service/src/main/java/nu/marginalia/index/IndexTablesModule.java b/services-core/index-service/src/main/java/nu/marginalia/index/IndexTablesModule.java new file mode 100644 index 00000000..c26ca5e3 --- /dev/null +++ b/services-core/index-service/src/main/java/nu/marginalia/index/IndexTablesModule.java @@ -0,0 +1,20 @@ +package nu.marginalia.index; + +import com.google.inject.AbstractModule; +import com.google.inject.name.Names; +import nu.marginalia.WmsaHome; + +import java.nio.file.Path; + +public class IndexTablesModule extends 
AbstractModule { + + public void configure() { + bind(Path.class).annotatedWith(Names.named("partition-root-slow")).toInstance(WmsaHome.getDisk("index-write")); + bind(Path.class).annotatedWith(Names.named("partition-root-fast")).toInstance(WmsaHome.getDisk("index-read")); + + bind(Path.class).annotatedWith(Names.named("partition-root-slow-tmp")).toInstance(WmsaHome.getDisk("tmp-slow")); + bind(Path.class).annotatedWith(Names.named("tmp-file-dir")).toInstance(WmsaHome.getDisk("tmp-fast")); + + } + +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/config/RankingSettings.java b/services-core/index-service/src/main/java/nu/marginalia/index/config/RankingSettings.java similarity index 90% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/config/RankingSettings.java rename to services-core/index-service/src/main/java/nu/marginalia/index/config/RankingSettings.java index 1c71e544..a755a480 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/config/RankingSettings.java +++ b/services-core/index-service/src/main/java/nu/marginalia/index/config/RankingSettings.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.index.config; +package nu.marginalia.index.config; import lombok.ToString; import org.yaml.snakeyaml.Yaml; @@ -6,7 +6,6 @@ import org.yaml.snakeyaml.Yaml; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; -import java.util.List; @ToString public class RankingSettings { diff --git a/services-core/index-service/src/main/java/nu/marginalia/index/config/RankingSettingsEntry.java b/services-core/index-service/src/main/java/nu/marginalia/index/config/RankingSettingsEntry.java new file mode 100644 index 00000000..7723e3ff --- /dev/null +++ b/services-core/index-service/src/main/java/nu/marginalia/index/config/RankingSettingsEntry.java @@ -0,0 +1,11 @@ +package nu.marginalia.index.config; + +import java.util.List; + +public class RankingSettingsEntry { + /** Bias the ranking 
toward these domains */ + public List domains; + + /** Number of domains to include in ranking */ + public int max; +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/SearchIndex.java b/services-core/index-service/src/main/java/nu/marginalia/index/index/SearchIndex.java similarity index 75% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/SearchIndex.java rename to services-core/index-service/src/main/java/nu/marginalia/index/index/SearchIndex.java index 72c0e13f..b5b4677c 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/SearchIndex.java +++ b/services-core/index-service/src/main/java/nu/marginalia/index/index/SearchIndex.java @@ -1,11 +1,14 @@ -package nu.marginalia.wmsa.edge.index.postings; +package nu.marginalia.index.index; -import nu.marginalia.wmsa.edge.index.EdgeIndexControl; -import nu.marginalia.wmsa.edge.index.IndexServicesFactory; -import nu.marginalia.wmsa.edge.index.query.IndexQuery; -import nu.marginalia.wmsa.edge.index.query.IndexQueryParams; -import nu.marginalia.wmsa.edge.index.query.IndexResultDomainDeduplicator; -import nu.marginalia.wmsa.edge.index.query.filter.QueryFilterStepFromPredicate; +import com.google.inject.Inject; +import com.google.inject.Singleton; +import nu.marginalia.index.IndexServicesFactory; +import nu.marginalia.index.query.IndexQuery; +import nu.marginalia.index.query.IndexQueryBuilder; +import nu.marginalia.index.results.IndexResultDomainDeduplicator; +import nu.marginalia.index.query.IndexQueryParams; +import nu.marginalia.index.query.filter.QueryFilterStepFromPredicate; +import nu.marginalia.index.svc.IndexSearchSetsService; import org.jetbrains.annotations.NotNull; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -17,6 +20,7 @@ import java.util.concurrent.locks.ReadWriteLock; import java.util.concurrent.locks.ReentrantReadWriteLock; import java.util.function.LongPredicate; +@Singleton public class SearchIndex { private 
final Logger logger = LoggerFactory.getLogger(getClass()); @@ -27,11 +31,12 @@ public class SearchIndex { @NotNull private final IndexServicesFactory servicesFactory; - private final EdgeIndexControl indexControl; + private IndexSearchSetsService searchSetsService; - public SearchIndex(@NotNull IndexServicesFactory servicesFactory, EdgeIndexControl indexControl) { + @Inject + public SearchIndex(@NotNull IndexServicesFactory servicesFactory, IndexSearchSetsService searchSetsService) { this.servicesFactory = servicesFactory; - this.indexControl = indexControl; + this.searchSetsService = searchSetsService; } public void init() { @@ -39,7 +44,7 @@ public class SearchIndex { try { lock.lock(); - logger.info("Initializing bucket"); + logger.info("Initializing index"); if (indexReader == null) { indexReader = servicesFactory.getSearchIndexReader(); @@ -55,13 +60,14 @@ public class SearchIndex { public boolean switchIndex() throws IOException { - indexControl.regenerateIndex(); + servicesFactory.convertIndex(searchSetsService.getDomainRankings()); + System.gc(); Lock lock = indexReplacementLock.writeLock(); try { lock.lock(); - indexControl.switchIndexFiles(); + servicesFactory.switchFilesJob().call(); indexReader = servicesFactory.getSearchIndexReader(); } @@ -80,7 +86,7 @@ public class SearchIndex { return indexReader != null; } - public IndexQuery getQuery(EdgeIndexQuerySearchTerms terms, IndexQueryParams params, LongPredicate includePred) { + public IndexQuery createQuery(SearchIndexSearchTerms terms, IndexQueryParams params, LongPredicate includePred) { if (null == indexReader) { logger.warn("Index reader not ready"); @@ -89,7 +95,7 @@ public class SearchIndex { final int[] orderedIncludes = terms.sortedDistinctIncludes(this::compareKeywords); - SearchIndexReader.IndexQueryBuilder query = + IndexQueryBuilder query = switch(params.queryStrategy()) { case SENTENCE -> indexReader.findWordAsSentence(orderedIncludes); case TOPIC, REQUIRE_FIELD_SITE, REQUIRE_FIELD_TITLE, 
REQUIRE_FIELD_SUBJECT @@ -101,7 +107,7 @@ public class SearchIndex { return new IndexQuery(Collections.emptyList()); } - query.addInclusionFilter(new QueryFilterStepFromPredicate(includePred)); + query = query.addInclusionFilter(new QueryFilterStepFromPredicate(includePred)); for (int i = 0; i < orderedIncludes.length; i++) { query = query.also(orderedIncludes[i]); @@ -113,9 +119,9 @@ public class SearchIndex { // Run these last, as they'll worst-case cause as many page faults as there are // items in the buffer - query.addInclusionFilter(indexReader.filterForParams(params)); - - return query.build(); + return query + .addInclusionFilter(indexReader.filterForParams(params)) + .build(); } private int compareKeywords(int a, int b) { @@ -128,12 +134,6 @@ public class SearchIndex { public IndexQuery getDomainQuery(int wordId, IndexResultDomainDeduplicator localFilter) { throw new UnsupportedOperationException(""); // TBI - /* - var query = indexReader.findDomain(wordId); - - query.addInclusionFilter(new QueryFilterStepFromPredicate(localFilter::filterRawValue)); - - return query;*/ } /** Replaces the values of ids with their associated metadata, or 0L if absent */ diff --git a/services-core/index-service/src/main/java/nu/marginalia/index/index/SearchIndexQueryBuilder.java b/services-core/index-service/src/main/java/nu/marginalia/index/index/SearchIndexQueryBuilder.java new file mode 100644 index 00000000..89160aae --- /dev/null +++ b/services-core/index-service/src/main/java/nu/marginalia/index/index/SearchIndexQueryBuilder.java @@ -0,0 +1,42 @@ +package nu.marginalia.index.index; + +import nu.marginalia.index.query.IndexQuery; +import nu.marginalia.index.query.IndexQueryBuilder; +import nu.marginalia.index.query.filter.QueryFilterStepIf; +import nu.marginalia.index.reverse.ReverseIndexReader; + +public class SearchIndexQueryBuilder implements IndexQueryBuilder { + private final IndexQuery query; + private final ReverseIndexReader reverseIndexReader; + + 
SearchIndexQueryBuilder(ReverseIndexReader reverseIndexReader, IndexQuery query) { + this.query = query; + this.reverseIndexReader = reverseIndexReader; + } + + public IndexQueryBuilder also(int termId) { + + query.addInclusionFilter(reverseIndexReader.also(termId)); + + return this; + } + + public IndexQueryBuilder not(int termId) { + + query.addInclusionFilter(reverseIndexReader.not(termId)); + + return this; + } + + public IndexQueryBuilder addInclusionFilter(QueryFilterStepIf filterStep) { + + query.addInclusionFilter(filterStep); + + return this; + } + + public IndexQuery build() { + return query; + } + +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/SearchIndexReader.java b/services-core/index-service/src/main/java/nu/marginalia/index/index/SearchIndexReader.java similarity index 60% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/SearchIndexReader.java rename to services-core/index-service/src/main/java/nu/marginalia/index/index/SearchIndexReader.java index f19063ca..7836e92c 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/SearchIndexReader.java +++ b/services-core/index-service/src/main/java/nu/marginalia/index/index/SearchIndexReader.java @@ -1,15 +1,16 @@ -package nu.marginalia.wmsa.edge.index.postings; +package nu.marginalia.index.index; import lombok.SneakyThrows; -import nu.marginalia.wmsa.edge.index.postings.forward.ForwardIndexReader; -import nu.marginalia.wmsa.edge.index.postings.forward.ParamMatchingQueryFilter; -import nu.marginalia.wmsa.edge.index.postings.reverse.ReverseIndexPrioReader; -import nu.marginalia.wmsa.edge.index.postings.reverse.ReverseIndexReader; -import nu.marginalia.wmsa.edge.index.postings.reverse.query.ReverseIndexEntrySourceBehavior; -import nu.marginalia.wmsa.edge.index.query.EntrySource; -import nu.marginalia.wmsa.edge.index.query.IndexQuery; -import nu.marginalia.wmsa.edge.index.query.IndexQueryParams; -import 
nu.marginalia.wmsa.edge.index.query.filter.QueryFilterStepIf; +import nu.marginalia.index.forward.ForwardIndexReader; +import nu.marginalia.index.forward.ParamMatchingQueryFilter; +import nu.marginalia.index.query.EntrySource; +import nu.marginalia.index.query.IndexQuery; +import nu.marginalia.index.query.IndexQueryBuilder; +import nu.marginalia.index.query.IndexQueryParams; +import nu.marginalia.index.query.filter.QueryFilterStepIf; +import nu.marginalia.index.reverse.ReverseIndexPrioReader; +import nu.marginalia.index.reverse.ReverseIndexReader; +import nu.marginalia.index.reverse.query.ReverseIndexEntrySourceBehavior; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -36,7 +37,7 @@ public class SearchIndexReader { entrySources.add(reverseIndexReader.documents(wordIdsByFrequency[0], ReverseIndexEntrySourceBehavior.DO_PREFER)); - return new IndexQueryBuilder(new IndexQuery(entrySources)); + return new SearchIndexQueryBuilder(reverseIndexReader, new IndexQuery(entrySources)); } public IndexQueryBuilder findWordAsTopic(int[] wordIdsByFrequency) { @@ -46,7 +47,7 @@ public class SearchIndexReader { entrySources.add(reverseIndexPrioReader.priorityDocuments(wordId)); } - return new IndexQueryBuilder(new IndexQuery(entrySources)); + return new SearchIndexQueryBuilder(reverseIndexReader, new IndexQuery(entrySources)); } public IndexQueryBuilder findWordTopicDynamicMode(int[] wordIdsByFrequency) { @@ -62,13 +63,13 @@ public class SearchIndexReader { entrySources.add(reverseIndexReader.documents(wordIdsByFrequency[0], ReverseIndexEntrySourceBehavior.DO_NOT_PREFER)); - return new IndexQueryBuilder(new IndexQuery(entrySources)); + return new SearchIndexQueryBuilder(reverseIndexReader, new IndexQuery(entrySources)); } QueryFilterStepIf filterForParams(IndexQueryParams params) { return new ParamMatchingQueryFilter(params, forwardIndexReader); } - @SneakyThrows + public long numHits(int word) { return reverseIndexReader.numDocuments(word); } @@ -85,37 +86,4 @@ public 
class SearchIndexReader { return forwardIndexReader.getDomainId(docId); } - public class IndexQueryBuilder { - private final IndexQuery query; - - IndexQueryBuilder(IndexQuery query) { - this.query = query; - } - - public IndexQueryBuilder also(int termId) { - - query.addInclusionFilter(reverseIndexReader.also(termId)); - - return this; - } - - public IndexQueryBuilder not(int termId) { - - query.addInclusionFilter(reverseIndexReader.not(termId)); - - return this; - } - - public IndexQueryBuilder addInclusionFilter(QueryFilterStepIf filterStep) { - - query.addInclusionFilter(filterStep); - - return this; - } - - public IndexQuery build() { - return query; - } - - } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/EdgeIndexQuerySearchTerms.java b/services-core/index-service/src/main/java/nu/marginalia/index/index/SearchIndexSearchTerms.java similarity index 76% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/EdgeIndexQuerySearchTerms.java rename to services-core/index-service/src/main/java/nu/marginalia/index/index/SearchIndexSearchTerms.java index df4ff2c7..26fe36f5 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/EdgeIndexQuerySearchTerms.java +++ b/services-core/index-service/src/main/java/nu/marginalia/index/index/SearchIndexSearchTerms.java @@ -1,12 +1,12 @@ -package nu.marginalia.wmsa.edge.index.postings; +package nu.marginalia.index.index; import it.unimi.dsi.fastutil.ints.IntArrayList; import it.unimi.dsi.fastutil.ints.IntComparator; import it.unimi.dsi.fastutil.ints.IntList; import it.unimi.dsi.fastutil.ints.IntOpenHashSet; -public record EdgeIndexQuerySearchTerms(IntList includes, IntList excludes, IntList priority) { - public EdgeIndexQuerySearchTerms() { +public record SearchIndexSearchTerms(IntList includes, IntList excludes, IntList priority) { + public SearchIndexSearchTerms() { this(IntList.of(), IntList.of(), IntList.of()); } diff --git 
a/services-core/index-service/src/main/java/nu/marginalia/index/results/IndexMetadataService.java b/services-core/index-service/src/main/java/nu/marginalia/index/results/IndexMetadataService.java new file mode 100644 index 00000000..af6f100d --- /dev/null +++ b/services-core/index-service/src/main/java/nu/marginalia/index/results/IndexMetadataService.java @@ -0,0 +1,26 @@ +package nu.marginalia.index.results; + +import com.google.inject.Inject; +import nu.marginalia.index.index.SearchIndex; + +public class IndexMetadataService { + private final SearchIndex index; + + @Inject + public IndexMetadataService(SearchIndex index) { + this.index = index; + } + + public long getDocumentMetadata(long urlId) { + return index.getDocumentMetadata(urlId); + } + + public int getDomainId(long urlId) { + return index.getDomainId(urlId); + } + + public long[] getTermMetadata(int termId, long[] docIdsAll) { + return index.getTermMetadata(termId, docIdsAll); + } + +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/query/IndexResultDomainDeduplicator.java b/services-core/index-service/src/main/java/nu/marginalia/index/results/IndexResultDomainDeduplicator.java similarity index 70% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/query/IndexResultDomainDeduplicator.java rename to services-core/index-service/src/main/java/nu/marginalia/index/results/IndexResultDomainDeduplicator.java index 40ed46fc..7f2b69b6 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/query/IndexResultDomainDeduplicator.java +++ b/services-core/index-service/src/main/java/nu/marginalia/index/results/IndexResultDomainDeduplicator.java @@ -1,8 +1,8 @@ -package nu.marginalia.wmsa.edge.index.query; +package nu.marginalia.index.results; import gnu.trove.map.TLongIntMap; import gnu.trove.map.hash.TLongIntHashMap; -import nu.marginalia.wmsa.edge.model.search.EdgeSearchResultItem; +import nu.marginalia.index.client.model.results.EdgeSearchResultItem; public 
class IndexResultDomainDeduplicator { final TLongIntMap resultsByRankingId = new TLongIntHashMap(2048, 0.5f, -1, 0); @@ -12,20 +12,6 @@ public class IndexResultDomainDeduplicator { this.limitByDomain = limitByDomain; } - public boolean filterRawValue(long value) { - int rankingId = (int) (value >>> 32); - - if (rankingId == Integer.MAX_VALUE) { - return true; - } - - return resultsByRankingId.get(getKey(rankingId)) <= limitByDomain; - } - - long getKey(int rankingId) { - return rankingId; - } - public boolean test(long value) { int ranking = (int) (value >>> 32); if (ranking == Integer.MAX_VALUE) { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/IndexResultValuator.java b/services-core/index-service/src/main/java/nu/marginalia/index/results/IndexResultValuator.java similarity index 84% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/IndexResultValuator.java rename to services-core/index-service/src/main/java/nu/marginalia/index/results/IndexResultValuator.java index da2e92f8..a6eb43bc 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/IndexResultValuator.java +++ b/services-core/index-service/src/main/java/nu/marginalia/index/results/IndexResultValuator.java @@ -1,23 +1,25 @@ -package nu.marginalia.wmsa.edge.index.postings; +package nu.marginalia.index.results; import gnu.trove.list.TLongList; import gnu.trove.map.hash.TObjectIntHashMap; import gnu.trove.set.hash.TLongHashSet; import it.unimi.dsi.fastutil.ints.IntArrayList; import it.unimi.dsi.fastutil.longs.Long2LongOpenHashMap; -import nu.marginalia.wmsa.edge.index.model.EdgePageWordFlags; -import nu.marginalia.wmsa.edge.index.model.EdgePageWordMetadata; -import nu.marginalia.wmsa.edge.index.model.QueryStrategy; -import nu.marginalia.wmsa.edge.index.query.IndexQueryParams; -import nu.marginalia.wmsa.edge.model.search.EdgeSearchResultItem; -import nu.marginalia.wmsa.edge.model.search.EdgeSearchResultKeywordScore; -import 
nu.marginalia.wmsa.edge.model.search.EdgeSearchSubquery; +import nu.marginalia.index.svc.SearchTermsService; +import nu.marginalia.model.crawl.EdgePageWordFlags; +import nu.marginalia.model.idx.EdgePageWordMetadata; +import nu.marginalia.index.query.limit.QueryStrategy; +import nu.marginalia.index.client.model.results.EdgeSearchResultItem; +import nu.marginalia.index.client.model.results.EdgeSearchResultKeywordScore; +import nu.marginalia.index.client.model.query.EdgeSearchSubquery; +import nu.marginalia.index.query.IndexQueryParams; import java.util.List; -import java.util.Objects; +import java.util.OptionalInt; public class IndexResultValuator { private final IndexMetadataService metadataService; + private final SearchTermsService searchTermsSvc; private final List> searchTermVariants; private final IndexQueryParams queryParams; private final int[] termIdsAll; @@ -27,21 +29,23 @@ public class IndexResultValuator { private final TObjectIntHashMap termToId = new TObjectIntHashMap<>(10, 0.75f, -1); private final TermMetadata termMetadata; - public IndexResultValuator(SearchIndexControl indexes, TLongList results, List subqueries, IndexQueryParams queryParams) { - this.metadataService = new IndexMetadataService(indexes); + public IndexResultValuator(SearchTermsService searchTermsSvc, + IndexMetadataService metadataService, + TLongList results, + List subqueries, + IndexQueryParams queryParams) { + this.searchTermsSvc = searchTermsSvc; this.searchTermVariants = subqueries.stream().map(sq -> sq.searchTermsInclude).distinct().toList(); this.queryParams = queryParams; + this.metadataService = metadataService; - var lexiconReader = Objects.requireNonNull(indexes.getLexiconReader()); IntArrayList termIdsList = new IntArrayList(); searchTermVariants.stream().flatMap(List::stream).distinct().forEach(term -> { - int id = lexiconReader.get(term); - - if (id >= 0) { + searchTermsSvc.lookUpWord(term).ifPresent(id -> { termIdsList.add(id); termToId.put(term, id); - } + }); }); 
final long[] resultsArray = results.toArray(); @@ -53,8 +57,9 @@ public class IndexResultValuator { subqueries.stream() .flatMap(sq -> sq.searchTermsPriority.stream()) .distinct() - .mapToInt(lexiconReader::get) - .filter(id -> id >= 0) + .map(searchTermsSvc::lookUpWord) + .filter(OptionalInt::isPresent) + .mapToInt(OptionalInt::getAsInt) .toArray(); resultsWithPriorityTerms = new TLongHashSet(results.size()); diff --git a/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexOpsService.java b/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexOpsService.java new file mode 100644 index 00000000..36561dc7 --- /dev/null +++ b/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexOpsService.java @@ -0,0 +1,77 @@ +package nu.marginalia.index.svc; + +import com.google.inject.Inject; +import com.google.inject.Singleton; +import nu.marginalia.index.index.SearchIndex; +import spark.Request; +import spark.Response; +import spark.Spark; + +import javax.annotation.CheckReturnValue; +import java.util.Optional; +import java.util.concurrent.Callable; +import java.util.concurrent.locks.ReentrantLock; + +@Singleton +public class IndexOpsService { + private final ReentrantLock opsLock = new ReentrantLock(); + + private final SearchIndex index; + private final IndexSearchSetsService searchSetService; + + @Inject + public IndexOpsService(SearchIndex index, + IndexSearchSetsService searchSetService) { + this.index = index; + this.searchSetService = searchSetService; + } + + public boolean isBusy() { + return opsLock.isLocked(); + } + + public Object repartitionEndpoint(Request request, Response response) throws Exception { + + if (!run(searchSetService::recalculateAll)) { + Spark.halt(503, "Operations busy"); + } + return "OK"; + } + + public Object reindexEndpoint(Request request, Response response) throws Exception { + if (!run(index::switchIndex).isPresent()) { + Spark.halt(503, "Operations busy"); + } + return "OK"; + } + + + + 
@CheckReturnValue + private Optional run(Callable c) throws Exception { + if (!opsLock.tryLock()) + return Optional.empty(); + try { + return Optional.of(c.call()); + } + finally { + opsLock.unlock(); + } + } + + + @CheckReturnValue + private boolean run(Runnable r) { + if (!opsLock.tryLock()) + return false; + try { + r.run(); + return true; + } + finally { + opsLock.unlock(); + } + } + +} + diff --git a/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexQueryService.java b/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexQueryService.java new file mode 100644 index 00000000..6a46554c --- /dev/null +++ b/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexQueryService.java @@ -0,0 +1,294 @@ +package nu.marginalia.index.svc; + +import com.google.gson.Gson; +import com.google.inject.Inject; +import com.google.inject.Singleton; +import gnu.trove.list.TLongList; +import gnu.trove.list.array.TLongArrayList; +import gnu.trove.set.hash.TLongHashSet; +import io.prometheus.client.Counter; +import io.prometheus.client.Gauge; +import io.prometheus.client.Histogram; +import nu.marginalia.index.client.model.results.EdgeSearchResultItem; +import nu.marginalia.index.client.model.results.EdgeSearchResultSet; +import nu.marginalia.index.client.model.query.EdgeSearchSpecification; +import nu.marginalia.index.client.model.query.EdgeSearchSubquery; +import nu.marginalia.array.buffer.LongQueryBuffer; +import nu.marginalia.index.index.SearchIndex; +import nu.marginalia.index.index.SearchIndexSearchTerms; +import nu.marginalia.index.results.IndexMetadataService; +import nu.marginalia.index.searchset.SearchSet; +import nu.marginalia.index.results.IndexResultValuator; +import nu.marginalia.index.query.IndexQuery; +import nu.marginalia.index.results.IndexResultDomainDeduplicator; +import nu.marginalia.index.query.IndexQueryParams; +import nu.marginalia.index.query.IndexSearchBudget; +import 
nu.marginalia.index.svc.searchset.SmallSearchSet; +import nu.marginalia.model.gson.GsonFactory; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.slf4j.Marker; +import org.slf4j.MarkerFactory; +import spark.HaltException; +import spark.Request; +import spark.Response; +import spark.Spark; + +import java.util.ArrayList; +import java.util.List; + +import static java.util.Comparator.comparingDouble; + +@Singleton +public class IndexQueryService { + + private final Logger logger = LoggerFactory.getLogger(getClass()); + + private final Marker queryMarker = MarkerFactory.getMarker("QUERY"); + + + private static final Counter wmsa_edge_index_query_timeouts = Counter.build().name("wmsa_edge_index_query_timeouts").help("-").register(); + private static final Gauge wmsa_edge_index_query_cost = Gauge.build().name("wmsa_edge_index_query_cost").help("-").register(); + private static final Histogram wmsa_edge_index_query_time = Histogram.build().name("wmsa_edge_index_query_time").linearBuckets(25/1000., 25/1000., 15).help("-").register(); + + private final Gson gson = GsonFactory.get(); + + private final SearchIndex index; + private final IndexSearchSetsService searchSetsService; + + private final IndexMetadataService metadataService; + private final SearchTermsService searchTermsSvc; + + + @Inject + public IndexQueryService(SearchIndex index, + IndexSearchSetsService searchSetsService, + IndexMetadataService metadataService, + SearchTermsService searchTerms) { + this.index = index; + this.searchSetsService = searchSetsService; + this.metadataService = metadataService; + this.searchTermsSvc = searchTerms; + } + + public Object search(Request request, Response response) { + String json = request.body(); + EdgeSearchSpecification specsSet = gson.fromJson(json, EdgeSearchSpecification.class); + + try { + return wmsa_edge_index_query_time.time(() -> { + var params = new SearchParameters(specsSet, getSearchSet(specsSet)); + + List results = executeSearch(params); 
+ logger.info(queryMarker, "Index Result Count: {}", results.size()); + + wmsa_edge_index_query_cost.set(params.getDataCost()); + if (!params.hasTimeLeft()) { + wmsa_edge_index_query_timeouts.inc(); + } + + return new EdgeSearchResultSet(results); + }); + } + catch (HaltException ex) { + logger.warn("Halt", ex); + throw ex; + } + catch (Exception ex) { + logger.info("Error during search {}({}) (query: {})", ex.getClass().getSimpleName(), ex.getMessage(), json); + logger.info("Error", ex); + Spark.halt(500, "Error"); + return null; + } + } + + private SearchSet getSearchSet(EdgeSearchSpecification specsSet) { + if (specsSet.domains != null && !specsSet.domains.isEmpty()) { + return new SmallSearchSet(specsSet.domains); + } + + return searchSetsService.getSearchSetByName(specsSet.searchSetIdentifier); + } + + private List executeSearch(SearchParameters params) { + var resultIds = evaluateSubqueries(params); + + var resultItems = calculateResultScores(params, resultIds); + + return selectBestResults(params, resultItems); + } + + private TLongList evaluateSubqueries(SearchParameters params) { + final TLongList results = new TLongArrayList(params.fetchSize); + + for (var sq : params.subqueries) { + final SearchIndexSearchTerms searchTerms = searchTermsSvc.getSearchTerms(sq); + + if (searchTerms.isEmpty()) { + continue; + } + + results.addAll( + executeSubquery(searchTerms, params) + ); + + if (!params.hasTimeLeft()) { + logger.info("Query timed out {}, ({}), -{}", + sq.searchTermsInclude, sq.searchTermsAdvice, sq.searchTermsExclude); + break; + } + } + + return results; + } + + private TLongArrayList executeSubquery(SearchIndexSearchTerms terms, SearchParameters params) + { + final TLongArrayList results = new TLongArrayList(params.fetchSize); + final LongQueryBuffer buffer = new LongQueryBuffer(params.fetchSize); + + IndexQuery query = params.createIndexQuery(index, terms); + + while (query.hasMore() + && results.size() < params.fetchSize + && 
params.budget.hasTimeLeft()) + { + buffer.reset(); + query.getMoreResults(buffer); + + for (int i = 0; i < buffer.size() && results.size() < params.fetchSize; i++) { + results.add(buffer.data[i]); + } + } + + params.dataCost += query.dataCost(); + + return results; + } + + private ArrayList calculateResultScores(SearchParameters params, TLongList results) { + + final var evaluator = new IndexResultValuator( + searchTermsSvc, + metadataService, + results, + params.subqueries, + params.queryParams); + + ArrayList items = new ArrayList<>(results.size()); + ArrayList refusedItems = new ArrayList<>(results.size()); + + // Sorting the result ids results in better paging characteristics + results.sort(); + + results.forEach(id -> { + var item = evaluator.evaluateResult(id); + + // Score value is zero when the best params variant consists of low-value terms that are just scattered + // throughout the document, with no indicators of importance associated with them. + if (item.getScoreValue() < 0) { + items.add(item); + } + else { + refusedItems.add(item); + } + + return true; + }); + + if (items.isEmpty()) { + items.addAll(refusedItems); + } + + return items; + } + + private List selectBestResults(SearchParameters params, List results) { + + var domainCountFilter = new IndexResultDomainDeduplicator(params.limitByDomain); + + results.sort(comparingDouble(EdgeSearchResultItem::getScore) + .thenComparingInt(EdgeSearchResultItem::getRanking) + .thenComparingInt(EdgeSearchResultItem::getUrlIdInt)); + + List resultsList = new ArrayList<>(results.size()); + + for (var item : results) { + if (domainCountFilter.test(item)) { + resultsList.add(item); + } + } + + if (resultsList.size() > params.limitTotal) { + // This can't be made a stream limit() operation because we need domainCountFilter + // to run over the entire list to provide accurate statistics + + resultsList.subList(params.limitTotal, resultsList.size()).clear(); + } + + // populate results with the total number of results 
encountered from + // the same domain so this information can be presented to the user + for (var result : resultsList) { + result.resultsFromDomain = domainCountFilter.getCount(result); + } + + return resultsList; + } + +} + +class SearchParameters { + /** This is how many results matching the keywords we'll try to get + before evaluating them for the best result. */ + final int fetchSize; + final IndexSearchBudget budget; + final List subqueries; + final IndexQueryParams queryParams; + + final int limitByDomain; + final int limitTotal; + + // mutable: + + /** An estimate of how much data has been read */ + long dataCost = 0; + + /** A set of id:s considered during each subquery, + * for deduplication + */ + final TLongHashSet consideredUrlIds; + + public SearchParameters(EdgeSearchSpecification specsSet, SearchSet searchSet) { + var limits = specsSet.queryLimits; + + this.fetchSize = limits.fetchSize(); + this.budget = new IndexSearchBudget(limits.timeoutMs()); + this.subqueries = specsSet.subqueries; + this.limitByDomain = limits.resultsByDomain(); + this.limitTotal = limits.resultsTotal(); + + this.consideredUrlIds = new TLongHashSet(fetchSize * 4); + + queryParams = new IndexQueryParams( + specsSet.quality, + specsSet.year, + specsSet.size, + specsSet.rank, + searchSet, + specsSet.queryStrategy); + } + + IndexQuery createIndexQuery(SearchIndex index, SearchIndexSearchTerms terms) { + return index.createQuery(terms, queryParams, consideredUrlIds::add); + } + + boolean hasTimeLeft() { + return budget.hasTimeLeft(); + } + + long getDataCost() { + return dataCost; + } + +} + diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexSearchSetsService.java b/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexSearchSetsService.java similarity index 76% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexSearchSetsService.java rename to 
services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexSearchSetsService.java index 5e9a3114..68012330 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexSearchSetsService.java +++ b/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexSearchSetsService.java @@ -1,28 +1,28 @@ -package nu.marginalia.wmsa.edge.index.svc; +package nu.marginalia.index.svc; import com.google.inject.Inject; import com.google.inject.Singleton; import lombok.SneakyThrows; -import nu.marginalia.wmsa.edge.index.ranking.ReversePageRank; -import nu.marginalia.wmsa.edge.index.ranking.StandardPageRank; -import nu.marginalia.wmsa.edge.index.ranking.accumulator.RankingResultHashMapAccumulator; -import nu.marginalia.wmsa.edge.index.ranking.data.RankingDomainFetcher; -import nu.marginalia.wmsa.edge.index.ranking.accumulator.RankingResultBitSetAccumulator; -import nu.marginalia.wmsa.edge.index.IndexServicesFactory; -import nu.marginalia.wmsa.edge.index.config.RankingSettings; -import nu.marginalia.wmsa.edge.index.postings.DomainRankings; -import nu.marginalia.wmsa.edge.index.ranking.data.RankingDomainFetcherForSimilarityData; -import nu.marginalia.wmsa.edge.index.svc.searchset.RankingSearchSet; -import nu.marginalia.wmsa.edge.index.svc.searchset.SearchSet; -import nu.marginalia.wmsa.edge.index.svc.searchset.SearchSetAny; -import nu.marginalia.wmsa.edge.index.svc.searchset.SearchSetIdentifier; +import nu.marginalia.index.IndexServicesFactory; +import nu.marginalia.index.searchset.SearchSet; +import nu.marginalia.ranking.ReversePageRank; +import nu.marginalia.ranking.StandardPageRank; +import nu.marginalia.ranking.accumulator.RankingResultBitSetAccumulator; +import nu.marginalia.ranking.accumulator.RankingResultHashMapAccumulator; +import nu.marginalia.ranking.data.RankingDomainFetcher; +import nu.marginalia.ranking.data.RankingDomainFetcherForSimilarityData; +import nu.marginalia.index.svc.searchset.RankingSearchSet; +import 
nu.marginalia.index.svc.searchset.SearchSetAny; +import nu.marginalia.index.config.RankingSettings; +import nu.marginalia.ranking.DomainRankings; +import nu.marginalia.index.client.model.query.SearchSetIdentifier; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.IOException; @Singleton -public class EdgeIndexSearchSetsService { +public class IndexSearchSetsService { private final Logger logger = LoggerFactory.getLogger(getClass()); private final RankingDomainFetcher rankingDomains; private final RankingDomainFetcher similarityDomains; @@ -39,10 +39,10 @@ public class EdgeIndexSearchSetsService { private volatile DomainRankings domainRankings = new DomainRankings(); @Inject - public EdgeIndexSearchSetsService(RankingDomainFetcher rankingDomains, - RankingDomainFetcherForSimilarityData similarityDomains, - RankingSettings rankingSettings, - IndexServicesFactory servicesFactory) throws IOException { + public IndexSearchSetsService(RankingDomainFetcher rankingDomains, + RankingDomainFetcherForSimilarityData similarityDomains, + RankingSettings rankingSettings, + IndexServicesFactory servicesFactory) throws IOException { this.rankingDomains = rankingDomains; diff --git a/services-core/index-service/src/main/java/nu/marginalia/index/svc/SearchTermsService.java b/services-core/index-service/src/main/java/nu/marginalia/index/svc/SearchTermsService.java new file mode 100644 index 00000000..0ac7cced --- /dev/null +++ b/services-core/index-service/src/main/java/nu/marginalia/index/svc/SearchTermsService.java @@ -0,0 +1,67 @@ +package nu.marginalia.index.svc; + +import com.google.inject.Inject; +import com.google.inject.Singleton; +import it.unimi.dsi.fastutil.ints.IntArrayList; +import it.unimi.dsi.fastutil.ints.IntList; +import nu.marginalia.dict.OffHeapDictionaryHashMap; +import nu.marginalia.index.client.model.query.EdgeSearchSubquery; +import nu.marginalia.index.index.SearchIndexSearchTerms; +import nu.marginalia.lexicon.KeywordLexiconReadOnlyView; 
+import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.OptionalInt; + +@Singleton +public class SearchTermsService { + private final Logger logger = LoggerFactory.getLogger(getClass()); + private final KeywordLexiconReadOnlyView lexicon; + + @Inject + public SearchTermsService(KeywordLexiconReadOnlyView lexicon) { + this.lexicon = lexicon; + } + + public SearchIndexSearchTerms getSearchTerms(EdgeSearchSubquery request) { + final IntList excludes = new IntArrayList(); + final IntList includes = new IntArrayList(); + final IntList priority = new IntArrayList(); + + for (var include : request.searchTermsInclude) { + var word = lookUpWord(include); + if (word.isEmpty()) { + logger.info("Unknown search term: " + include); + return new SearchIndexSearchTerms(); + } + includes.add(word.getAsInt()); + } + + for (var advice : request.searchTermsAdvice) { + var word = lookUpWord(advice); + if (word.isEmpty()) { + logger.info("Unknown search term: " + advice); + return new SearchIndexSearchTerms(); + } + includes.add(word.getAsInt()); + } + + for (var exclude : request.searchTermsExclude) { + lookUpWord(exclude).ifPresent(excludes::add); + } + for (var exclude : request.searchTermsPriority) { + lookUpWord(exclude).ifPresent(priority::add); + } + + return new SearchIndexSearchTerms(includes, excludes, priority); + } + + + public OptionalInt lookUpWord(String s) { + int ret = lexicon.get(s); + if (ret == OffHeapDictionaryHashMap.NO_VALUE) { + return OptionalInt.empty(); + } + return OptionalInt.of(ret); + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/searchset/RankingSearchSet.java b/services-core/index-service/src/main/java/nu/marginalia/index/svc/searchset/RankingSearchSet.java similarity index 79% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/searchset/RankingSearchSet.java rename to services-core/index-service/src/main/java/nu/marginalia/index/svc/searchset/RankingSearchSet.java index 
7ce90c73..22ceab15 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/searchset/RankingSearchSet.java +++ b/services-core/index-service/src/main/java/nu/marginalia/index/svc/searchset/RankingSearchSet.java @@ -1,6 +1,10 @@ -package nu.marginalia.wmsa.edge.index.svc.searchset; +package nu.marginalia.index.svc.searchset; +import nu.marginalia.index.client.model.query.SearchSetIdentifier; +import nu.marginalia.index.searchset.SearchSet; import org.roaringbitmap.RoaringBitmap; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import java.io.DataInputStream; import java.io.DataOutputStream; @@ -16,6 +20,7 @@ import java.nio.file.StandardOpenOption; * */ public class RankingSearchSet implements SearchSet { + private final Logger logger = LoggerFactory.getLogger(getClass()); private final RoaringBitmap set; public final SearchSetIdentifier identifier; public final Path source; @@ -36,6 +41,10 @@ public class RankingSearchSet implements SearchSet { else { set = load(source); } + + if (set.isEmpty()) { + logger.warn("Search set {} is empty", identifier); + } } private static RoaringBitmap load(Path source) throws IOException { @@ -53,7 +62,9 @@ public class RankingSearchSet implements SearchSet { @Override public boolean contains(int urlId) { - return set.contains(urlId); + // Fallback on allow-all if no items are in set + + return set.contains(urlId) || set.isEmpty(); } public void write() throws IOException { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/searchset/SearchSetAny.java b/services-core/index-service/src/main/java/nu/marginalia/index/svc/searchset/SearchSetAny.java similarity index 71% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/searchset/SearchSetAny.java rename to services-core/index-service/src/main/java/nu/marginalia/index/svc/searchset/SearchSetAny.java index dabebb8a..63b433ac 100644 --- 
a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/searchset/SearchSetAny.java +++ b/services-core/index-service/src/main/java/nu/marginalia/index/svc/searchset/SearchSetAny.java @@ -1,4 +1,6 @@ -package nu.marginalia.wmsa.edge.index.svc.searchset; +package nu.marginalia.index.svc.searchset; + +import nu.marginalia.index.searchset.SearchSet; public class SearchSetAny implements SearchSet { @Override diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/searchset/SmallSearchSet.java b/services-core/index-service/src/main/java/nu/marginalia/index/svc/searchset/SmallSearchSet.java similarity index 85% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/searchset/SmallSearchSet.java rename to services-core/index-service/src/main/java/nu/marginalia/index/svc/searchset/SmallSearchSet.java index 8f1e8e9a..8d261df8 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/searchset/SmallSearchSet.java +++ b/services-core/index-service/src/main/java/nu/marginalia/index/svc/searchset/SmallSearchSet.java @@ -1,6 +1,7 @@ -package nu.marginalia.wmsa.edge.index.svc.searchset; +package nu.marginalia.index.svc.searchset; import gnu.trove.set.hash.TIntHashSet; +import nu.marginalia.index.searchset.SearchSet; import java.util.Arrays; import java.util.Collection; diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/model/EdgePageDocumentsMetadataTest.java b/services-core/index-service/src/test/java/nu/marginalia/index/model/EdgePageDocumentsMetadataTest.java similarity index 94% rename from marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/model/EdgePageDocumentsMetadataTest.java rename to services-core/index-service/src/test/java/nu/marginalia/index/model/EdgePageDocumentsMetadataTest.java index a3552a85..0bfe142f 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/model/EdgePageDocumentsMetadataTest.java +++ 
b/services-core/index-service/src/test/java/nu/marginalia/index/model/EdgePageDocumentsMetadataTest.java @@ -1,6 +1,8 @@ -package nu.marginalia.wmsa.edge.index.model; +package nu.marginalia.index.model; +import nu.marginalia.model.crawl.EdgePageDocumentFlags; +import nu.marginalia.model.idx.EdgePageDocumentsMetadata; import org.junit.jupiter.api.Test; import java.util.EnumSet; diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/model/RankingSettingsTest.java b/services-core/index-service/src/test/java/nu/marginalia/index/model/RankingSettingsTest.java similarity index 93% rename from marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/model/RankingSettingsTest.java rename to services-core/index-service/src/test/java/nu/marginalia/index/model/RankingSettingsTest.java index a8a0c17f..f49d13d8 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/model/RankingSettingsTest.java +++ b/services-core/index-service/src/test/java/nu/marginalia/index/model/RankingSettingsTest.java @@ -1,6 +1,6 @@ -package nu.marginalia.wmsa.edge.index.model; +package nu.marginalia.index.model; -import nu.marginalia.wmsa.edge.index.config.RankingSettings; +import nu.marginalia.index.config.RankingSettings; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; @@ -10,7 +10,7 @@ import java.nio.file.Files; import java.nio.file.Path; import java.util.List; -import static org.junit.jupiter.api.Assertions.*; +import static org.junit.jupiter.api.Assertions.assertEquals; class RankingSettingsTest { diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/EdgeIndexIntegrationTest.java b/services-core/index-service/src/test/java/nu/marginalia/index/service/EdgeIndexIntegrationTest.java similarity index 76% rename from marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/EdgeIndexIntegrationTest.java rename to 
services-core/index-service/src/test/java/nu/marginalia/index/service/EdgeIndexIntegrationTest.java index 0ef29e64..01af6363 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/EdgeIndexIntegrationTest.java +++ b/services-core/index-service/src/test/java/nu/marginalia/index/service/EdgeIndexIntegrationTest.java @@ -1,38 +1,12 @@ -package nu.marginalia.wmsa.edge.index.service; +package nu.marginalia.index.service; -import com.google.inject.Guice; -import com.google.inject.Inject; -import nu.marginalia.wmsa.configuration.server.Initialization; -import nu.marginalia.wmsa.edge.index.model.*; -import nu.marginalia.wmsa.edge.index.postings.SearchIndexControl; -import nu.marginalia.wmsa.edge.index.postings.journal.model.SearchIndexJournalEntry; -import nu.marginalia.wmsa.edge.index.postings.journal.model.SearchIndexJournalEntryHeader; -import nu.marginalia.wmsa.edge.index.svc.EdgeIndexLexiconService; -import nu.marginalia.wmsa.edge.index.svc.EdgeIndexOpsService; -import nu.marginalia.wmsa.edge.index.svc.EdgeIndexQueryService; -import nu.marginalia.wmsa.edge.index.svc.searchset.SearchSetIdentifier; -import nu.marginalia.wmsa.edge.model.search.EdgeSearchResultItem; -import nu.marginalia.wmsa.edge.model.search.EdgeSearchSpecification; -import nu.marginalia.wmsa.edge.model.search.EdgeSearchSubquery; -import nu.marginalia.wmsa.edge.model.search.domain.SpecificationLimit; -import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.Assertions; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; import org.junit.jupiter.api.parallel.Execution; -import spark.Spark; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.Collections; -import java.util.EnumSet; -import java.util.List; -import java.util.stream.IntStream; import static org.junit.jupiter.api.parallel.ExecutionMode.SAME_THREAD; @Execution(SAME_THREAD) public class EdgeIndexIntegrationTest { +/* FIXME @Inject Initialization 
initialization; @@ -164,7 +138,7 @@ public class EdgeIndexIntegrationTest { long fullId = id | ((long) (32 - (id % 32)) << 32); - var header = new SearchIndexJournalEntryHeader(factors.length, fullId, new EdgePageDocumentsMetadata(0, 0, 0, id % 5, id, id % 20, (byte) 0).encode()); + var header = new IndexJournalEntryHeader(factors.length, fullId, new EdgePageDocumentsMetadata(0, 0, 0, id % 5, id, id % 20, (byte) 0).encode()); long[] data = new long[factors.length*2]; for (int i = 0; i < factors.length; i++) { @@ -172,12 +146,12 @@ public class EdgeIndexIntegrationTest { data[2*i + 1] = new EdgePageWordMetadata(i, i, i, EnumSet.of(EdgePageWordFlags.Title)).encode(); } - lexiconService.putWords(0, header, new SearchIndexJournalEntry(data)); + lexiconService.putWords(0, header, new IndexJournalEntryData(data)); } public void loadDataWithDomain(int domain, int id) { int[] factors = IntStream.rangeClosed(1, id).filter(v -> (id % v) == 0).toArray(); - var header = new SearchIndexJournalEntryHeader(factors.length, id | ((long) domain << 32), EdgePageDocumentsMetadata.defaultValue()); + var header = new IndexJournalEntryHeader(factors.length, id | ((long) domain << 32), EdgePageDocumentsMetadata.defaultValue()); long[] data = new long[factors.length*2]; for (int i = 0; i < factors.length; i++) { @@ -185,7 +159,7 @@ public class EdgeIndexIntegrationTest { data[2*i + 1] = new EdgePageWordMetadata(i % 20, i, i, EnumSet.of(EdgePageWordFlags.Title)).encode(); } - lexiconService.putWords(0, header, new SearchIndexJournalEntry(data)); + lexiconService.putWords(0, header, new IndexJournalEntryData(data)); } - +*/ } diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/EdgeIndexIntegrationTestModule.java b/services-core/index-service/src/test/java/nu/marginalia/index/service/EdgeIndexIntegrationTestModule.java similarity index 73% rename from marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/EdgeIndexIntegrationTestModule.java rename to 
services-core/index-service/src/test/java/nu/marginalia/index/service/EdgeIndexIntegrationTestModule.java index aaa44c35..bf816da3 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/EdgeIndexIntegrationTestModule.java +++ b/services-core/index-service/src/test/java/nu/marginalia/index/service/EdgeIndexIntegrationTestModule.java @@ -1,13 +1,13 @@ -package nu.marginalia.wmsa.edge.index.service; +package nu.marginalia.index.service; import com.google.inject.AbstractModule; import com.google.inject.name.Names; -import nu.marginalia.util.test.TestUtil; -import nu.marginalia.wmsa.edge.index.IndexServicesFactory; -import nu.marginalia.wmsa.edge.index.postings.DomainRankings; -import nu.marginalia.wmsa.edge.index.svc.EdgeIndexSearchSetsService; -import nu.marginalia.wmsa.edge.index.svc.searchset.SearchSetAny; -import nu.marginalia.wmsa.edge.index.svc.searchset.SearchSetIdentifier; +import nu.marginalia.index.IndexServicesFactory; +import nu.marginalia.ranking.DomainRankings; +import nu.marginalia.index.svc.IndexSearchSetsService; +import nu.marginalia.index.svc.searchset.SearchSetAny; +import nu.marginalia.index.util.TestUtil; +import nu.marginalia.index.client.model.query.SearchSetIdentifier; import org.mockito.Mockito; import java.io.IOException; @@ -46,11 +46,11 @@ public class EdgeIndexIntegrationTestModule extends AbstractModule { slowDir, fastDir )); - EdgeIndexSearchSetsService setsServiceMock = Mockito.mock(EdgeIndexSearchSetsService.class); + IndexSearchSetsService setsServiceMock = Mockito.mock(IndexSearchSetsService.class); when(setsServiceMock.getSearchSetByName(SearchSetIdentifier.NONE)).thenReturn(new SearchSetAny()); when(setsServiceMock.getDomainRankings()).thenReturn(new DomainRankings()); - bind(EdgeIndexSearchSetsService.class).toInstance(setsServiceMock); + bind(IndexSearchSetsService.class).toInstance(setsServiceMock); bind(String.class).annotatedWith(Names.named("service-host")).toInstance("127.0.0.1"); 
bind(Integer.class).annotatedWith(Names.named("service-port")).toProvider(this::randomPort); diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/util/DictionaryDataTest.java b/services-core/index-service/src/test/java/nu/marginalia/index/service/util/DictionaryDataTest.java similarity index 91% rename from marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/util/DictionaryDataTest.java rename to services-core/index-service/src/test/java/nu/marginalia/index/service/util/DictionaryDataTest.java index a88715a2..92f134cf 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/util/DictionaryDataTest.java +++ b/services-core/index-service/src/test/java/nu/marginalia/index/service/util/DictionaryDataTest.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.index.service.util; +package nu.marginalia.index.service.util; class DictionaryDataTest { diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/util/DictionaryHashMapTest.java b/services-core/index-service/src/test/java/nu/marginalia/index/service/util/DictionaryHashMapTest.java similarity index 96% rename from marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/util/DictionaryHashMapTest.java rename to services-core/index-service/src/test/java/nu/marginalia/index/service/util/DictionaryHashMapTest.java index c39d5c03..a290f33f 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/util/DictionaryHashMapTest.java +++ b/services-core/index-service/src/test/java/nu/marginalia/index/service/util/DictionaryHashMapTest.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.index.service.util; +package nu.marginalia.index.service.util; class DictionaryHashMapTest { // diff --git a/services-core/index-service/src/test/java/nu/marginalia/index/service/util/PrimeUtilTest.java b/services-core/index-service/src/test/java/nu/marginalia/index/service/util/PrimeUtilTest.java new file mode 100644 index 
00000000..c11d9719 --- /dev/null +++ b/services-core/index-service/src/test/java/nu/marginalia/index/service/util/PrimeUtilTest.java @@ -0,0 +1,30 @@ +package nu.marginalia.index.service.util; + +import nu.marginalia.util.PrimeUtil; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; + +class PrimeUtilTest { + + @Test + void isPrime() { + Assertions.assertTrue(PrimeUtil.isPrime(1)); + Assertions.assertTrue(PrimeUtil.isPrime(2)); + Assertions.assertTrue(PrimeUtil.isPrime(3)); + Assertions.assertFalse(PrimeUtil.isPrime(4)); + Assertions.assertTrue(PrimeUtil.isPrime(5)); + Assertions.assertFalse(PrimeUtil.isPrime(6)); + Assertions.assertTrue(PrimeUtil.isPrime(7)); + Assertions.assertFalse(PrimeUtil.isPrime(8)); + Assertions.assertFalse(PrimeUtil.isPrime(9)); + Assertions.assertFalse(PrimeUtil.isPrime(10)); + Assertions.assertTrue(PrimeUtil.isPrime(11)); + } + + @Test + void nextPrime() { + System.out.println(PrimeUtil.nextPrime(1L<<31, -1)); + System.out.println(PrimeUtil.nextPrime(1L<<31, 1)); + + } +} \ No newline at end of file diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/util/RandomWriteFunnelTest.java b/services-core/index-service/src/test/java/nu/marginalia/index/service/util/RandomWriteFunnelTest.java similarity index 98% rename from marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/util/RandomWriteFunnelTest.java rename to services-core/index-service/src/test/java/nu/marginalia/index/service/util/RandomWriteFunnelTest.java index deb5c992..c2f907bb 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/util/RandomWriteFunnelTest.java +++ b/services-core/index-service/src/test/java/nu/marginalia/index/service/util/RandomWriteFunnelTest.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.index.service.util; +package nu.marginalia.index.service.util; import nu.marginalia.util.RandomWriteFunnel; import org.junit.jupiter.api.Test; diff --git 
a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/svc/searchset/RankingSearchSetTest.java b/services-core/index-service/src/test/java/nu/marginalia/index/svc/searchset/RankingSearchSetTest.java similarity index 83% rename from marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/svc/searchset/RankingSearchSetTest.java rename to services-core/index-service/src/test/java/nu/marginalia/index/svc/searchset/RankingSearchSetTest.java index effa7a1f..214596d3 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/svc/searchset/RankingSearchSetTest.java +++ b/services-core/index-service/src/test/java/nu/marginalia/index/svc/searchset/RankingSearchSetTest.java @@ -1,5 +1,6 @@ -package nu.marginalia.wmsa.edge.index.svc.searchset; +package nu.marginalia.index.svc.searchset; +import nu.marginalia.index.client.model.query.SearchSetIdentifier; import org.junit.jupiter.api.Test; import org.roaringbitmap.RoaringBitmap; @@ -7,7 +8,7 @@ import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; -import static org.junit.jupiter.api.Assertions.*; +import static org.junit.jupiter.api.Assertions.assertTrue; class RankingSearchSetTest { diff --git a/services-core/index-service/src/test/java/nu/marginalia/index/util/TestUtil.java b/services-core/index-service/src/test/java/nu/marginalia/index/util/TestUtil.java new file mode 100644 index 00000000..ef80181e --- /dev/null +++ b/services-core/index-service/src/test/java/nu/marginalia/index/util/TestUtil.java @@ -0,0 +1,50 @@ +package nu.marginalia.index.util; + + +import java.io.File; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.Arrays; + +public class TestUtil { + private static boolean isTempDir(Path dir) { + return dir.startsWith("/tmp") || dir.toString().contains("tmp"); + } + + public static void clearTempDir(Path dir) { + if (!isTempDir(dir)) { + throw new IllegalArgumentException("Refusing to recursively delete directory 
with that name"); + } + if (Files.isDirectory(dir)) { + for (File f : dir.toFile().listFiles()) { + File[] files = f.listFiles(); + if (files != null) { + Arrays.stream(files).map(File::toPath).forEach(TestUtil::clearTempDir); + } + System.out.println("Deleting " + f + " (" + fileSize(f.toPath()) + ")"); + f.delete(); + } + } + System.out.println("Deleting " + dir); + dir.toFile().delete(); + } + + private static String fileSize(Path path) { + try { + long sizeBytes = Files.size(path); + + if (sizeBytes > 1024 * 1024 * 1024) return round(sizeBytes / 1073741824.) + "Gb"; + if (sizeBytes > 1024 * 1024) return round(sizeBytes / 1048576.) + "Mb"; + if (sizeBytes > 1024) return round(sizeBytes / 1024.) + "Kb"; + return sizeBytes + "b"; + } + catch (IOException ex) { + throw new RuntimeException(ex); + } + } + + private static String round(double d) { + return String.format("%.2f", d); + } +} diff --git a/services-core/readme.md b/services-core/readme.md new file mode 100644 index 00000000..1af591d3 --- /dev/null +++ b/services-core/readme.md @@ -0,0 +1,12 @@ +# Core Services + +The core services constitute the main functionality of the search engine. + +* The [search-service](search-service/) parses queries, interrogates the index-service, + and decorates search results with metadata from the database. + +* The [index-service](index-service/) contains the indexes, it answers questions about + which documents contain which terms. + +* The [assistant-service](assistant-service/) helps the search service with spelling + suggestions and other peripheral functionality. 
\ No newline at end of file diff --git a/services-core/search-service/build.gradle b/services-core/search-service/build.gradle new file mode 100644 index 00000000..816f2066 --- /dev/null +++ b/services-core/search-service/build.gradle @@ -0,0 +1,76 @@ +plugins { + id 'java' + id "io.freefair.lombok" version "5.3.3.3" + id 'com.palantir.docker' version '0.34.0' + id 'application' + id 'jvm-test-suite' +} + +application { + mainClass = 'nu.marginalia.search.SearchMain' + applicationName = 'search-service' +} + +apply from: "$rootProject.projectDir/docker-service.gradle" + +java { + toolchain { + languageVersion.set(JavaLanguageVersion.of(17)) + } +} +dependencies { + implementation project(':third-party') + implementation project(':protocol') + implementation project(':common:model') + implementation project(':common:service') + implementation project(':common:config') + implementation project(':index:index-query') + + implementation project(':libraries:misc') + implementation project(':libraries:language-processing') + + implementation project(':api:assistant-api') + implementation project(':api:index-api') + implementation project(':api:search-api') + implementation project(':common:service-discovery') + implementation project(':common:service-client') + + implementation project(':features:renderer') + implementation project(':features:screenshots') + implementation project(':features:random-websites') + implementation project(':features:query-parser') + + + implementation libs.lombok + annotationProcessor libs.lombok + implementation libs.bundles.slf4j + + implementation libs.prometheus + implementation libs.notnull + implementation libs.guice + implementation libs.rxjava + implementation libs.spark + implementation libs.opencsv + implementation libs.trove + implementation libs.fastutil + implementation libs.bundles.gson + implementation libs.bundles.mariadb + implementation libs.bundles.nlp + + testImplementation libs.bundles.slf4j.test + testImplementation 
libs.bundles.junit + testImplementation libs.mockito + +} +test { + maxHeapSize = "8G" + useJUnitPlatform() +} + +task fastTests(type: Test) { + maxHeapSize = "8G" + useJUnitPlatform { + excludeTags "slow" + } +} + diff --git a/services-core/search-service/src/main/java/nu/marginalia/search/SearchMain.java b/services-core/search-service/src/main/java/nu/marginalia/search/SearchMain.java new file mode 100644 index 00000000..f652352a --- /dev/null +++ b/services-core/search-service/src/main/java/nu/marginalia/search/SearchMain.java @@ -0,0 +1,38 @@ +package nu.marginalia.search; + +import com.google.inject.Guice; +import com.google.inject.Inject; +import com.google.inject.Injector; +import nu.marginalia.service.MainClass; +import nu.marginalia.service.SearchServiceDescriptors; +import nu.marginalia.service.id.ServiceId; +import nu.marginalia.service.module.ConfigurationModule; +import nu.marginalia.service.module.DatabaseModule; +import nu.marginalia.service.server.Initialization; +import spark.Spark; + +public class SearchMain extends MainClass { + private final SearchService service; + + @Inject + public SearchMain(SearchService service) { + this.service = service; + } + + public static void main(String... 
args) { + + init(ServiceId.Search, args); + + Spark.staticFileLocation("/static/search/"); + + Injector injector = Guice.createInjector( + new SearchModule(), + new ConfigurationModule(SearchServiceDescriptors.descriptors, ServiceId.Search), + new DatabaseModule() + ); + + injector.getInstance(SearchMain.class); + injector.getInstance(Initialization.class).setReady(); + + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchModule.java b/services-core/search-service/src/main/java/nu/marginalia/search/SearchModule.java similarity index 53% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchModule.java rename to services-core/search-service/src/main/java/nu/marginalia/search/SearchModule.java index 9db18272..1492c99f 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchModule.java +++ b/services-core/search-service/src/main/java/nu/marginalia/search/SearchModule.java @@ -1,11 +1,11 @@ -package nu.marginalia.wmsa.edge.search; +package nu.marginalia.search; import com.google.inject.AbstractModule; -import nu.marginalia.util.language.conf.LanguageModels; -import nu.marginalia.wmsa.configuration.WebsiteUrl; -import nu.marginalia.wmsa.configuration.WmsaHome; +import nu.marginalia.LanguageModels; +import nu.marginalia.WebsiteUrl; +import nu.marginalia.WmsaHome; -public class EdgeSearchModule extends AbstractModule { +public class SearchModule extends AbstractModule { public void configure() { bind(LanguageModels.class).toInstance(WmsaHome.getLanguageModels()); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchOperator.java b/services-core/search-service/src/main/java/nu/marginalia/search/SearchOperator.java similarity index 53% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchOperator.java rename to services-core/search-service/src/main/java/nu/marginalia/search/SearchOperator.java index 64c8346a..982264e8 100644 --- 
a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchOperator.java +++ b/services-core/search-service/src/main/java/nu/marginalia/search/SearchOperator.java @@ -1,27 +1,25 @@ -package nu.marginalia.wmsa.edge.search; +package nu.marginalia.search; import com.google.inject.Inject; import com.google.inject.Singleton; import io.reactivex.rxjava3.core.Observable; import io.reactivex.rxjava3.schedulers.Schedulers; -import nu.marginalia.wmsa.configuration.server.Context; -import nu.marginalia.wmsa.edge.assistant.client.AssistantClient; -import nu.marginalia.wmsa.edge.assistant.dict.WikiArticles; -import nu.marginalia.wmsa.edge.dbcommon.EdgeDataStoreDao; -import nu.marginalia.wmsa.edge.model.EdgeDomain; -import nu.marginalia.wmsa.edge.model.search.EdgeUrlDetails; -import nu.marginalia.wmsa.edge.search.model.BrowseResult; -import nu.marginalia.wmsa.edge.search.model.DecoratedSearchResults; -import nu.marginalia.wmsa.edge.search.query.QueryFactory; -import nu.marginalia.wmsa.edge.search.query.model.EdgeSearchQuery; -import nu.marginalia.wmsa.edge.search.query.model.EdgeUserSearchParameters; -import nu.marginalia.wmsa.edge.search.svc.EdgeSearchDomainSearchService; -import nu.marginalia.wmsa.edge.search.svc.EdgeSearchQueryIndexService; -import nu.marginalia.wmsa.edge.search.svc.EdgeSearchUnitConversionService; -import nu.marginalia.wmsa.edge.search.svc.EdgeSearchWikiArticlesService; +import nu.marginalia.assistant.client.AssistantClient; +import nu.marginalia.model.EdgeDomain; +import nu.marginalia.model.dbcommon.DbDomainQueries; +import nu.marginalia.search.model.UrlDetails; +import nu.marginalia.client.Context; +import nu.marginalia.search.model.DecoratedSearchResults; +import nu.marginalia.search.query.QueryFactory; +import nu.marginalia.search.query.model.SearchQuery; +import nu.marginalia.search.query.model.UserSearchParameters; +import nu.marginalia.search.svc.SearchQueryIndexService; +import nu.marginalia.search.svc.SearchUnitConversionService; import 
org.apache.logging.log4j.util.Strings; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import org.slf4j.Marker; +import org.slf4j.MarkerFactory; import javax.annotation.Nullable; import java.util.*; @@ -30,71 +28,65 @@ import java.util.concurrent.TimeUnit; import java.util.stream.Collectors; @Singleton -public class EdgeSearchOperator { +public class SearchOperator { + + private static final Logger logger = LoggerFactory.getLogger(SearchOperator.class); + + // Marker for filtering out sensitive content from the persistent logs + private final Marker queryMarker = MarkerFactory.getMarker("QUERY"); - private static final Logger logger = LoggerFactory.getLogger(EdgeSearchOperator.class); private final AssistantClient assistantClient; - private final EdgeDataStoreDao edgeDataStoreDao; + private final DbDomainQueries domainQueries; private final QueryFactory queryFactory; - private final EdgeSearchQueryIndexService searchQueryService; - private final EdgeSearchDomainSearchService domainSearchService; - private final EdgeSearchWikiArticlesService wikiArticlesService; - private final EdgeSearchUnitConversionService edgeSearchUnitConversionService; + private final SearchQueryIndexService searchQueryService; + private final SearchUnitConversionService searchUnitConversionService; @Inject - public EdgeSearchOperator(AssistantClient assistantClient, - EdgeDataStoreDao edgeDataStoreDao, - QueryFactory queryFactory, - - EdgeSearchQueryIndexService searchQueryService, - EdgeSearchDomainSearchService domainSearchService, - EdgeSearchWikiArticlesService wikiArticlesService, - EdgeSearchUnitConversionService edgeSearchUnitConversionService) { + public SearchOperator(AssistantClient assistantClient, + DbDomainQueries domainQueries, + QueryFactory queryFactory, + SearchQueryIndexService searchQueryService, + SearchUnitConversionService searchUnitConversionService) { this.assistantClient = assistantClient; - this.edgeDataStoreDao = edgeDataStoreDao; + this.domainQueries = 
domainQueries; this.queryFactory = queryFactory; this.searchQueryService = searchQueryService; - this.domainSearchService = domainSearchService; - this.wikiArticlesService = wikiArticlesService; - this.edgeSearchUnitConversionService = edgeSearchUnitConversionService; + this.searchUnitConversionService = searchUnitConversionService; } - public List doApiSearch(Context ctx, - EdgeUserSearchParameters params) { + public List doApiSearch(Context ctx, + UserSearchParameters params) { - EdgeSearchQuery processedQuery = queryFactory.createQuery(params); + SearchQuery processedQuery = queryFactory.createQuery(params); - logger.info("Human terms (API): {}", Strings.join(processedQuery.searchTermsHuman, ',')); + logger.info(queryMarker, "Human terms (API): {}", Strings.join(processedQuery.searchTermsHuman, ',')); return searchQueryService.performQuery(ctx, processedQuery); } - public DecoratedSearchResults doSearch(Context ctx, EdgeUserSearchParameters params) { + public DecoratedSearchResults doSearch(Context ctx, UserSearchParameters params) { - Future definitions = wikiArticlesService.getWikiArticle(ctx, params.humanQuery()); - Future eval = edgeSearchUnitConversionService.tryEval(ctx, params.humanQuery()); - EdgeSearchQuery processedQuery = queryFactory.createQuery(params); + Future eval = searchUnitConversionService.tryEval(ctx, params.humanQuery()); + SearchQuery processedQuery = queryFactory.createQuery(params); - logger.info("Human terms: {}", Strings.join(processedQuery.searchTermsHuman, ',')); + logger.info(queryMarker, "Human terms: {}", Strings.join(processedQuery.searchTermsHuman, ',')); - List queryResults = searchQueryService.performQuery(ctx, processedQuery); - List domainResults = domainSearchService.getDomainResults(ctx, processedQuery.specs); + List queryResults = searchQueryService.performQuery(ctx, processedQuery); + + logger.info(queryMarker, "Search Result Count: {}", queryResults.size()); String evalResult = getFutureOrDefault(eval, ""); - 
WikiArticles wikiArticles = getFutureOrDefault(definitions, new WikiArticles()); return DecoratedSearchResults.builder() .params(params) .problems(getProblems(ctx, evalResult, queryResults, processedQuery)) .evalResult(evalResult) - .wiki(wikiArticles) .results(queryResults) - .domainResults(domainResults) .focusDomain(processedQuery.domain) .focusDomainId(getDomainId(processedQuery.domain)) .build(); @@ -117,7 +109,7 @@ public class EdgeSearchOperator { int domainId = -1; try { if (domain != null) { - return edgeDataStoreDao.getDomainId(new EdgeDomain(domain)).id(); + return domainQueries.getDomainId(new EdgeDomain(domain)).id(); } } catch (NoSuchElementException ex) { @@ -126,7 +118,7 @@ public class EdgeSearchOperator { return domainId; } - private List getProblems(Context ctx, String evalResult, List queryResults, EdgeSearchQuery processedQuery) { + private List getProblems(Context ctx, String evalResult, List queryResults, SearchQuery processedQuery) { final List problems = new ArrayList<>(processedQuery.problems); boolean siteSearch = processedQuery.domain != null; @@ -150,7 +142,7 @@ public class EdgeSearchOperator { } - private Iterable spellCheckTerms(Context ctx, EdgeSearchQuery disjointedQuery) { + private Iterable spellCheckTerms(Context ctx, SearchQuery disjointedQuery) { return Observable.fromIterable(disjointedQuery.searchTermsHuman) .subscribeOn(Schedulers.io()) .flatMap(term -> assistantClient.spellCheck(ctx, term) diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchService.java b/services-core/search-service/src/main/java/nu/marginalia/search/SearchService.java similarity index 61% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchService.java rename to services-core/search-service/src/main/java/nu/marginalia/search/SearchService.java index 6483c922..ecb21502 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchService.java +++ 
b/services-core/search-service/src/main/java/nu/marginalia/search/SearchService.java @@ -1,18 +1,18 @@ -package nu.marginalia.wmsa.edge.search; +package nu.marginalia.search; import com.google.gson.Gson; import com.google.inject.Inject; import com.google.inject.name.Named; import lombok.SneakyThrows; -import nu.marginalia.wmsa.client.GsonFactory; -import nu.marginalia.wmsa.configuration.WebsiteUrl; -import nu.marginalia.wmsa.configuration.server.Context; -import nu.marginalia.wmsa.configuration.server.Initialization; -import nu.marginalia.wmsa.configuration.server.MetricsServer; -import nu.marginalia.wmsa.configuration.server.Service; -import nu.marginalia.wmsa.edge.search.command.IndexCommand; -import nu.marginalia.wmsa.edge.search.svc.*; -import nu.marginalia.wmsa.resource_store.StaticResources; +import nu.marginalia.WebsiteUrl; +import nu.marginalia.client.Context; +import nu.marginalia.model.gson.GsonFactory; +import nu.marginalia.search.command.IndexCommand; +import nu.marginalia.search.svc.*; +import nu.marginalia.service.server.Initialization; +import nu.marginalia.service.server.MetricsServer; +import nu.marginalia.service.server.Service; +import nu.marginalia.service.server.StaticResources; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import spark.Request; @@ -22,27 +22,27 @@ import spark.Spark; import java.net.URLEncoder; import java.nio.charset.StandardCharsets; -public class EdgeSearchService extends Service { +public class SearchService extends Service { private final WebsiteUrl websiteUrl; private StaticResources staticResources; - private static final Logger logger = LoggerFactory.getLogger(EdgeSearchService.class); + private static final Logger logger = LoggerFactory.getLogger(SearchService.class); @SneakyThrows @Inject - public EdgeSearchService(@Named("service-host") String ip, - @Named("service-port") Integer port, - Initialization initialization, - MetricsServer metricsServer, - WebsiteUrl websiteUrl, - StaticResources 
staticResources, - IndexCommand indexCommand, - EdgeSearchErrorPageService errorPageService, - EdgeSearchAddToCrawlQueueService addToCrawlQueueService, - EdgeSearchFlagSiteService flagSiteService, - EdgeSearchQueryService searchQueryService, - EdgeSearchApiQueryService apiQueryService + public SearchService(@Named("service-host") String ip, + @Named("service-port") Integer port, + Initialization initialization, + MetricsServer metricsServer, + WebsiteUrl websiteUrl, + StaticResources staticResources, + IndexCommand indexCommand, + SearchErrorPageService errorPageService, + SearchAddToCrawlQueueService addToCrawlQueueService, + SearchFlagSiteService flagSiteService, + SearchQueryService searchQueryService, + SearchApiQueryService apiQueryService ) { super(ip, port, initialization, metricsServer); @@ -79,7 +79,7 @@ public class EdgeSearchService extends Service { private Object serveStatic(Request request, Response response) { String resource = request.params("resource"); - staticResources.serveStatic("edge", resource, request, response); + staticResources.serveStatic("search", resource, request, response); return ""; } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/CommandEvaluator.java b/services-core/search-service/src/main/java/nu/marginalia/search/command/CommandEvaluator.java similarity index 88% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/CommandEvaluator.java rename to services-core/search-service/src/main/java/nu/marginalia/search/command/CommandEvaluator.java index 6d5aa46c..b9cfc852 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/CommandEvaluator.java +++ b/services-core/search-service/src/main/java/nu/marginalia/search/command/CommandEvaluator.java @@ -1,8 +1,8 @@ -package nu.marginalia.wmsa.edge.search.command; +package nu.marginalia.search.command; import com.google.inject.Inject; -import nu.marginalia.wmsa.configuration.server.Context; -import 
nu.marginalia.wmsa.edge.search.command.commands.*; +import nu.marginalia.search.command.commands.*; +import nu.marginalia.client.Context; import java.util.ArrayList; import java.util.List; diff --git a/services-core/search-service/src/main/java/nu/marginalia/search/command/IndexCommand.java b/services-core/search-service/src/main/java/nu/marginalia/search/command/IndexCommand.java new file mode 100644 index 00000000..83ed4a64 --- /dev/null +++ b/services-core/search-service/src/main/java/nu/marginalia/search/command/IndexCommand.java @@ -0,0 +1,29 @@ +package nu.marginalia.search.command; + +import com.google.inject.Inject; +import com.google.inject.Singleton; +import nu.marginalia.browse.model.BrowseResultSet; +import nu.marginalia.renderer.MustacheRenderer; +import nu.marginalia.renderer.RendererFactory; +import spark.Request; +import spark.Response; + +import java.io.IOException; +import java.util.Collections; + +@Singleton +public class IndexCommand { + + private final MustacheRenderer template; + @Inject + public IndexCommand(RendererFactory rendererFactory) throws IOException { + + template = rendererFactory.renderer("search/index"); + } + + public String render(Request request, Response response) { + response.header("Cache-control", "public,max-age=3600"); + + return template.render(new BrowseResultSet(Collections.emptyList())); + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/SearchCommandInterface.java b/services-core/search-service/src/main/java/nu/marginalia/search/command/SearchCommandInterface.java similarity index 60% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/SearchCommandInterface.java rename to services-core/search-service/src/main/java/nu/marginalia/search/command/SearchCommandInterface.java index 8334b03d..3e2304b7 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/SearchCommandInterface.java +++ 
b/services-core/search-service/src/main/java/nu/marginalia/search/command/SearchCommandInterface.java @@ -1,6 +1,7 @@ -package nu.marginalia.wmsa.edge.search.command; +package nu.marginalia.search.command; -import nu.marginalia.wmsa.configuration.server.Context; + +import nu.marginalia.client.Context; import java.util.Optional; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/SearchJsParameter.java b/services-core/search-service/src/main/java/nu/marginalia/search/command/SearchJsParameter.java similarity index 88% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/SearchJsParameter.java rename to services-core/search-service/src/main/java/nu/marginalia/search/command/SearchJsParameter.java index f42b3525..0efa224d 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/SearchJsParameter.java +++ b/services-core/search-service/src/main/java/nu/marginalia/search/command/SearchJsParameter.java @@ -1,6 +1,6 @@ -package nu.marginalia.wmsa.edge.search.command; +package nu.marginalia.search.command; -import nu.marginalia.wmsa.edge.model.search.EdgeSearchSubquery; +import nu.marginalia.index.client.model.query.EdgeSearchSubquery; import javax.annotation.Nullable; import java.util.Arrays; diff --git a/services-core/search-service/src/main/java/nu/marginalia/search/command/SearchParameters.java b/services-core/search-service/src/main/java/nu/marginalia/search/command/SearchParameters.java new file mode 100644 index 00000000..58c64dd6 --- /dev/null +++ b/services-core/search-service/src/main/java/nu/marginalia/search/command/SearchParameters.java @@ -0,0 +1,9 @@ +package nu.marginalia.search.command; + +import nu.marginalia.search.model.SearchProfile; + +public record SearchParameters(SearchProfile profile, SearchJsParameter js, boolean detailedResults) { + public String profileStr() { + return profile.name; + } +} diff --git 
a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/BangCommand.java b/services-core/search-service/src/main/java/nu/marginalia/search/command/commands/BangCommand.java similarity index 85% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/BangCommand.java rename to services-core/search-service/src/main/java/nu/marginalia/search/command/commands/BangCommand.java index afb22d1a..5153dea8 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/BangCommand.java +++ b/services-core/search-service/src/main/java/nu/marginalia/search/command/commands/BangCommand.java @@ -1,10 +1,10 @@ -package nu.marginalia.wmsa.edge.search.command.commands; +package nu.marginalia.search.command.commands; import com.google.inject.Inject; -import nu.marginalia.wmsa.configuration.server.Context; -import nu.marginalia.wmsa.edge.search.command.SearchCommandInterface; -import nu.marginalia.wmsa.edge.search.command.SearchParameters; -import nu.marginalia.wmsa.edge.search.exceptions.RedirectException; +import nu.marginalia.search.command.SearchCommandInterface; +import nu.marginalia.search.command.SearchParameters; +import nu.marginalia.client.Context; +import nu.marginalia.search.exceptions.RedirectException; import java.net.URLEncoder; import java.nio.charset.StandardCharsets; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/BrowseCommand.java b/services-core/search-service/src/main/java/nu/marginalia/search/command/commands/BrowseCommand.java similarity index 61% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/BrowseCommand.java rename to services-core/search-service/src/main/java/nu/marginalia/search/command/commands/BrowseCommand.java index e1d256d8..b6f009da 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/BrowseCommand.java +++ 
b/services-core/search-service/src/main/java/nu/marginalia/search/command/commands/BrowseCommand.java @@ -1,17 +1,20 @@ -package nu.marginalia.wmsa.edge.search.command.commands; +package nu.marginalia.search.command.commands; import com.google.inject.Inject; -import nu.marginalia.wmsa.configuration.server.Context; -import nu.marginalia.wmsa.edge.dbcommon.EdgeDataStoreDao; -import nu.marginalia.wmsa.edge.dbcommon.EdgeDomainBlacklist; -import nu.marginalia.wmsa.edge.model.EdgeDomain; -import nu.marginalia.wmsa.edge.search.command.SearchCommandInterface; -import nu.marginalia.wmsa.edge.search.command.SearchParameters; -import nu.marginalia.wmsa.edge.search.model.BrowseResult; -import nu.marginalia.wmsa.edge.search.model.BrowseResultSet; -import nu.marginalia.wmsa.edge.search.results.BrowseResultCleaner; -import nu.marginalia.wmsa.renderer.mustache.MustacheRenderer; -import nu.marginalia.wmsa.renderer.mustache.RendererFactory; +import nu.marginalia.browse.DbBrowseDomainsRandom; +import nu.marginalia.browse.DbBrowseDomainsSimilarCosine; +import nu.marginalia.browse.DbBrowseDomainsSimilarOldAlgo; +import nu.marginalia.browse.model.BrowseResult; +import nu.marginalia.browse.model.BrowseResultSet; +import nu.marginalia.model.EdgeDomain; +import nu.marginalia.model.dbcommon.DbDomainQueries; +import nu.marginalia.model.dbcommon.EdgeDomainBlacklist; +import nu.marginalia.search.command.SearchCommandInterface; +import nu.marginalia.search.command.SearchParameters; +import nu.marginalia.search.results.BrowseResultCleaner; +import nu.marginalia.client.Context; +import nu.marginalia.renderer.MustacheRenderer; +import nu.marginalia.renderer.RendererFactory; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -21,7 +24,10 @@ import java.util.function.Predicate; import java.util.regex.Pattern; public class BrowseCommand implements SearchCommandInterface { - private final EdgeDataStoreDao edgeDataStoreDao; + private final DbBrowseDomainsRandom randomDomains; + private final 
DbBrowseDomainsSimilarCosine similarDomains; + private final DbBrowseDomainsSimilarOldAlgo similarDomainsOld; + private final DbDomainQueries domainQueries; private final EdgeDomainBlacklist blacklist; private final MustacheRenderer browseResultsRenderer; private final BrowseResultCleaner browseResultCleaner; @@ -29,17 +35,22 @@ public class BrowseCommand implements SearchCommandInterface { private final Predicate queryPatternPredicate = Pattern.compile("^browse:[.A-Za-z\\-0-9:]+$").asPredicate(); @Inject - public BrowseCommand(EdgeDataStoreDao edgeDataStoreDao, + public BrowseCommand(DbBrowseDomainsRandom randomDomains, + DbBrowseDomainsSimilarCosine similarDomains, + DbBrowseDomainsSimilarOldAlgo similarDomainsOld, DbDomainQueries domainQueries, EdgeDomainBlacklist blacklist, RendererFactory rendererFactory, BrowseResultCleaner browseResultCleaner) throws IOException { - this.edgeDataStoreDao = edgeDataStoreDao; + this.randomDomains = randomDomains; + this.similarDomains = similarDomains; + this.similarDomainsOld = similarDomainsOld; + this.domainQueries = domainQueries; this.blacklist = blacklist; this.browseResultCleaner = browseResultCleaner; - browseResultsRenderer = rendererFactory.renderer("edge/browse-results"); + browseResultsRenderer = rendererFactory.renderer("search/browse-results"); } @Override @@ -76,7 +87,7 @@ public class BrowseCommand implements SearchCommandInterface { } private BrowseResultSet getRandomEntries(int set) { - var results = edgeDataStoreDao.getRandomDomains(25, blacklist, set); + var results = randomDomains.getRandomDomains(25, blacklist, set); results.removeIf(browseResultCleaner.shouldRemoveResultPredicate()); @@ -84,15 +95,15 @@ public class BrowseCommand implements SearchCommandInterface { } private BrowseResultSet getRelatedEntries(String word) { - var domain = edgeDataStoreDao.getDomainId(new EdgeDomain(word)); + var domain = domainQueries.getDomainId(new EdgeDomain(word)); - var neighbors = 
edgeDataStoreDao.getDomainNeighborsAdjacentCosine(domain, blacklist, 256); + var neighbors = similarDomains.getDomainNeighborsAdjacentCosine(domain, blacklist, 256); neighbors.removeIf(browseResultCleaner.shouldRemoveResultPredicate()); // If the results are very few, supplement with the alternative shitty algorithm if (neighbors.size() < 25) { Set allNeighbors = new HashSet<>(neighbors); - allNeighbors.addAll(edgeDataStoreDao.getDomainNeighborsAdjacent(domain, blacklist, 50)); + allNeighbors.addAll(similarDomainsOld.getDomainNeighborsAdjacent(domain, blacklist, 50)); neighbors.clear(); neighbors.addAll(allNeighbors); diff --git a/services-core/search-service/src/main/java/nu/marginalia/search/command/commands/ConvertCommand.java b/services-core/search-service/src/main/java/nu/marginalia/search/command/commands/ConvertCommand.java new file mode 100644 index 00000000..d022efac --- /dev/null +++ b/services-core/search-service/src/main/java/nu/marginalia/search/command/commands/ConvertCommand.java @@ -0,0 +1,35 @@ +package nu.marginalia.search.command.commands; + +import com.google.inject.Inject; +import nu.marginalia.search.command.SearchCommandInterface; +import nu.marginalia.search.command.SearchParameters; +import nu.marginalia.search.svc.SearchUnitConversionService; +import nu.marginalia.client.Context; +import nu.marginalia.renderer.MustacheRenderer; +import nu.marginalia.renderer.RendererFactory; + +import java.io.IOException; +import java.util.Map; +import java.util.Optional; + +public class ConvertCommand implements SearchCommandInterface { + private final SearchUnitConversionService searchUnitConversionService; + private final MustacheRenderer> conversionRenderer; + + @Inject + public ConvertCommand(SearchUnitConversionService searchUnitConversionService, RendererFactory rendererFactory) throws IOException { + this.searchUnitConversionService = searchUnitConversionService; + + conversionRenderer = rendererFactory.renderer("search/conversion-results"); + } + 
+ @Override + public Optional process(Context ctx, SearchParameters parameters, String query) { + var conversion = searchUnitConversionService.tryConversion(ctx, query); + if (conversion.isEmpty()) { + return Optional.empty(); + } + + return Optional.of(conversionRenderer.render(Map.of("query", query, "result", conversion.get(), "profile", parameters.profileStr()))); + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/DefinitionCommand.java b/services-core/search-service/src/main/java/nu/marginalia/search/command/commands/DefinitionCommand.java similarity index 74% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/DefinitionCommand.java rename to services-core/search-service/src/main/java/nu/marginalia/search/command/commands/DefinitionCommand.java index d166fe30..efd99447 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/DefinitionCommand.java +++ b/services-core/search-service/src/main/java/nu/marginalia/search/command/commands/DefinitionCommand.java @@ -1,15 +1,15 @@ -package nu.marginalia.wmsa.edge.search.command.commands; +package nu.marginalia.search.command.commands; import com.google.inject.Inject; import lombok.SneakyThrows; -import nu.marginalia.wmsa.configuration.server.Context; -import nu.marginalia.wmsa.edge.assistant.client.AssistantClient; -import nu.marginalia.wmsa.edge.assistant.dict.DictionaryResponse; -import nu.marginalia.wmsa.edge.search.command.SearchCommandInterface; -import nu.marginalia.wmsa.edge.search.command.SearchParameters; -import nu.marginalia.wmsa.renderer.mustache.MustacheRenderer; -import nu.marginalia.wmsa.renderer.mustache.RendererFactory; +import nu.marginalia.assistant.client.AssistantClient; +import nu.marginalia.assistant.client.model.DictionaryResponse; +import nu.marginalia.client.Context; +import nu.marginalia.search.command.SearchCommandInterface; +import nu.marginalia.search.command.SearchParameters; 
+import nu.marginalia.renderer.MustacheRenderer; +import nu.marginalia.renderer.RendererFactory; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -33,7 +33,7 @@ public class DefinitionCommand implements SearchCommandInterface { throws IOException { - dictionaryRenderer = rendererFactory.renderer("edge/dictionary-results"); + dictionaryRenderer = rendererFactory.renderer("search/dictionary-results"); this.assistantClient = assistantClient; } diff --git a/services-core/search-service/src/main/java/nu/marginalia/search/command/commands/SearchCommand.java b/services-core/search-service/src/main/java/nu/marginalia/search/command/commands/SearchCommand.java new file mode 100644 index 00000000..63f12470 --- /dev/null +++ b/services-core/search-service/src/main/java/nu/marginalia/search/command/commands/SearchCommand.java @@ -0,0 +1,51 @@ +package nu.marginalia.search.command.commands; + +import com.google.inject.Inject; +import nu.marginalia.client.Context; +import nu.marginalia.model.dbcommon.DbDomainQueries; +import nu.marginalia.model.dbcommon.EdgeDomainBlacklist; +import nu.marginalia.search.SearchOperator; +import nu.marginalia.search.command.SearchCommandInterface; +import nu.marginalia.search.command.SearchParameters; +import nu.marginalia.search.model.DecoratedSearchResults; +import nu.marginalia.search.model.UrlDetails; +import nu.marginalia.search.query.model.UserSearchParameters; +import nu.marginalia.search.results.BrowseResultCleaner; +import nu.marginalia.renderer.MustacheRenderer; +import nu.marginalia.renderer.RendererFactory; + +import java.io.IOException; +import java.util.Optional; + +public class SearchCommand implements SearchCommandInterface { + private final EdgeDomainBlacklist blacklist; + private final SearchOperator searchOperator; + private final MustacheRenderer searchResultsRenderer; + + + @Inject + public SearchCommand(EdgeDomainBlacklist blacklist, + SearchOperator searchOperator, + RendererFactory rendererFactory + ) throws 
IOException { + this.blacklist = blacklist; + this.searchOperator = searchOperator; + + searchResultsRenderer = rendererFactory.renderer("search/search-results"); + } + + @Override + public Optional process(Context ctx, SearchParameters parameters, String query) { + UserSearchParameters params = new UserSearchParameters(query, parameters.profile(), parameters.js()); + + DecoratedSearchResults results = searchOperator.doSearch(ctx, params); + + results.results.removeIf(this::isBlacklisted); + + return Optional.of(searchResultsRenderer.render(results)); + } + + private boolean isBlacklisted(UrlDetails details) { + return blacklist.isBlacklisted(details.domainId); + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/SiteListCommand.java b/services-core/search-service/src/main/java/nu/marginalia/search/command/commands/SiteListCommand.java similarity index 66% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/SiteListCommand.java rename to services-core/search-service/src/main/java/nu/marginalia/search/command/commands/SiteListCommand.java index 11a564b4..8bce97d1 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/SiteListCommand.java +++ b/services-core/search-service/src/main/java/nu/marginalia/search/command/commands/SiteListCommand.java @@ -1,17 +1,17 @@ -package nu.marginalia.wmsa.edge.search.command.commands; +package nu.marginalia.search.command.commands; import com.google.inject.Inject; -import nu.marginalia.wmsa.configuration.server.Context; -import nu.marginalia.wmsa.edge.dbcommon.EdgeDataStoreDao; -import nu.marginalia.wmsa.edge.model.search.EdgeUrlDetails; -import nu.marginalia.wmsa.edge.search.command.SearchCommandInterface; -import nu.marginalia.wmsa.edge.search.command.SearchParameters; -import nu.marginalia.wmsa.edge.search.model.DomainInformation; -import nu.marginalia.wmsa.edge.search.model.EdgeSearchProfile; -import 
nu.marginalia.wmsa.edge.search.siteinfo.DomainInformationService; -import nu.marginalia.wmsa.edge.search.svc.EdgeSearchQueryIndexService; -import nu.marginalia.wmsa.renderer.mustache.MustacheRenderer; -import nu.marginalia.wmsa.renderer.mustache.RendererFactory; +import nu.marginalia.model.dbcommon.DbDomainQueries; +import nu.marginalia.search.model.UrlDetails; +import nu.marginalia.search.command.SearchCommandInterface; +import nu.marginalia.search.command.SearchParameters; +import nu.marginalia.search.model.DomainInformation; +import nu.marginalia.search.model.SearchProfile; +import nu.marginalia.search.siteinfo.DomainInformationService; +import nu.marginalia.search.svc.SearchQueryIndexService; +import nu.marginalia.client.Context; +import nu.marginalia.renderer.MustacheRenderer; +import nu.marginalia.renderer.RendererFactory; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -22,9 +22,9 @@ import java.util.function.Predicate; import java.util.regex.Pattern; public class SiteListCommand implements SearchCommandInterface { - private final EdgeDataStoreDao dataStoreDao; + private final DbDomainQueries domainQueries; private final DomainInformationService domainInformationService; - private final EdgeSearchQueryIndexService searchQueryIndexService; + private final SearchQueryIndexService searchQueryIndexService; private final Logger logger = LoggerFactory.getLogger(getClass()); private final MustacheRenderer siteInfoRenderer; @@ -34,15 +34,15 @@ public class SiteListCommand implements SearchCommandInterface { @Inject public SiteListCommand( DomainInformationService domainInformationService, - EdgeDataStoreDao dataStoreDao, + DbDomainQueries domainQueries, RendererFactory rendererFactory, - EdgeSearchQueryIndexService searchQueryIndexService) + SearchQueryIndexService searchQueryIndexService) throws IOException { - this.dataStoreDao = dataStoreDao; + this.domainQueries = domainQueries; this.domainInformationService = domainInformationService; - 
siteInfoRenderer = rendererFactory.renderer("edge/site-info"); + siteInfoRenderer = rendererFactory.renderer("search/site-info"); this.searchQueryIndexService = searchQueryIndexService; } @@ -55,12 +55,12 @@ public class SiteListCommand implements SearchCommandInterface { var results = siteInfo(ctx, query); var domain = results.getDomain(); - List resultSet; + List resultSet; Path screenshotPath = null; Integer domainId = -1; if (null != domain) { - resultSet = searchQueryIndexService.performDumbQuery(ctx, EdgeSearchProfile.CORPO, 100, 100, "site:"+domain); - domainId = dataStoreDao.getDomainId(domain).id(); + resultSet = searchQueryIndexService.performDumbQuery(ctx, SearchProfile.CORPO, 100, 100, "site:"+domain); + domainId = domainQueries.getDomainId(domain).id(); screenshotPath = Path.of("/screenshot/" + domainId); } else { diff --git a/services-core/search-service/src/main/java/nu/marginalia/search/db/DbUrlDetailsQuery.java b/services-core/search-service/src/main/java/nu/marginalia/search/db/DbUrlDetailsQuery.java new file mode 100644 index 00000000..25bec7d4 --- /dev/null +++ b/services-core/search-service/src/main/java/nu/marginalia/search/db/DbUrlDetailsQuery.java @@ -0,0 +1,111 @@ +package nu.marginalia.search.db; + +import com.google.common.base.Strings; +import com.google.common.cache.Cache; +import com.google.common.cache.CacheBuilder; +import com.google.inject.Inject; +import com.zaxxer.hikari.HikariDataSource; +import lombok.SneakyThrows; +import nu.marginalia.model.EdgeUrl; +import nu.marginalia.model.crawl.EdgeDomainIndexingState; +import nu.marginalia.model.id.EdgeId; +import nu.marginalia.model.id.EdgeIdCollection; +import nu.marginalia.search.model.PageScoreAdjustment; +import nu.marginalia.search.model.UrlDetails; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.*; + + +public class DbUrlDetailsQuery { + private final HikariDataSource dataSource; + private final Logger logger = LoggerFactory.getLogger(getClass()); + 
+ private final Cache> urlIdCache = CacheBuilder.newBuilder().maximumSize(100_000).build(); + + public static double QUALITY_LOWER_BOUND_CUTOFF = -15.; + @Inject + public DbUrlDetailsQuery(HikariDataSource dataSource) + { + this.dataSource = dataSource; + } + + + public synchronized void clearCaches() + { + urlIdCache.invalidateAll(); + } + + private String idList(EdgeIdCollection ids) { + StringJoiner j = new StringJoiner(",", "(", ")"); + for (var id : ids.values()) { + j.add(Integer.toString(id)); + } + return j.toString(); + } + + @SneakyThrows + public List getUrlDetailsMulti(EdgeIdCollection ids) { + if (ids.isEmpty()) { + return Collections.emptyList(); + } + List result = new ArrayList<>(ids.size()); + + try (var connection = dataSource.getConnection()) { + + String idString = idList(ids); + + try (var stmt = connection.prepareStatement( + """ + SELECT ID, DOMAIN_ID, URL, + TITLE, DESCRIPTION, + QUALITY, + WORDS_TOTAL, FORMAT, FEATURES, + IP, DOMAIN_STATE, + DATA_HASH + FROM EC_URL_VIEW + WHERE TITLE IS NOT NULL + AND ID IN + """ + idString)) { + stmt.setFetchSize(ids.size()); + + var rsp = stmt.executeQuery(); + while (rsp.next()) { + var val = new UrlDetails(rsp.getInt(1), + rsp.getInt(2), + new EdgeUrl(rsp.getString(3)), + rsp.getString(4), // title + rsp.getString(5), // description + rsp.getDouble(6), // quality + rsp.getInt(7), // wordsTotal + rsp.getString(8), // format + rsp.getInt(9), // features + rsp.getString(10), // ip + EdgeDomainIndexingState.valueOf(rsp.getString(11)), // domainState + rsp.getLong(12), // dataHash + PageScoreAdjustment.zero(), // urlQualityAdjustment + Integer.MAX_VALUE, // rankingId + Double.MAX_VALUE, // termScore + 1, // resultsFromSameDomain + "", // positions + null // result item + ); + if (val.urlQuality <= QUALITY_LOWER_BOUND_CUTOFF + && Strings.isNullOrEmpty(val.description) + && val.url.path.length() > 1) { + continue; + } + result.add(val); + + } + } + } + + return result; + } + + + + +} diff --git 
a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/exceptions/RedirectException.java b/services-core/search-service/src/main/java/nu/marginalia/search/exceptions/RedirectException.java similarity index 84% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/exceptions/RedirectException.java rename to services-core/search-service/src/main/java/nu/marginalia/search/exceptions/RedirectException.java index fc551964..eb04a4cb 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/exceptions/RedirectException.java +++ b/services-core/search-service/src/main/java/nu/marginalia/search/exceptions/RedirectException.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.search.exceptions; +package nu.marginalia.search.exceptions; public class RedirectException extends RuntimeException { public final String newUrl; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/model/DecoratedSearchResults.java b/services-core/search-service/src/main/java/nu/marginalia/search/model/DecoratedSearchResults.java similarity index 56% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/model/DecoratedSearchResults.java rename to services-core/search-service/src/main/java/nu/marginalia/search/model/DecoratedSearchResults.java index 3d4acda8..b841a488 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/model/DecoratedSearchResults.java +++ b/services-core/search-service/src/main/java/nu/marginalia/search/model/DecoratedSearchResults.java @@ -1,23 +1,20 @@ -package nu.marginalia.wmsa.edge.search.model; +package nu.marginalia.search.model; import lombok.AllArgsConstructor; import lombok.Builder; import lombok.Getter; -import nu.marginalia.wmsa.edge.assistant.dict.WikiArticles; -import nu.marginalia.wmsa.edge.model.search.EdgeUrlDetails; -import nu.marginalia.wmsa.edge.search.query.model.EdgeUserSearchParameters; +import nu.marginalia.browse.model.BrowseResult; +import 
nu.marginalia.search.query.model.UserSearchParameters; import java.util.List; @AllArgsConstructor @Getter @Builder public class DecoratedSearchResults { - private final EdgeUserSearchParameters params; + private final UserSearchParameters params; private final List problems; private final String evalResult; - private final WikiArticles wiki; - public final List results; - public final List domainResults; + public final List results; private final String focusDomain; private final int focusDomainId; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/model/DomainInformation.java b/services-core/search-service/src/main/java/nu/marginalia/search/model/DomainInformation.java similarity index 82% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/model/DomainInformation.java rename to services-core/search-service/src/main/java/nu/marginalia/search/model/DomainInformation.java index 949c9e5f..85bb438c 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/model/DomainInformation.java +++ b/services-core/search-service/src/main/java/nu/marginalia/search/model/DomainInformation.java @@ -1,7 +1,7 @@ -package nu.marginalia.wmsa.edge.search.model; +package nu.marginalia.search.model; import lombok.*; -import nu.marginalia.wmsa.edge.model.EdgeDomain; +import nu.marginalia.model.EdgeDomain; import java.util.List; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgePageScoreAdjustment.java b/services-core/search-service/src/main/java/nu/marginalia/search/model/PageScoreAdjustment.java similarity index 71% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgePageScoreAdjustment.java rename to services-core/search-service/src/main/java/nu/marginalia/search/model/PageScoreAdjustment.java index e0cda818..7bff0296 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgePageScoreAdjustment.java +++ 
b/services-core/search-service/src/main/java/nu/marginalia/search/model/PageScoreAdjustment.java @@ -1,11 +1,11 @@ -package nu.marginalia.wmsa.edge.model.search; +package nu.marginalia.search.model; import lombok.Builder; import lombok.Getter; @Getter @Builder -public class EdgePageScoreAdjustment { +public class PageScoreAdjustment { final double titleAdj; final double titleFullHit; final double urlAdj; @@ -13,8 +13,8 @@ public class EdgePageScoreAdjustment { final double descAdj; final double descHitsAdj; - private static final EdgePageScoreAdjustment zero = new EdgePageScoreAdjustment(0,0, 0,0,0, 0); - public static EdgePageScoreAdjustment zero() { + private static final PageScoreAdjustment zero = new PageScoreAdjustment(0,0, 0,0,0, 0); + public static PageScoreAdjustment zero() { return zero; } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/model/EdgeSearchProfile.java b/services-core/search-service/src/main/java/nu/marginalia/search/model/SearchProfile.java similarity index 80% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/model/EdgeSearchProfile.java rename to services-core/search-service/src/main/java/nu/marginalia/search/model/SearchProfile.java index d04f7bd6..0b116986 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/model/EdgeSearchProfile.java +++ b/services-core/search-service/src/main/java/nu/marginalia/search/model/SearchProfile.java @@ -1,13 +1,13 @@ -package nu.marginalia.wmsa.edge.search.model; +package nu.marginalia.search.model; -import nu.marginalia.wmsa.edge.converting.processor.logic.HtmlFeature; -import nu.marginalia.wmsa.edge.index.svc.searchset.SearchSetIdentifier; -import nu.marginalia.wmsa.edge.model.search.EdgeSearchSubquery; -import nu.marginalia.wmsa.edge.model.search.domain.SpecificationLimit; +import nu.marginalia.index.query.limit.SpecificationLimit; +import nu.marginalia.model.crawl.HtmlFeature; +import 
nu.marginalia.index.client.model.query.EdgeSearchSubquery; +import nu.marginalia.index.client.model.query.SearchSetIdentifier; import java.util.Objects; -public enum EdgeSearchProfile { +public enum SearchProfile { DEFAULT("default", SearchSetIdentifier.RETRO), MODERN("modern", SearchSetIdentifier.SMALLWEB), @@ -27,13 +27,13 @@ public enum EdgeSearchProfile { public final String name; public final SearchSetIdentifier searchSetIdentifier; - EdgeSearchProfile(String name, SearchSetIdentifier searchSetIdentifier) { + SearchProfile(String name, SearchSetIdentifier searchSetIdentifier) { this.name = name; this.searchSetIdentifier = searchSetIdentifier; } - private final static EdgeSearchProfile[] values = values(); - public static EdgeSearchProfile getSearchProfile(String param) { + private final static SearchProfile[] values = values(); + public static SearchProfile getSearchProfile(String param) { if (null == param) { return YOLO; } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/model/EdgeSearchRankingSymbols.java b/services-core/search-service/src/main/java/nu/marginalia/search/model/SearchRankingSymbols.java similarity index 92% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/model/EdgeSearchRankingSymbols.java rename to services-core/search-service/src/main/java/nu/marginalia/search/model/SearchRankingSymbols.java index 989db072..e395c228 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/model/EdgeSearchRankingSymbols.java +++ b/services-core/search-service/src/main/java/nu/marginalia/search/model/SearchRankingSymbols.java @@ -1,8 +1,8 @@ -package nu.marginalia.wmsa.edge.search.model; +package nu.marginalia.search.model; import java.util.TreeMap; -public class EdgeSearchRankingSymbols { +public class SearchRankingSymbols { private static final TreeMap symbols; static { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeUrlDetails.java 
b/services-core/search-service/src/main/java/nu/marginalia/search/model/UrlDetails.java similarity index 91% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeUrlDetails.java rename to services-core/search-service/src/main/java/nu/marginalia/search/model/UrlDetails.java index bd1e7ade..06d077a4 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeUrlDetails.java +++ b/services-core/search-service/src/main/java/nu/marginalia/search/model/UrlDetails.java @@ -1,17 +1,19 @@ -package nu.marginalia.wmsa.edge.model.search; +package nu.marginalia.search.model; import lombok.*; -import nu.marginalia.wmsa.edge.converting.processor.logic.HtmlFeature; -import nu.marginalia.wmsa.edge.model.EdgeUrl; -import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState; +import nu.marginalia.index.client.model.results.EdgeSearchResultItem; +import nu.marginalia.model.EdgeUrl; +import nu.marginalia.model.crawl.EdgeDomainIndexingState; +import nu.marginalia.model.crawl.HtmlFeature; import java.util.EnumSet; import java.util.Objects; import java.util.StringJoiner; @AllArgsConstructor @NoArgsConstructor @With @Getter @ToString -public class EdgeUrlDetails { +public class UrlDetails { public int id; + public int domainId; public EdgeUrl url; public String title; public String description; @@ -25,9 +27,9 @@ public class EdgeUrlDetails { public String ip; public EdgeDomainIndexingState domainState; - public int dataHash; + public long dataHash; - public EdgePageScoreAdjustment urlQualityAdjustment; + public PageScoreAdjustment urlQualityAdjustment; public long rankingId; public double termScore; @@ -86,8 +88,8 @@ public class EdgeUrlDetails { if (other == this) { return true; } - if (other instanceof EdgeUrlDetails) { - return ((EdgeUrlDetails) other).id == id; + if (other instanceof UrlDetails) { + return ((UrlDetails) other).id == id; } return false; } diff --git 
a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/NearQueryProcessor.java b/services-core/search-service/src/main/java/nu/marginalia/search/query/NearQueryProcessor.java similarity index 97% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/NearQueryProcessor.java rename to services-core/search-service/src/main/java/nu/marginalia/search/query/NearQueryProcessor.java index 60b49d37..7a5f6025 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/NearQueryProcessor.java +++ b/services-core/search-service/src/main/java/nu/marginalia/search/query/NearQueryProcessor.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.search.query; +package nu.marginalia.search.query; import com.google.inject.Inject; import com.zaxxer.hikari.HikariDataSource; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/QueryFactory.java b/services-core/search-service/src/main/java/nu/marginalia/search/query/QueryFactory.java similarity index 84% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/QueryFactory.java rename to services-core/search-service/src/main/java/nu/marginalia/search/query/QueryFactory.java index 952e0fb2..7ca0a0c5 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/QueryFactory.java +++ b/services-core/search-service/src/main/java/nu/marginalia/search/query/QueryFactory.java @@ -1,19 +1,25 @@ -package nu.marginalia.wmsa.edge.search.query; +package nu.marginalia.search.query; import com.google.inject.Inject; import com.google.inject.Singleton; -import nu.marginalia.util.language.WordPatterns; -import nu.marginalia.util.language.conf.LanguageModels; -import nu.marginalia.wmsa.edge.assistant.dict.NGramBloomFilter; -import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict; -import nu.marginalia.wmsa.edge.index.model.QueryLimits; -import nu.marginalia.wmsa.edge.index.model.QueryStrategy; -import 
nu.marginalia.wmsa.edge.model.search.EdgeSearchSpecification; -import nu.marginalia.wmsa.edge.model.search.EdgeSearchSubquery; -import nu.marginalia.wmsa.edge.model.search.domain.SpecificationLimit; -import nu.marginalia.wmsa.edge.search.query.model.EdgeSearchQuery; -import nu.marginalia.wmsa.edge.search.query.model.EdgeUserSearchParameters; -import nu.marginalia.wmsa.edge.search.valuation.SearchResultValuator; +import nu.marginalia.LanguageModels; +import nu.marginalia.index.client.model.query.EdgeSearchSpecification; +import nu.marginalia.index.client.model.query.EdgeSearchSubquery; +import nu.marginalia.index.query.limit.QueryLimits; +import nu.marginalia.index.query.limit.QueryStrategy; +import nu.marginalia.index.query.limit.SpecificationLimit; +import nu.marginalia.language.statistics.EnglishDictionary; +import nu.marginalia.language.statistics.NGramBloomFilter; +import nu.marginalia.language.statistics.TermFrequencyDict; +import nu.marginalia.query_parser.QueryParser; +import nu.marginalia.query_parser.QueryPermutation; +import nu.marginalia.query_parser.QueryVariants; +import nu.marginalia.query_parser.token.Token; +import nu.marginalia.query_parser.token.TokenType; +import nu.marginalia.search.query.model.SearchQuery; +import nu.marginalia.search.query.model.UserSearchParameters; +import nu.marginalia.language.WordPatterns; +import nu.marginalia.search.valuation.SearchResultValuator; import org.eclipse.jetty.http.HttpStatus; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -32,6 +38,9 @@ public class QueryFactory { private static final int RETAIN_QUERY_VARIANT_COUNT = 5; private final ThreadLocal queryVariants; + private final QueryParser queryParser = new QueryParser(); + + @Inject public QueryFactory(LanguageModels lm, TermFrequencyDict dict, @@ -48,11 +57,15 @@ public class QueryFactory { } public QueryParser getParser() { - return new QueryParser(englishDictionary, queryVariants.get()); + return new QueryParser(); } - public EdgeSearchQuery 
createQuery(EdgeUserSearchParameters params) { - final var processedQuery = createQuery(getParser(), params); + public QueryPermutation getQueryPermutation() { + return new QueryPermutation(queryVariants.get()); + } + + public SearchQuery createQuery(UserSearchParameters params) { + final var processedQuery = createQuery(getQueryPermutation(), params); final List subqueries = processedQuery.specs.subqueries; for (var sq : subqueries) { @@ -71,8 +84,8 @@ public class QueryFactory { } } - public EdgeSearchQuery createQuery(QueryParser queryParser, - EdgeUserSearchParameters params) + public SearchQuery createQuery(QueryPermutation queryPermutation, + UserSearchParameters params) { final var query = params.humanQuery(); final var profile = params.profile(); @@ -125,7 +138,7 @@ public class QueryFactory { } } - var queryPermutations = queryParser.permuteQueriesNew(basicQuery); + var queryPermutations = queryPermutation.permuteQueriesNew(basicQuery); List subqueries = new ArrayList<>(); String near = profile.getNearDomain(); @@ -212,7 +225,7 @@ public class QueryFactory { EdgeSearchSpecification specs = specsBuilder.build(); - return new EdgeSearchQuery(specs, searchTermsHuman, domain); + return new SearchQuery(specs, searchTermsHuman, domain); } private SpecificationLimit parseSpecificationLimit(String str) { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/model/EdgeSearchQuery.java b/services-core/search-service/src/main/java/nu/marginalia/search/query/model/SearchQuery.java similarity index 72% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/model/EdgeSearchQuery.java rename to services-core/search-service/src/main/java/nu/marginalia/search/query/model/SearchQuery.java index 2e3a246c..615c888e 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/model/EdgeSearchQuery.java +++ b/services-core/search-service/src/main/java/nu/marginalia/search/query/model/SearchQuery.java @@ -1,19 +1,19 
@@ -package nu.marginalia.wmsa.edge.search.query.model; +package nu.marginalia.search.query.model; import lombok.AllArgsConstructor; -import nu.marginalia.wmsa.edge.model.search.EdgeSearchSpecification; +import nu.marginalia.index.client.model.query.EdgeSearchSpecification; import java.util.*; @AllArgsConstructor -public class EdgeSearchQuery { +public class SearchQuery { public final EdgeSearchSpecification specs; public final Set problems = new TreeSet<>(); public final List searchTermsHuman; public String domain; - public EdgeSearchQuery(EdgeSearchSpecification justSpecs) { + public SearchQuery(EdgeSearchSpecification justSpecs) { searchTermsHuman = new ArrayList<>(); specs = justSpecs; } diff --git a/services-core/search-service/src/main/java/nu/marginalia/search/query/model/UserSearchParameters.java b/services-core/search-service/src/main/java/nu/marginalia/search/query/model/UserSearchParameters.java new file mode 100644 index 00000000..94936416 --- /dev/null +++ b/services-core/search-service/src/main/java/nu/marginalia/search/query/model/UserSearchParameters.java @@ -0,0 +1,7 @@ +package nu.marginalia.search.query.model; + +import nu.marginalia.search.command.SearchJsParameter; +import nu.marginalia.search.model.SearchProfile; + +public record UserSearchParameters(String humanQuery, SearchProfile profile, SearchJsParameter jsSetting) { +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/results/BrowseResultCleaner.java b/services-core/search-service/src/main/java/nu/marginalia/search/results/BrowseResultCleaner.java similarity index 75% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/results/BrowseResultCleaner.java rename to services-core/search-service/src/main/java/nu/marginalia/search/results/BrowseResultCleaner.java index e178171c..d56094d8 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/results/BrowseResultCleaner.java +++ 
b/services-core/search-service/src/main/java/nu/marginalia/search/results/BrowseResultCleaner.java @@ -1,10 +1,10 @@ -package nu.marginalia.wmsa.edge.search.results; +package nu.marginalia.search.results; import com.google.inject.Inject; import com.google.inject.Singleton; -import nu.marginalia.wmsa.edge.assistant.screenshot.ScreenshotService; -import nu.marginalia.wmsa.edge.model.id.EdgeId; -import nu.marginalia.wmsa.edge.search.model.BrowseResult; +import nu.marginalia.browse.model.BrowseResult; +import nu.marginalia.screenshot.ScreenshotService; +import nu.marginalia.model.id.EdgeId; import java.util.HashSet; import java.util.Set; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/results/SearchResultDecorator.java b/services-core/search-service/src/main/java/nu/marginalia/search/results/SearchResultDecorator.java similarity index 60% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/results/SearchResultDecorator.java rename to services-core/search-service/src/main/java/nu/marginalia/search/results/SearchResultDecorator.java index 6eca931a..517c8e8a 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/results/SearchResultDecorator.java +++ b/services-core/search-service/src/main/java/nu/marginalia/search/results/SearchResultDecorator.java @@ -1,17 +1,17 @@ -package nu.marginalia.wmsa.edge.search.results; +package nu.marginalia.search.results; import com.google.inject.Inject; import gnu.trove.list.array.TIntArrayList; import gnu.trove.map.hash.TIntObjectHashMap; import it.unimi.dsi.fastutil.ints.Int2IntArrayMap; +import nu.marginalia.search.db.DbUrlDetailsQuery; +import nu.marginalia.model.EdgeUrl; +import nu.marginalia.model.crawl.EdgeDomainIndexingState; +import nu.marginalia.model.id.EdgeIdList; +import nu.marginalia.index.client.model.results.EdgeSearchResultItem; +import nu.marginalia.search.model.UrlDetails; +import nu.marginalia.search.valuation.SearchResultValuator; import 
nu.marginalia.util.BrailleBlockPunchCards; -import nu.marginalia.wmsa.edge.dbcommon.EdgeDataStoreDao; -import nu.marginalia.wmsa.edge.model.EdgeUrl; -import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState; -import nu.marginalia.wmsa.edge.model.id.EdgeIdList; -import nu.marginalia.wmsa.edge.model.search.EdgeSearchResultItem; -import nu.marginalia.wmsa.edge.model.search.EdgeUrlDetails; -import nu.marginalia.wmsa.edge.search.valuation.SearchResultValuator; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -19,32 +19,30 @@ import java.util.ArrayList; import java.util.List; public class SearchResultDecorator { - private final EdgeDataStoreDao edgeDataStoreDao; + private final DbUrlDetailsQuery dbUrlDetailsQuery; private final SearchResultValuator valuator; private final Logger logger = LoggerFactory.getLogger(getClass()); - private final boolean dumpTermData = Boolean.getBoolean("search-dump-term-data"); - @Inject - public SearchResultDecorator(EdgeDataStoreDao edgeDataStoreDao, SearchResultValuator valuator) { - this.edgeDataStoreDao = edgeDataStoreDao; + public SearchResultDecorator(DbUrlDetailsQuery dbUrlDetailsQuery, SearchResultValuator valuator) { + this.dbUrlDetailsQuery = dbUrlDetailsQuery; this.valuator = valuator; } - public List getAllUrlDetails(List resultItems) { - TIntObjectHashMap detailsById = new TIntObjectHashMap<>(resultItems.size()); + public List getAllUrlDetails(List resultItems) { + TIntObjectHashMap detailsById = new TIntObjectHashMap<>(resultItems.size()); EdgeIdList idList = resultItems.stream() .mapToInt(EdgeSearchResultItem::getUrlIdInt) .collect(EdgeIdList::new, EdgeIdList::add, EdgeIdList::addAll); - List ret = edgeDataStoreDao.getUrlDetailsMulti(idList); + List ret = dbUrlDetailsQuery.getUrlDetailsMulti(idList); for (var val : ret) { detailsById.put(val.id, val); } - List retList = new ArrayList<>(resultItems.size()); + List retList = new ArrayList<>(resultItems.size()); TIntArrayList missedIds = new TIntArrayList(); 
for (var resultItem : resultItems) { @@ -62,11 +60,9 @@ public class SearchResultDecorator { details.resultsFromSameDomain = resultItem.resultsFromDomain; details.termScore = calculateTermScore(resultItem, details); - details.positions = getPositions(resultItem); + details.positions = getPositionsString(resultItem); details.resultItem = resultItem; - logger.debug("{} -> {}", details.url, details.termScore); - retList.add(details); } if (!missedIds.isEmpty()) { @@ -76,7 +72,7 @@ public class SearchResultDecorator { return retList; } - private String getPositions(EdgeSearchResultItem resultItem) { + private String getPositionsString(EdgeSearchResultItem resultItem) { Int2IntArrayMap positionsPerSet = new Int2IntArrayMap(8); for (var score : resultItem.scores) { @@ -99,21 +95,10 @@ public class SearchResultDecorator { return a | b; } - private double calculateTermScore(EdgeSearchResultItem resultItem, EdgeUrlDetails details) { + private double calculateTermScore(EdgeSearchResultItem resultItem, UrlDetails details) { final double statePenalty = (details.domainState == EdgeDomainIndexingState.SPECIAL) ? 
1.25 : 0; - final double value = valuator.evaluateTerms(resultItem.scores, details.words, details.title.length()); - if (dumpTermData) { - System.out.println("---"); - System.out.println(details.getUrl()); - System.out.println(details.getTitle()); - System.out.println(details.words); - for (var score : resultItem.scores) { - System.out.println(score); - } - System.out.println(value); - } return value + statePenalty; } diff --git a/services-core/search-service/src/main/java/nu/marginalia/search/results/UrlDeduplicator.java b/services-core/search-service/src/main/java/nu/marginalia/search/results/UrlDeduplicator.java new file mode 100644 index 00000000..2f0f9f02 --- /dev/null +++ b/services-core/search-service/src/main/java/nu/marginalia/search/results/UrlDeduplicator.java @@ -0,0 +1,67 @@ +package nu.marginalia.search.results; + +import gnu.trove.list.TLongList; +import gnu.trove.list.array.TLongArrayList; +import gnu.trove.map.hash.TObjectIntHashMap; +import gnu.trove.set.hash.TIntHashSet; +import nu.marginalia.search.model.UrlDetails; +import nu.marginalia.lsh.EasyLSH; +import nu.marginalia.util.BrailleBlockPunchCards; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class UrlDeduplicator { + private final int LSH_SIMILARITY_THRESHOLD = 2; + private static final Logger logger = LoggerFactory.getLogger(UrlDeduplicator.class); + + private final TIntHashSet seenSuperficialhashes = new TIntHashSet(200); + private final TLongList seehLSHList = new TLongArrayList(200); + private final TObjectIntHashMap keyCount = new TObjectIntHashMap<>(200, 0.75f, 0); + + private final int resultsPerKey; + public UrlDeduplicator(int resultsPerKey) { + this.resultsPerKey = resultsPerKey; + } + + public boolean shouldRemove(UrlDetails details) { + return !filter(details); + } + + public synchronized boolean filter(UrlDetails details) { + return deduplicateOnSuperficialHash(details) + && deduplicateOnLSH(details) + && limitResultsPerDomain(details); + } + + + private 
boolean deduplicateOnSuperficialHash(UrlDetails details) { + return seenSuperficialhashes.add(details.getSuperficialHash()); + } + + private boolean deduplicateOnLSH(UrlDetails details) { + long thisHash = details.dataHash; + + if (seehLSHList.forEach(otherHash -> EasyLSH.hammingDistance(thisHash, otherHash) >= LSH_SIMILARITY_THRESHOLD)) + { + seehLSHList.add(thisHash); + return true; + } + return false; + + } + + private boolean limitResultsPerDomain(UrlDetails details) { + final var domain = details.getUrl().getDomain(); + final String key; + + if (!details.isSpecialDomain()) { + key = domain.getLongDomainKey(); + } + else { + key = domain.getDomainKey(); + } + + return keyCount.adjustOrPutValue(key, 1, 1) < resultsPerKey; + } + +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/siteinfo/DomainInformationService.java b/services-core/search-service/src/main/java/nu/marginalia/search/siteinfo/DomainInformationService.java similarity index 93% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/siteinfo/DomainInformationService.java rename to services-core/search-service/src/main/java/nu/marginalia/search/siteinfo/DomainInformationService.java index dce74c28..092f2aff 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/siteinfo/DomainInformationService.java +++ b/services-core/search-service/src/main/java/nu/marginalia/search/siteinfo/DomainInformationService.java @@ -1,12 +1,12 @@ -package nu.marginalia.wmsa.edge.search.siteinfo; +package nu.marginalia.search.siteinfo; import com.zaxxer.hikari.HikariDataSource; import lombok.SneakyThrows; -import nu.marginalia.wmsa.edge.dbcommon.EdgeDataStoreDaoImpl; -import nu.marginalia.wmsa.edge.model.EdgeDomain; -import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState; -import nu.marginalia.wmsa.edge.model.id.EdgeId; -import nu.marginalia.wmsa.edge.search.model.DomainInformation; +import nu.marginalia.model.EdgeDomain; +import 
nu.marginalia.model.crawl.EdgeDomainIndexingState; +import nu.marginalia.model.dbcommon.DbDomainQueries; +import nu.marginalia.model.id.EdgeId; +import nu.marginalia.search.model.DomainInformation; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -27,15 +27,15 @@ import java.util.Optional; @Singleton public class DomainInformationService { - private EdgeDataStoreDaoImpl dataStoreDao; + private DbDomainQueries dbDomainQueries; private HikariDataSource dataSource; private final Logger logger = LoggerFactory.getLogger(getClass()); @Inject public DomainInformationService( - EdgeDataStoreDaoImpl dataStoreDao, + DbDomainQueries dbDomainQueries, HikariDataSource dataSource) { - this.dataStoreDao = dataStoreDao; + this.dbDomainQueries = dbDomainQueries; this.dataSource = dataSource; } @@ -47,7 +47,7 @@ public class DomainInformationService { return Optional.empty(); } - Optional domain = dataStoreDao.getDomain(domainId); + Optional domain = dbDomainQueries.getDomain(domainId); if (domain.isEmpty()) { return Optional.empty(); } @@ -103,7 +103,7 @@ public class DomainInformationService { private EdgeId getDomainFromPartial(String site) { try { - return dataStoreDao.getDomainId(new EdgeDomain(site)); + return dbDomainQueries.getDomainId(new EdgeDomain(site)); } catch (Exception ex) { return null; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/svc/EdgeSearchAddToCrawlQueueService.java b/services-core/search-service/src/main/java/nu/marginalia/search/svc/SearchAddToCrawlQueueService.java similarity index 66% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/svc/EdgeSearchAddToCrawlQueueService.java rename to services-core/search-service/src/main/java/nu/marginalia/search/svc/SearchAddToCrawlQueueService.java index 694ebc5e..8e4faff5 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/svc/EdgeSearchAddToCrawlQueueService.java +++ 
b/services-core/search-service/src/main/java/nu/marginalia/search/svc/SearchAddToCrawlQueueService.java @@ -1,10 +1,10 @@ -package nu.marginalia.wmsa.edge.search.svc; +package nu.marginalia.search.svc; import com.google.inject.Inject; import com.zaxxer.hikari.HikariDataSource; -import nu.marginalia.wmsa.configuration.WebsiteUrl; -import nu.marginalia.wmsa.edge.dbcommon.EdgeDataStoreDao; -import nu.marginalia.wmsa.edge.model.id.EdgeId; +import nu.marginalia.WebsiteUrl; +import nu.marginalia.model.dbcommon.DbDomainQueries; +import nu.marginalia.model.id.EdgeId; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import spark.Request; @@ -13,18 +13,18 @@ import spark.Spark; import java.sql.SQLException; -public class EdgeSearchAddToCrawlQueueService { +public class SearchAddToCrawlQueueService { - private EdgeDataStoreDao edgeDataStoreDao; - private WebsiteUrl websiteUrl; - private HikariDataSource dataSource; - private final Logger logger = LoggerFactory.getLogger(EdgeSearchAddToCrawlQueueService.class); + private final DbDomainQueries domainQueries; + private final WebsiteUrl websiteUrl; + private final HikariDataSource dataSource; + private final Logger logger = LoggerFactory.getLogger(SearchAddToCrawlQueueService.class); @Inject - public EdgeSearchAddToCrawlQueueService(EdgeDataStoreDao edgeDataStoreDao, - WebsiteUrl websiteUrl, - HikariDataSource dataSource) { - this.edgeDataStoreDao = edgeDataStoreDao; + public SearchAddToCrawlQueueService(DbDomainQueries domainQueries, + WebsiteUrl websiteUrl, + HikariDataSource dataSource) { + this.domainQueries = domainQueries; this.websiteUrl = websiteUrl; this.dataSource = dataSource; } @@ -61,7 +61,7 @@ public class EdgeSearchAddToCrawlQueueService { } private String getDomainName(int id) { - var domain = edgeDataStoreDao.getDomain(new EdgeId<>(id)); + var domain = domainQueries.getDomain(new EdgeId<>(id)); if (domain.isEmpty()) Spark.halt(404); return domain.get().toString(); diff --git 
a/services-core/search-service/src/main/java/nu/marginalia/search/svc/SearchApiQueryService.java b/services-core/search-service/src/main/java/nu/marginalia/search/svc/SearchApiQueryService.java new file mode 100644 index 00000000..6fc031be --- /dev/null +++ b/services-core/search-service/src/main/java/nu/marginalia/search/svc/SearchApiQueryService.java @@ -0,0 +1,98 @@ +package nu.marginalia.search.svc; + +import com.google.common.base.Strings; +import com.google.inject.Inject; +import lombok.SneakyThrows; +import nu.marginalia.index.client.model.results.EdgeSearchResultKeywordScore; +import nu.marginalia.search.client.model.ApiSearchResultQueryDetails; +import nu.marginalia.model.idx.EdgePageWordMetadata; +import nu.marginalia.search.SearchOperator; +import nu.marginalia.search.model.UrlDetails; +import nu.marginalia.search.client.model.ApiSearchResult; +import nu.marginalia.search.client.model.ApiSearchResults; +import nu.marginalia.search.model.SearchProfile; +import nu.marginalia.client.Context; +import nu.marginalia.search.command.SearchJsParameter; +import nu.marginalia.search.query.model.UserSearchParameters; +import spark.Request; +import spark.Response; + +import java.util.ArrayList; +import java.util.List; +import java.util.Set; +import java.util.stream.Collectors; + +public class SearchApiQueryService { + private SearchOperator searchOperator; + + @Inject + public SearchApiQueryService(SearchOperator searchOperator) { + this.searchOperator = searchOperator; + } + + @SneakyThrows + public Object apiSearch(Request request, Response response) { + + final var ctx = Context.fromRequest(request); + final String queryParam = request.queryParams("query"); + final int limit; + SearchProfile profile = SearchProfile.YOLO; + + String count = request.queryParamOrDefault("count", "20"); + limit = Integer.parseInt(count); + + String index = request.queryParamOrDefault("index", "0"); + if (!Strings.isNullOrEmpty(index)) { + profile = switch (index) { + case "0" -> 
SearchProfile.YOLO; + case "1" -> SearchProfile.MODERN; + case "2" -> SearchProfile.DEFAULT; + case "3" -> SearchProfile.CORPO_CLEAN; + default -> SearchProfile.CORPO_CLEAN; + }; + } + + final String humanQuery = queryParam.trim(); + + var results = searchOperator.doApiSearch(ctx, new UserSearchParameters(humanQuery, profile, SearchJsParameter.DEFAULT)); + + return new ApiSearchResults("RESTRICTED", humanQuery, results.stream().map(this::convert).limit(limit).collect(Collectors.toList())); + } + + ApiSearchResult convert(UrlDetails url) { + List> details = new ArrayList<>(); + if (url.resultItem != null) { + var bySet = url.resultItem.scores.stream().collect(Collectors.groupingBy(EdgeSearchResultKeywordScore::set)); + + outer: + for (var entries : bySet.values()) { + List lst = new ArrayList<>(); + for (var entry : entries) { + var metadata = new EdgePageWordMetadata(entry.encodedWordMetadata()); + if (metadata.isEmpty()) + continue outer; + + Set flags = metadata.flagSet().stream().map(Object::toString).collect(Collectors.toSet()); + lst.add(new ApiSearchResultQueryDetails(entry.keyword(), metadata.tfIdf(), metadata.count(), flags)); + } + details.add(lst); + } + } + + return new ApiSearchResult( + url.url.toString(), + url.getTitle(), + url.getDescription(), + sanitizeNaN(url.getTermScore(), -100), + details + ); + } + + private double sanitizeNaN(double value, double alternative) { + if (!Double.isFinite(value)) { + return alternative; + } + return value; + } + +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/svc/EdgeSearchErrorPageService.java b/services-core/search-service/src/main/java/nu/marginalia/search/svc/SearchErrorPageService.java similarity index 94% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/svc/EdgeSearchErrorPageService.java rename to services-core/search-service/src/main/java/nu/marginalia/search/svc/SearchErrorPageService.java index 54e75178..07e60ca2 100644 --- 
a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/svc/EdgeSearchErrorPageService.java +++ b/services-core/search-service/src/main/java/nu/marginalia/search/svc/SearchErrorPageService.java @@ -1,18 +1,18 @@ -package nu.marginalia.wmsa.edge.search.svc; +package nu.marginalia.search.svc; import com.google.inject.Inject; -import nu.marginalia.wmsa.configuration.server.Context; -import nu.marginalia.wmsa.edge.index.client.EdgeIndexClient; +import nu.marginalia.client.Context; +import nu.marginalia.index.client.EdgeIndexClient; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import spark.Response; -public class EdgeSearchErrorPageService { +public class SearchErrorPageService { private final EdgeIndexClient indexClient; private final Logger logger = LoggerFactory.getLogger(getClass()); @Inject - public EdgeSearchErrorPageService(EdgeIndexClient indexClient) { + public SearchErrorPageService(EdgeIndexClient indexClient) { this.indexClient = indexClient; } @@ -45,6 +45,7 @@ public class EdgeSearchErrorPageService { } } catch (Exception ex) { + logger.warn("Error during rendering of error page", ex); rsp.body(renderError("Error processing error", """ An error has occurred, additionally, an error occurred while handling that error diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/svc/EdgeSearchFlagSiteService.java b/services-core/search-service/src/main/java/nu/marginalia/search/svc/SearchFlagSiteService.java similarity index 91% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/svc/EdgeSearchFlagSiteService.java rename to services-core/search-service/src/main/java/nu/marginalia/search/svc/SearchFlagSiteService.java index fec5c737..5eb960a5 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/svc/EdgeSearchFlagSiteService.java +++ b/services-core/search-service/src/main/java/nu/marginalia/search/svc/SearchFlagSiteService.java @@ -1,9 +1,9 @@ -package nu.marginalia.wmsa.edge.search.svc; +package 
nu.marginalia.search.svc; import com.google.inject.Inject; import com.zaxxer.hikari.HikariDataSource; -import nu.marginalia.wmsa.renderer.mustache.MustacheRenderer; -import nu.marginalia.wmsa.renderer.mustache.RendererFactory; +import nu.marginalia.renderer.MustacheRenderer; +import nu.marginalia.renderer.RendererFactory; import spark.Request; import spark.Response; import spark.Spark; @@ -17,7 +17,7 @@ import java.util.Map; import java.util.function.Function; import java.util.stream.Collectors; -public class EdgeSearchFlagSiteService { +public class SearchFlagSiteService { private final MustacheRenderer formTemplate; private final HikariDataSource dataSource; @@ -35,9 +35,9 @@ public class EdgeSearchFlagSiteService { private final Map categoryItemMap = categories.stream().collect(Collectors.toMap(CategoryItem::categoryName, Function.identity())); @Inject - public EdgeSearchFlagSiteService(RendererFactory rendererFactory, - HikariDataSource dataSource) throws IOException { - formTemplate = rendererFactory.renderer("edge/indict/indict-form"); + public SearchFlagSiteService(RendererFactory rendererFactory, + HikariDataSource dataSource) throws IOException { + formTemplate = rendererFactory.renderer("search/indict/indict-form"); this.dataSource = dataSource; } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/svc/EdgeSearchQueryIndexService.java b/services-core/search-service/src/main/java/nu/marginalia/search/svc/SearchQueryIndexService.java similarity index 61% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/svc/EdgeSearchQueryIndexService.java rename to services-core/search-service/src/main/java/nu/marginalia/search/svc/SearchQueryIndexService.java index c52584ca..c70ea2cc 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/svc/EdgeSearchQueryIndexService.java +++ b/services-core/search-service/src/main/java/nu/marginalia/search/svc/SearchQueryIndexService.java @@ -1,40 +1,52 @@ -package 
nu.marginalia.wmsa.edge.search.svc; +package nu.marginalia.search.svc; import com.google.inject.Inject; import com.google.inject.Singleton; -import nu.marginalia.wmsa.configuration.server.Context; -import nu.marginalia.wmsa.edge.index.client.EdgeIndexClient; -import nu.marginalia.wmsa.edge.index.model.QueryLimits; -import nu.marginalia.wmsa.edge.index.model.QueryStrategy; -import nu.marginalia.wmsa.edge.model.search.*; -import nu.marginalia.wmsa.edge.model.search.domain.SpecificationLimit; -import nu.marginalia.wmsa.edge.search.model.EdgeSearchProfile; -import nu.marginalia.wmsa.edge.search.query.model.EdgeSearchQuery; -import nu.marginalia.wmsa.edge.search.results.SearchResultDecorator; -import nu.marginalia.wmsa.edge.search.results.UrlDeduplicator; +import nu.marginalia.index.client.EdgeIndexClient; +import nu.marginalia.index.client.model.results.EdgeSearchResultItem; +import nu.marginalia.index.client.model.query.EdgeSearchSpecification; +import nu.marginalia.index.client.model.query.EdgeSearchSubquery; +import nu.marginalia.index.query.limit.QueryLimits; +import nu.marginalia.index.query.limit.QueryStrategy; +import nu.marginalia.index.query.limit.SpecificationLimit; +import nu.marginalia.search.model.PageScoreAdjustment; +import nu.marginalia.search.model.UrlDetails; +import nu.marginalia.search.model.SearchProfile; +import nu.marginalia.search.results.SearchResultDecorator; +import nu.marginalia.search.results.UrlDeduplicator; +import nu.marginalia.client.Context; +import nu.marginalia.search.query.model.SearchQuery; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import java.util.*; import java.util.regex.Pattern; @Singleton -public class EdgeSearchQueryIndexService { +public class SearchQueryIndexService { + private final Logger logger = LoggerFactory.getLogger(getClass()); private final SearchResultDecorator resultDecorator; - private final Comparator resultListComparator; + private final Comparator resultListComparator; private final 
EdgeIndexClient indexClient; @Inject - public EdgeSearchQueryIndexService(SearchResultDecorator resultDecorator, EdgeIndexClient indexClient) { + public SearchQueryIndexService(SearchResultDecorator resultDecorator, EdgeIndexClient indexClient) { this.resultDecorator = resultDecorator; this.indexClient = indexClient; - Comparator c = Comparator.comparing(ud -> Math.round(10*(ud.getTermScore() - ud.rankingIdAdjustment()))); + Comparator c = Comparator.comparing(ud -> Math.round(10*(ud.getTermScore() - ud.rankingIdAdjustment()))); resultListComparator = c - .thenComparing(EdgeUrlDetails::getRanking) - .thenComparing(EdgeUrlDetails::getId); + .thenComparing(UrlDetails::getRanking) + .thenComparing(UrlDetails::getId); } - public List performDumbQuery(Context ctx, EdgeSearchProfile profile, int limitPerDomain, int limitTotal, String... termsInclude) { + public List performDumbQuery(Context ctx, + SearchProfile profile, + int limitPerDomain, + int limitTotal, + String... termsInclude) + { List sqs = new ArrayList<>(); sqs.add(new EdgeSearchSubquery( @@ -57,30 +69,30 @@ public class EdgeSearchQueryIndexService { .queryStrategy(QueryStrategy.AUTO) .build(); - return performQuery(ctx, new EdgeSearchQuery(specs)); + return performQuery(ctx, new SearchQuery(specs)); } - public List performQuery(Context ctx, EdgeSearchQuery processedQuery) { - + public List performQuery(Context ctx, SearchQuery processedQuery) { final List results = indexClient.query(ctx, processedQuery.specs); - final List resultList = new ArrayList<>(results.size()); + List urlDetails = resultDecorator.getAllUrlDetails(results); - for (var details : resultDecorator.getAllUrlDetails(results)) { - details = details.withUrlQualityAdjustment( - adjustScoreBasedOnQuery(details, processedQuery.specs)); + urlDetails.replaceAll(details -> + details.withUrlQualityAdjustment(adjustScoreBasedOnQuery(details, processedQuery.specs)) + ); - resultList.add(details); - } + urlDetails.sort(resultListComparator); - 
resultList.sort(resultListComparator); + return limitAndDeduplicateResults(processedQuery, urlDetails); + } + private List limitAndDeduplicateResults(SearchQuery processedQuery, List decoratedResults) { var limits = processedQuery.specs.queryLimits; UrlDeduplicator deduplicator = new UrlDeduplicator(limits.resultsByDomain()); - List retList = new ArrayList<>(limits.resultsTotal()); + List retList = new ArrayList<>(limits.resultsTotal()); - for (var item : resultList) { + for (var item : decoratedResults) { if (retList.size() >= limits.resultsTotal()) break; @@ -94,7 +106,7 @@ public class EdgeSearchQueryIndexService { private final Pattern titleSplitPattern = Pattern.compile("[:!|./]|(\\s-|-\\s)|\\s{2,}"); - private EdgePageScoreAdjustment adjustScoreBasedOnQuery(EdgeUrlDetails p, EdgeSearchSpecification specs) { + private PageScoreAdjustment adjustScoreBasedOnQuery(UrlDetails p, EdgeSearchSpecification specs) { String titleLC = p.title == null ? "" : p.title.toLowerCase(); String descLC = p.description == null ? "" : p.description.toLowerCase(); String urlLC = p.url == null ? "" : p.url.path.toLowerCase(); @@ -135,7 +147,7 @@ public class EdgeSearchQueryIndexService { .sum(); } - return EdgePageScoreAdjustment.builder() + return PageScoreAdjustment.builder() .descAdj(Math.min(termCount, descHits) / (10. * termCount)) .descHitsAdj(descHitsAdj / 10.) 
.domainAdj(2 * Math.min(termCount, domainHits) / (double) termCount) diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/svc/EdgeSearchQueryService.java b/services-core/search-service/src/main/java/nu/marginalia/search/svc/SearchQueryService.java similarity index 67% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/svc/EdgeSearchQueryService.java rename to services-core/search-service/src/main/java/nu/marginalia/search/svc/SearchQueryService.java index 41a50ee6..97151015 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/svc/EdgeSearchQueryService.java +++ b/services-core/search-service/src/main/java/nu/marginalia/search/svc/SearchQueryService.java @@ -1,14 +1,14 @@ -package nu.marginalia.wmsa.edge.search.svc; +package nu.marginalia.search.svc; import com.google.inject.Inject; import lombok.SneakyThrows; -import nu.marginalia.wmsa.configuration.WebsiteUrl; -import nu.marginalia.wmsa.configuration.server.Context; -import nu.marginalia.wmsa.edge.search.command.CommandEvaluator; -import nu.marginalia.wmsa.edge.search.command.SearchJsParameter; -import nu.marginalia.wmsa.edge.search.command.SearchParameters; -import nu.marginalia.wmsa.edge.search.exceptions.RedirectException; -import nu.marginalia.wmsa.edge.search.model.EdgeSearchProfile; +import nu.marginalia.WebsiteUrl; +import nu.marginalia.search.model.SearchProfile; +import nu.marginalia.client.Context; +import nu.marginalia.search.command.CommandEvaluator; +import nu.marginalia.search.command.SearchJsParameter; +import nu.marginalia.search.command.SearchParameters; +import nu.marginalia.search.exceptions.RedirectException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import spark.Request; @@ -16,17 +16,17 @@ import spark.Response; import java.util.Optional; -public class EdgeSearchQueryService { +public class SearchQueryService { private WebsiteUrl websiteUrl; - private final EdgeSearchErrorPageService errorPageService; + private final 
SearchErrorPageService errorPageService; private final CommandEvaluator searchCommandEvaulator; private final Logger logger = LoggerFactory.getLogger(getClass()); @Inject - public EdgeSearchQueryService( + public SearchQueryService( WebsiteUrl websiteUrl, - EdgeSearchErrorPageService errorPageService, + SearchErrorPageService errorPageService, CommandEvaluator searchCommandEvaulator) { this.websiteUrl = websiteUrl; this.errorPageService = errorPageService; @@ -44,11 +44,11 @@ public class EdgeSearchQueryService { return null; } - final String profileStr = Optional.ofNullable(request.queryParams("profile")).orElse(EdgeSearchProfile.YOLO.name); + final String profileStr = Optional.ofNullable(request.queryParams("profile")).orElse(SearchProfile.YOLO.name); final String humanQuery = queryParam.trim(); var params = new SearchParameters( - EdgeSearchProfile.getSearchProfile(profileStr), + SearchProfile.getSearchProfile(profileStr), SearchJsParameter.parse(request.queryParams("js")), Boolean.parseBoolean(request.queryParams("detailed")) ); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/svc/EdgeSearchUnitConversionService.java b/services-core/search-service/src/main/java/nu/marginalia/search/svc/SearchUnitConversionService.java similarity index 86% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/svc/EdgeSearchUnitConversionService.java rename to services-core/search-service/src/main/java/nu/marginalia/search/svc/SearchUnitConversionService.java index 491a1361..d2d75bcf 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/svc/EdgeSearchUnitConversionService.java +++ b/services-core/search-service/src/main/java/nu/marginalia/search/svc/SearchUnitConversionService.java @@ -1,8 +1,8 @@ -package nu.marginalia.wmsa.edge.search.svc; +package nu.marginalia.search.svc; -import nu.marginalia.wmsa.client.exception.RemoteException; -import nu.marginalia.wmsa.configuration.server.Context; -import 
nu.marginalia.wmsa.edge.assistant.client.AssistantClient; +import nu.marginalia.assistant.client.AssistantClient; +import nu.marginalia.client.exception.RemoteException; +import nu.marginalia.client.Context; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -15,7 +15,7 @@ import java.util.function.Predicate; import java.util.regex.Pattern; @Singleton -public class EdgeSearchUnitConversionService { +public class SearchUnitConversionService { private final Logger logger = LoggerFactory.getLogger(getClass()); private final Pattern conversionPattern = Pattern.compile("((\\d+|\\s+|[.()\\-^+%*/]|log[^a-z]|log2[^a-z]|sqrt[^a-z]|log10|cos[^a-z]|sin[^a-z]|tan[^a-z]|log2|pi[^a-z]|e[^a-z]|2pi[^a-z])+)\\s*([a-zA-Z][a-zA-Z^.0-9]*\\s?[a-zA-Z^.0-9]*)\\s+in\\s+([a-zA-Z^.0-9]+\\s?[a-zA-Z^.0-9]*)"); private final Predicate evalPredicate = Pattern.compile("(\\d+|\\s+|[.()\\-^+%*/]|log|log2|sqrt|log10|cos|sin|tan|pi|e|2pi)+").asMatchPredicate(); @@ -23,7 +23,7 @@ public class EdgeSearchUnitConversionService { private final AssistantClient assistantClient; @Inject - public EdgeSearchUnitConversionService(AssistantClient assistantClient) { + public SearchUnitConversionService(AssistantClient assistantClient) { this.assistantClient = assistantClient; } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/valuation/SearchResultValuator.java b/services-core/search-service/src/main/java/nu/marginalia/search/valuation/SearchResultValuator.java similarity index 96% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/valuation/SearchResultValuator.java rename to services-core/search-service/src/main/java/nu/marginalia/search/valuation/SearchResultValuator.java index 3395b019..a5233ad0 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/valuation/SearchResultValuator.java +++ b/services-core/search-service/src/main/java/nu/marginalia/search/valuation/SearchResultValuator.java @@ -1,13 +1,13 @@ -package 
nu.marginalia.wmsa.edge.search.valuation; +package nu.marginalia.search.valuation; import com.google.inject.Inject; import com.google.inject.Singleton; -import nu.marginalia.util.language.WordPatterns; -import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict; -import nu.marginalia.wmsa.edge.index.model.EdgePageWordFlags; -import nu.marginalia.wmsa.edge.index.model.EdgePageWordMetadata; -import nu.marginalia.wmsa.edge.model.search.EdgeSearchResultKeywordScore; -import nu.marginalia.wmsa.edge.model.search.EdgeSearchSubquery; +import nu.marginalia.language.statistics.TermFrequencyDict; +import nu.marginalia.model.crawl.EdgePageWordFlags; +import nu.marginalia.model.idx.EdgePageWordMetadata; +import nu.marginalia.index.client.model.results.EdgeSearchResultKeywordScore; +import nu.marginalia.index.client.model.query.EdgeSearchSubquery; +import nu.marginalia.language.WordPatterns; import org.jetbrains.annotations.NotNull; import java.util.Arrays; diff --git a/marginalia_nu/src/main/resources/static/edge/about.html b/services-core/search-service/src/main/resources/static/search/about.html similarity index 100% rename from marginalia_nu/src/main/resources/static/edge/about.html rename to services-core/search-service/src/main/resources/static/search/about.html diff --git a/marginalia_nu/src/main/resources/static/edge/changelog.html b/services-core/search-service/src/main/resources/static/search/changelog.html similarity index 100% rename from marginalia_nu/src/main/resources/static/edge/changelog.html rename to services-core/search-service/src/main/resources/static/search/changelog.html diff --git a/marginalia_nu/src/main/resources/static/edge/crawler-ips.txt b/services-core/search-service/src/main/resources/static/search/crawler-ips.txt similarity index 100% rename from marginalia_nu/src/main/resources/static/edge/crawler-ips.txt rename to services-core/search-service/src/main/resources/static/search/crawler-ips.txt diff --git 
a/marginalia_nu/src/main/resources/static/edge/error.html b/services-core/search-service/src/main/resources/static/search/error.html similarity index 100% rename from marginalia_nu/src/main/resources/static/edge/error.html rename to services-core/search-service/src/main/resources/static/search/error.html diff --git a/marginalia_nu/src/main/resources/static/edge/favicon.ico b/services-core/search-service/src/main/resources/static/search/favicon.ico similarity index 100% rename from marginalia_nu/src/main/resources/static/edge/favicon.ico rename to services-core/search-service/src/main/resources/static/search/favicon.ico diff --git a/marginalia_nu/src/main/resources/static/edge/known-issues.html b/services-core/search-service/src/main/resources/static/search/known-issues.html similarity index 100% rename from marginalia_nu/src/main/resources/static/edge/known-issues.html rename to services-core/search-service/src/main/resources/static/search/known-issues.html diff --git a/marginalia_nu/src/main/resources/static/edge/maintenance.html b/services-core/search-service/src/main/resources/static/search/maintenance.html similarity index 100% rename from marginalia_nu/src/main/resources/static/edge/maintenance.html rename to services-core/search-service/src/main/resources/static/search/maintenance.html diff --git a/marginalia_nu/src/main/resources/static/edge/notes.html b/services-core/search-service/src/main/resources/static/search/notes.html similarity index 100% rename from marginalia_nu/src/main/resources/static/edge/notes.html rename to services-core/search-service/src/main/resources/static/search/notes.html diff --git a/marginalia_nu/src/main/resources/static/edge/opensearch.xml b/services-core/search-service/src/main/resources/static/search/opensearch.xml similarity index 100% rename from marginalia_nu/src/main/resources/static/edge/opensearch.xml rename to services-core/search-service/src/main/resources/static/search/opensearch.xml diff --git 
a/marginalia_nu/src/main/resources/static/edge/robots.txt b/services-core/search-service/src/main/resources/static/search/robots.txt similarity index 100% rename from marginalia_nu/src/main/resources/static/edge/robots.txt rename to services-core/search-service/src/main/resources/static/search/robots.txt diff --git a/marginalia_nu/src/main/resources/static/edge/style-new.css b/services-core/search-service/src/main/resources/static/search/style-new.css similarity index 100% rename from marginalia_nu/src/main/resources/static/edge/style-new.css rename to services-core/search-service/src/main/resources/static/search/style-new.css diff --git a/marginalia_nu/src/main/resources/static/edge/tts.js b/services-core/search-service/src/main/resources/static/search/tts.js similarity index 100% rename from marginalia_nu/src/main/resources/static/edge/tts.js rename to services-core/search-service/src/main/resources/static/search/tts.js diff --git a/marginalia_nu/src/main/resources/static/edge/wiki-clean.html b/services-core/search-service/src/main/resources/static/search/wiki-clean.html similarity index 100% rename from marginalia_nu/src/main/resources/static/edge/wiki-clean.html rename to services-core/search-service/src/main/resources/static/search/wiki-clean.html diff --git a/marginalia_nu/src/main/resources/templates/edge/browse-result.hdb b/services-core/search-service/src/main/resources/templates/search/browse-result.hdb similarity index 100% rename from marginalia_nu/src/main/resources/templates/edge/browse-result.hdb rename to services-core/search-service/src/main/resources/templates/search/browse-result.hdb diff --git a/marginalia_nu/src/main/resources/templates/edge/browse-results.hdb b/services-core/search-service/src/main/resources/templates/search/browse-results.hdb similarity index 86% rename from marginalia_nu/src/main/resources/templates/edge/browse-results.hdb rename to services-core/search-service/src/main/resources/templates/search/browse-results.hdb index 
a6a8b0f8..82df7343 100644 --- a/marginalia_nu/src/main/resources/templates/edge/browse-results.hdb +++ b/services-core/search-service/src/main/resources/templates/search/browse-results.hdb @@ -11,10 +11,10 @@ -{{>edge/parts/search-header}} +{{>search/parts/search-header}}
- {{>edge/parts/search-form}} + {{>search/parts/search-form}}
@@ -28,7 +28,7 @@
{{/if}} -{{#each results}}{{>edge/browse-result}}{{/each}} +{{#each results}}{{>search/browse-result}}{{/each}} {{#unless focusDomain}}
@@ -45,5 +45,5 @@
-{{>edge/parts/search-footer}} +{{>search/parts/search-footer}} diff --git a/marginalia_nu/src/main/resources/templates/edge/conversion-results.hdb b/services-core/search-service/src/main/resources/templates/search/conversion-results.hdb similarity index 90% rename from marginalia_nu/src/main/resources/templates/edge/conversion-results.hdb rename to services-core/search-service/src/main/resources/templates/search/conversion-results.hdb index 50e5840b..85ff5750 100644 --- a/marginalia_nu/src/main/resources/templates/edge/conversion-results.hdb +++ b/services-core/search-service/src/main/resources/templates/search/conversion-results.hdb @@ -11,10 +11,10 @@ -{{>edge/parts/search-header}} +{{>search/parts/search-header}}
- {{>edge/parts/search-form}} + {{>search/parts/search-form}}
@@ -33,5 +33,5 @@
-{{>edge/parts/search-footer}} +{{>search/parts/search-footer}} \ No newline at end of file diff --git a/marginalia_nu/src/main/resources/templates/edge/dictionary-results.hdb b/services-core/search-service/src/main/resources/templates/search/dictionary-results.hdb similarity index 92% rename from marginalia_nu/src/main/resources/templates/edge/dictionary-results.hdb rename to services-core/search-service/src/main/resources/templates/search/dictionary-results.hdb index b7888418..ed43e2a7 100644 --- a/marginalia_nu/src/main/resources/templates/edge/dictionary-results.hdb +++ b/services-core/search-service/src/main/resources/templates/search/dictionary-results.hdb @@ -11,10 +11,10 @@ -{{>edge/parts/search-header}} +{{>search/parts/search-header}}
- {{>edge/parts/search-form}} + {{>search/parts/search-form}}
{{#unless entries}} @@ -44,5 +44,5 @@
-{{>edge/parts/search-footer}} +{{>search/parts/search-footer}} diff --git a/marginalia_nu/src/main/resources/templates/edge/error-page.hdb b/services-core/search-service/src/main/resources/templates/search/error-page.hdb similarity index 100% rename from marginalia_nu/src/main/resources/templates/edge/error-page.hdb rename to services-core/search-service/src/main/resources/templates/search/error-page.hdb diff --git a/marginalia_nu/src/main/resources/templates/edge/index.hdb b/services-core/search-service/src/main/resources/templates/search/index.hdb similarity index 86% rename from marginalia_nu/src/main/resources/templates/edge/index.hdb rename to services-core/search-service/src/main/resources/templates/search/index.hdb index fe30d9d6..06942ccf 100644 --- a/marginalia_nu/src/main/resources/templates/edge/index.hdb +++ b/services-core/search-service/src/main/resources/templates/search/index.hdb @@ -19,10 +19,10 @@ -{{>edge/parts/search-header}} +{{>search/parts/search-header}} -{{>edge/parts/search-footer}} +{{>search/parts/search-footer}} diff --git a/marginalia_nu/src/main/resources/templates/edge/indict/indict-form.hdb b/services-core/search-service/src/main/resources/templates/search/indict/indict-form.hdb similarity index 95% rename from marginalia_nu/src/main/resources/templates/edge/indict/indict-form.hdb rename to services-core/search-service/src/main/resources/templates/search/indict/indict-form.hdb index 2fa75eab..5a8ebbce 100644 --- a/marginalia_nu/src/main/resources/templates/edge/indict/indict-form.hdb +++ b/services-core/search-service/src/main/resources/templates/search/indict/indict-form.hdb @@ -12,10 +12,10 @@ -{{>edge/parts/search-header}} +{{>search/parts/search-header}}
-{{>edge/parts/search-form}} +{{>search/parts/search-form}}
@@ -76,5 +76,5 @@ you may also reach a human being through email at kontakt@marginalia.nu {{/if}}
-{{>edge/parts/search-footer}} +{{>search/parts/search-footer}} diff --git a/marginalia_nu/src/main/resources/templates/edge/parts/search-footer.hdb b/services-core/search-service/src/main/resources/templates/search/parts/search-footer.hdb similarity index 100% rename from marginalia_nu/src/main/resources/templates/edge/parts/search-footer.hdb rename to services-core/search-service/src/main/resources/templates/search/parts/search-footer.hdb diff --git a/marginalia_nu/src/main/resources/templates/edge/parts/search-form.hdb b/services-core/search-service/src/main/resources/templates/search/parts/search-form.hdb similarity index 95% rename from marginalia_nu/src/main/resources/templates/edge/parts/search-form.hdb rename to services-core/search-service/src/main/resources/templates/search/parts/search-form.hdb index 21c3a1a8..ecc35ead 100644 --- a/marginalia_nu/src/main/resources/templates/edge/parts/search-form.hdb +++ b/services-core/search-service/src/main/resources/templates/search/parts/search-form.hdb @@ -28,7 +28,7 @@ \ No newline at end of file diff --git a/marginalia_nu/src/main/resources/templates/edge/parts/search-header.hdb b/services-core/search-service/src/main/resources/templates/search/parts/search-header.hdb similarity index 100% rename from marginalia_nu/src/main/resources/templates/edge/parts/search-header.hdb rename to services-core/search-service/src/main/resources/templates/search/parts/search-header.hdb diff --git a/marginalia_nu/src/main/resources/templates/edge/parts/site-info-index.hdb b/services-core/search-service/src/main/resources/templates/search/parts/site-info-index.hdb similarity index 100% rename from marginalia_nu/src/main/resources/templates/edge/parts/site-info-index.hdb rename to services-core/search-service/src/main/resources/templates/search/parts/site-info-index.hdb diff --git a/marginalia_nu/src/main/resources/templates/edge/parts/site-info-links.hdb 
b/services-core/search-service/src/main/resources/templates/search/parts/site-info-links.hdb similarity index 100% rename from marginalia_nu/src/main/resources/templates/edge/parts/site-info-links.hdb rename to services-core/search-service/src/main/resources/templates/search/parts/site-info-links.hdb diff --git a/marginalia_nu/src/main/resources/templates/edge/search-result-metadata.hdb b/services-core/search-service/src/main/resources/templates/search/search-result-metadata.hdb similarity index 100% rename from marginalia_nu/src/main/resources/templates/edge/search-result-metadata.hdb rename to services-core/search-service/src/main/resources/templates/search/search-result-metadata.hdb diff --git a/marginalia_nu/src/main/resources/templates/edge/search-result.hdb b/services-core/search-service/src/main/resources/templates/search/search-result.hdb similarity index 92% rename from marginalia_nu/src/main/resources/templates/edge/search-result.hdb rename to services-core/search-service/src/main/resources/templates/search/search-result.hdb index c5d220c1..59460c8f 100644 --- a/marginalia_nu/src/main/resources/templates/edge/search-result.hdb +++ b/services-core/search-service/src/main/resources/templates/search/search-result.hdb @@ -9,7 +9,7 @@ Info {{#if hasMoreResults}}{{resultsFromSameDomain}}+{{/if}} {{/unless}} -
{{>edge/search-result-metadata}}
+
{{>search/search-result-metadata}}

diff --git a/marginalia_nu/src/main/resources/templates/edge/search-results.hdb b/services-core/search-service/src/main/resources/templates/search/search-results.hdb similarity index 89% rename from marginalia_nu/src/main/resources/templates/edge/search-results.hdb rename to services-core/search-service/src/main/resources/templates/search/search-results.hdb index b0a0848e..d6af3455 100644 --- a/marginalia_nu/src/main/resources/templates/edge/search-results.hdb +++ b/services-core/search-service/src/main/resources/templates/search/search-results.hdb @@ -14,10 +14,10 @@ -{{>edge/parts/search-header}} +{{>search/parts/search-header}}
-{{>edge/parts/search-form}} +{{>search/parts/search-form}}
@@ -42,11 +42,11 @@ {{#unless evalResult}}{{#if problems}}

Suggestions

    {{#each problems}}
  • {{{.}}}
  • {{/each}}
{{/if}}{{/unless}} - {{#each domainResults}}{{>edge/browse-result}}{{/each}} - {{#each results}}{{>edge/search-result}}{{/each}} + {{#each domainResults}}{{>search/browse-result}}{{/each}} + {{#each results}}{{>search/search-result}}{{/each}}
-{{>edge/parts/search-footer}} +{{>search/parts/search-footer}} diff --git a/marginalia_nu/src/main/resources/templates/edge/site-info.hdb b/services-core/search-service/src/main/resources/templates/search/site-info.hdb similarity index 74% rename from marginalia_nu/src/main/resources/templates/edge/site-info.hdb rename to services-core/search-service/src/main/resources/templates/search/site-info.hdb index 6563844c..4e2ac7e2 100644 --- a/marginalia_nu/src/main/resources/templates/edge/site-info.hdb +++ b/services-core/search-service/src/main/resources/templates/search/site-info.hdb @@ -11,10 +11,10 @@ -{{>edge/parts/search-header}} +{{>search/parts/search-header}}
- {{>edge/parts/search-form}} + {{>search/parts/search-form}}
@@ -24,14 +24,14 @@
- {{>edge/parts/site-info-index}} - {{>edge/parts/site-info-links}} + {{>search/parts/site-info-index}} + {{>search/parts/site-info-links}} - {{#each results}}{{>edge/search-result}}{{/each}} + {{#each results}}{{>search/search-result}}{{/each}}
-{{>edge/parts/search-footer}} +{{>search/parts/search-footer}} diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/search/command/commands/BangCommandTest.java b/services-core/search-service/src/test/java/nu/marginalia/search/command/commands/BangCommandTest.java similarity index 90% rename from marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/search/command/commands/BangCommandTest.java rename to services-core/search-service/src/test/java/nu/marginalia/search/command/commands/BangCommandTest.java index b2b4c90b..a5de2eb2 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/search/command/commands/BangCommandTest.java +++ b/services-core/search-service/src/test/java/nu/marginalia/search/command/commands/BangCommandTest.java @@ -1,6 +1,6 @@ -package nu.marginalia.wmsa.edge.search.command.commands; +package nu.marginalia.search.command.commands; -import nu.marginalia.wmsa.edge.search.exceptions.RedirectException; +import nu.marginalia.search.exceptions.RedirectException; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Test; diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/search/query/QueryFactoryTest.java b/services-core/search-service/src/test/java/nu/marginalia/search/query/QueryFactoryTest.java similarity index 82% rename from marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/search/query/QueryFactoryTest.java rename to services-core/search-service/src/test/java/nu/marginalia/search/query/QueryFactoryTest.java index 987c258d..48842b9a 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/search/query/QueryFactoryTest.java +++ b/services-core/search-service/src/test/java/nu/marginalia/search/query/QueryFactoryTest.java @@ -1,14 +1,15 @@ -package nu.marginalia.wmsa.edge.search.query; +package nu.marginalia.search.query; -import nu.marginalia.wmsa.configuration.WmsaHome; -import nu.marginalia.wmsa.edge.assistant.dict.NGramBloomFilter; -import 
nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict; -import nu.marginalia.wmsa.edge.model.search.EdgeSearchSpecification; -import nu.marginalia.wmsa.edge.model.search.domain.SpecificationLimitType; -import nu.marginalia.wmsa.edge.search.command.SearchJsParameter; -import nu.marginalia.wmsa.edge.search.model.EdgeSearchProfile; -import nu.marginalia.wmsa.edge.search.query.model.EdgeUserSearchParameters; -import nu.marginalia.wmsa.edge.search.valuation.SearchResultValuator; +import nu.marginalia.WmsaHome; +import nu.marginalia.index.query.limit.SpecificationLimitType; +import nu.marginalia.language.statistics.EnglishDictionary; +import nu.marginalia.index.client.model.query.EdgeSearchSpecification; +import nu.marginalia.language.statistics.NGramBloomFilter; +import nu.marginalia.language.statistics.TermFrequencyDict; +import nu.marginalia.search.command.SearchJsParameter; +import nu.marginalia.search.model.SearchProfile; +import nu.marginalia.search.query.model.UserSearchParameters; +import nu.marginalia.search.valuation.SearchResultValuator; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.Test; @@ -38,7 +39,7 @@ public class QueryFactoryTest { public EdgeSearchSpecification parseAndGetSpecs(String query) { return queryFactory.createQuery( - new EdgeUserSearchParameters(query, EdgeSearchProfile.CORPO, SearchJsParameter.DEFAULT) + new UserSearchParameters(query, SearchProfile.CORPO, SearchJsParameter.DEFAULT) ).specs; } diff --git a/services-core/search-service/src/test/java/nu/marginalia/util/TestLanguageModels.java b/services-core/search-service/src/test/java/nu/marginalia/util/TestLanguageModels.java new file mode 100644 index 00000000..81df1ed9 --- /dev/null +++ b/services-core/search-service/src/test/java/nu/marginalia/util/TestLanguageModels.java @@ -0,0 +1,37 @@ +package nu.marginalia.util; + +import nu.marginalia.LanguageModels; +import nu.marginalia.WmsaHome; + +import java.nio.file.Files; +import java.nio.file.Path; +import 
java.util.Optional; + +public class TestLanguageModels { + private static final Path LANGUAGE_MODELS_DEFAULT = WmsaHome.getHomePath().resolve("model"); + + public static Path getLanguageModelsPath() { + final Path languageModelsHome = Optional.ofNullable(System.getenv("LANGUAGE_MODELS_HOME")) + .map(Path::of) + .orElse(LANGUAGE_MODELS_DEFAULT); + + if (!Files.isDirectory(languageModelsHome)) { + throw new IllegalStateException("Could not find $LANGUAGE_MODELS_HOME, see doc/language-models.md"); + } + return languageModelsHome; + } + + public static LanguageModels getLanguageModels() { + + var languageModelsHome = getLanguageModelsPath(); + + return new LanguageModels( + languageModelsHome.resolve("ngrams.bin"), + languageModelsHome.resolve("tfreq-new-algo3.bin"), + languageModelsHome.resolve("opennlp-sentence.bin"), + languageModelsHome.resolve("English.RDR"), + languageModelsHome.resolve("English.DICT"), + languageModelsHome.resolve("opennlp-tokens.bin") + ); + } +} diff --git a/services-satellite/api-service/build.gradle b/services-satellite/api-service/build.gradle new file mode 100644 index 00000000..afcb5e7c --- /dev/null +++ b/services-satellite/api-service/build.gradle @@ -0,0 +1,63 @@ +plugins { + id 'java' + id "io.freefair.lombok" version "5.3.3.3" + id 'application' + id 'com.palantir.docker' version '0.34.0' + id 'jvm-test-suite' +} + +java { + toolchain { + languageVersion.set(JavaLanguageVersion.of(17)) + } +} + + +application { + mainClass = 'nu.marginalia.api.ApiMain' + applicationName = 'api-service' +} + +apply from: "$rootProject.projectDir/docker-service.gradle" + +dependencies { + implementation project(':third-party') + implementation project(':protocol') + implementation project(':common:model') + implementation project(':common:service') + implementation project(':common:config') + implementation project(':common:service-discovery') + implementation project(':common:service-client') + implementation project(':api:search-api') + + + 
implementation libs.lombok + annotationProcessor libs.lombok + implementation libs.bundles.slf4j + + implementation libs.prometheus + implementation libs.notnull + implementation libs.guice + implementation libs.rxjava + implementation libs.spark + implementation libs.opencsv + implementation libs.trove + implementation libs.fastutil + implementation libs.bundles.gson + implementation libs.bundles.mariadb + + testImplementation libs.bundles.slf4j.test + testImplementation libs.bundles.junit + testImplementation libs.mockito +} + +test { + useJUnitPlatform() +} + +task fastTests(type: Test) { + useJUnitPlatform { + excludeTags "slow" + } +} + diff --git a/services-satellite/api-service/src/main/java/nu/marginalia/api/ApiMain.java b/services-satellite/api-service/src/main/java/nu/marginalia/api/ApiMain.java new file mode 100644 index 00000000..5d9f80c1 --- /dev/null +++ b/services-satellite/api-service/src/main/java/nu/marginalia/api/ApiMain.java @@ -0,0 +1,28 @@ +package nu.marginalia.api; + +import com.google.inject.Guice; +import com.google.inject.Inject; +import com.google.inject.Injector; +import nu.marginalia.service.MainClass; +import nu.marginalia.service.SearchServiceDescriptors; +import nu.marginalia.service.id.ServiceId; +import nu.marginalia.service.module.ConfigurationModule; +import nu.marginalia.service.module.DatabaseModule; +import nu.marginalia.service.server.Initialization; + +public class ApiMain extends MainClass { + + @Inject + public ApiMain(ApiService service) { + } + + public static void main(String... 
args) { + init(ServiceId.Api, args); + + Injector injector = Guice.createInjector( + new DatabaseModule(), + new ConfigurationModule(SearchServiceDescriptors.descriptors, ServiceId.Api)); + injector.getInstance(ApiMain.class); + injector.getInstance(Initialization.class).setReady(); + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/api/ApiService.java b/services-satellite/api-service/src/main/java/nu/marginalia/api/ApiService.java similarity index 89% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/api/ApiService.java rename to services-satellite/api-service/src/main/java/nu/marginalia/api/ApiService.java index 9c897a9b..2ebe6be8 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/api/ApiService.java +++ b/services-satellite/api-service/src/main/java/nu/marginalia/api/ApiService.java @@ -1,14 +1,18 @@ -package nu.marginalia.wmsa.api; +package nu.marginalia.api; import com.google.common.base.Strings; import com.google.gson.Gson; import com.google.inject.Inject; import com.google.inject.name.Named; import com.zaxxer.hikari.HikariDataSource; -import nu.marginalia.wmsa.api.model.ApiLicense; -import nu.marginalia.wmsa.client.GsonFactory; -import nu.marginalia.wmsa.configuration.server.*; -import nu.marginalia.wmsa.edge.search.client.EdgeSearchClient; +import nu.marginalia.api.model.ApiLicense; +import nu.marginalia.client.Context; +import nu.marginalia.model.gson.GsonFactory; +import nu.marginalia.search.client.EdgeSearchClient; +import nu.marginalia.service.server.Initialization; +import nu.marginalia.service.server.MetricsServer; +import nu.marginalia.service.server.RateLimiter; +import nu.marginalia.service.server.Service; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import spark.Request; @@ -17,7 +21,7 @@ import spark.Spark; import java.util.concurrent.ConcurrentHashMap; -public class ApiService extends Service { +public class ApiService extends Service { private final Logger logger = 
LoggerFactory.getLogger(getClass()); private final Gson gson = GsonFactory.get(); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/api/model/ApiLicense.java b/services-satellite/api-service/src/main/java/nu/marginalia/api/model/ApiLicense.java similarity index 89% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/api/model/ApiLicense.java rename to services-satellite/api-service/src/main/java/nu/marginalia/api/model/ApiLicense.java index 15a6ae08..32bf6691 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/api/model/ApiLicense.java +++ b/services-satellite/api-service/src/main/java/nu/marginalia/api/model/ApiLicense.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.api.model; +package nu.marginalia.api.model; import lombok.AllArgsConstructor; import lombok.EqualsAndHashCode; diff --git a/services-satellite/dating-service/build.gradle b/services-satellite/dating-service/build.gradle new file mode 100644 index 00000000..e57e15a8 --- /dev/null +++ b/services-satellite/dating-service/build.gradle @@ -0,0 +1,66 @@ +plugins { + id 'java' + id "io.freefair.lombok" version "5.3.3.3" + id 'application' + id 'com.palantir.docker' version '0.34.0' + id 'jvm-test-suite' +} + +application { + mainClass = 'nu.marginalia.dating.DatingMain' + applicationName = 'dating-service' +} + +apply from: "$rootProject.projectDir/docker-service.gradle" + +java { + toolchain { + languageVersion.set(JavaLanguageVersion.of(17)) + } +} +dependencies { + implementation project(':third-party') + implementation project(':protocol') + implementation project(':common:model') + implementation project(':common:service') + implementation project(':common:service-discovery') + implementation project(':common:service-client') + implementation project(':features:renderer') + implementation project(':features:screenshots') + implementation project(':libraries:language-processing') + implementation project(':features:random-websites') + + + implementation libs.lombok + 
annotationProcessor libs.lombok + implementation libs.bundles.slf4j + + implementation libs.prometheus + implementation libs.notnull + implementation libs.guice + implementation libs.rxjava + implementation libs.spark + implementation libs.opencsv + implementation libs.trove + implementation libs.fastutil + implementation libs.bundles.gson + implementation libs.bundles.mariadb + + testImplementation libs.bundles.slf4j.test + testImplementation libs.bundles.junit + testImplementation libs.mockito + +} + + +test { + useJUnitPlatform() +} + +task fastTests(type: Test) { + useJUnitPlatform { + excludeTags "slow" + } +} + + diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/dating/DatingMain.java b/services-satellite/dating-service/src/main/java/nu/marginalia/dating/DatingMain.java similarity index 58% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/dating/DatingMain.java rename to services-satellite/dating-service/src/main/java/nu/marginalia/dating/DatingMain.java index 83b62e7f..35608956 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/dating/DatingMain.java +++ b/services-satellite/dating-service/src/main/java/nu/marginalia/dating/DatingMain.java @@ -1,17 +1,16 @@ -package nu.marginalia.wmsa.edge.dating; +package nu.marginalia.dating; import com.google.inject.Guice; import com.google.inject.Inject; import com.google.inject.Injector; -import nu.marginalia.wmsa.configuration.MainClass; -import nu.marginalia.wmsa.configuration.ServiceDescriptor; -import nu.marginalia.wmsa.configuration.module.ConfigurationModule; -import nu.marginalia.wmsa.configuration.module.DatabaseModule; -import nu.marginalia.wmsa.configuration.server.Initialization; +import nu.marginalia.service.MainClass; +import nu.marginalia.service.SearchServiceDescriptors; +import nu.marginalia.service.id.ServiceId; +import nu.marginalia.service.module.ConfigurationModule; +import nu.marginalia.service.module.DatabaseModule; +import 
nu.marginalia.service.server.Initialization; import spark.Spark; -import java.io.IOException; - public class DatingMain extends MainClass { final DatingService service; @@ -21,13 +20,13 @@ public class DatingMain extends MainClass { } public static void main(String... args) { - init(ServiceDescriptor.DATING, args); + init(ServiceId.Dating, args); Spark.staticFileLocation("/static/dating/"); Injector injector = Guice.createInjector( new DatingModule(), - new ConfigurationModule(), + new ConfigurationModule(SearchServiceDescriptors.descriptors, ServiceId.Dating), new DatabaseModule() ); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/dating/DatingModule.java b/services-satellite/dating-service/src/main/java/nu/marginalia/dating/DatingModule.java similarity index 70% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/dating/DatingModule.java rename to services-satellite/dating-service/src/main/java/nu/marginalia/dating/DatingModule.java index b92f67ba..e861d4df 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/dating/DatingModule.java +++ b/services-satellite/dating-service/src/main/java/nu/marginalia/dating/DatingModule.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.dating; +package nu.marginalia.dating; import com.google.inject.AbstractModule; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/dating/DatingService.java b/services-satellite/dating-service/src/main/java/nu/marginalia/dating/DatingService.java similarity index 82% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/dating/DatingService.java rename to services-satellite/dating-service/src/main/java/nu/marginalia/dating/DatingService.java index 67753527..2bcafe4a 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/dating/DatingService.java +++ b/services-satellite/dating-service/src/main/java/nu/marginalia/dating/DatingService.java @@ -1,18 +1,19 @@ -package nu.marginalia.wmsa.edge.dating; +package 
nu.marginalia.dating; import com.google.inject.Inject; import com.google.inject.name.Named; import lombok.SneakyThrows; -import nu.marginalia.wmsa.configuration.server.Initialization; -import nu.marginalia.wmsa.configuration.server.MetricsServer; -import nu.marginalia.wmsa.configuration.server.Service; -import nu.marginalia.wmsa.edge.assistant.screenshot.ScreenshotService; -import nu.marginalia.wmsa.edge.dbcommon.EdgeDataStoreDao; -import nu.marginalia.wmsa.edge.dbcommon.EdgeDomainBlacklist; -import nu.marginalia.wmsa.edge.model.id.EdgeId; -import nu.marginalia.wmsa.edge.search.model.BrowseResult; -import nu.marginalia.wmsa.renderer.mustache.MustacheRenderer; -import nu.marginalia.wmsa.renderer.mustache.RendererFactory; +import nu.marginalia.browse.DbBrowseDomainsRandom; +import nu.marginalia.browse.DbBrowseDomainsSimilarCosine; +import nu.marginalia.browse.model.BrowseResult; +import nu.marginalia.model.dbcommon.EdgeDomainBlacklist; +import nu.marginalia.renderer.MustacheRenderer; +import nu.marginalia.renderer.RendererFactory; +import nu.marginalia.screenshot.ScreenshotService; +import nu.marginalia.model.id.EdgeId; +import nu.marginalia.service.server.Initialization; +import nu.marginalia.service.server.MetricsServer; +import nu.marginalia.service.server.Service; import org.jetbrains.annotations.NotNull; import spark.Request; import spark.Response; @@ -24,8 +25,9 @@ import java.util.Map; import java.util.Optional; public class DatingService extends Service { - private final EdgeDataStoreDao edgeDataStoreDao; private final EdgeDomainBlacklist blacklist; + private final DbBrowseDomainsSimilarCosine browseSimilarCosine; + private final DbBrowseDomainsRandom browseRandom; private final MustacheRenderer datingRenderer; private final ScreenshotService screenshotService; private final String SESSION_OBJECT_NAME = "so"; @@ -33,19 +35,21 @@ public class DatingService extends Service { @Inject public DatingService(@Named("service-host") String ip, @Named("service-port") 
Integer port, - EdgeDataStoreDao edgeDataStoreDao, RendererFactory rendererFactory, Initialization initialization, MetricsServer metricsServer, EdgeDomainBlacklist blacklist, + DbBrowseDomainsSimilarCosine browseSimilarCosine, + DbBrowseDomainsRandom browseRandom, ScreenshotService screenshotService) { super(ip, port, initialization, metricsServer); - this.edgeDataStoreDao = edgeDataStoreDao; this.blacklist = blacklist; datingRenderer = rendererFactory.renderer("dating/dating-view"); + this.browseSimilarCosine = browseSimilarCosine; + this.browseRandom = browseRandom; this.screenshotService = screenshotService; Spark.get("/public/reset", this::getReset); @@ -100,7 +104,7 @@ public class DatingService extends Service { var current = session.getCurrent(); if (current == null) { - BrowseResult res = session.next(edgeDataStoreDao, blacklist); + BrowseResult res = session.next(browseRandom, blacklist); res = findViableDomain(session, res); session.browseForward(res); current = session.getCurrent(); @@ -117,7 +121,7 @@ public class DatingService extends Service { } var session = sessionObjectOpt.get(); - BrowseResult res = session.next(edgeDataStoreDao, blacklist); + BrowseResult res = session.next(browseRandom, blacklist); res = findViableDomain(session, res); @@ -157,7 +161,7 @@ public class DatingService extends Service { var session = sessionObjectOpt.get(); int id = Integer.parseInt(request.params("id")); - BrowseResult res = session.nextSimilar(new EdgeId<>(id), edgeDataStoreDao, blacklist); + BrowseResult res = session.nextSimilar(new EdgeId<>(id), browseSimilarCosine, blacklist); res = findViableDomain(session, res); @@ -170,7 +174,7 @@ public class DatingService extends Service { @NotNull private BrowseResult findViableDomain(DatingSessionObject session, BrowseResult res) { while (!screenshotService.hasScreenshot(new EdgeId<>(res.domainId())) || session.isRecent(res)) { - res = session.next(edgeDataStoreDao, blacklist); + res = session.next(browseRandom, 
blacklist); } return res; } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/dating/DatingSessionObject.java b/services-satellite/dating-service/src/main/java/nu/marginalia/dating/DatingSessionObject.java similarity index 72% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/dating/DatingSessionObject.java rename to services-satellite/dating-service/src/main/java/nu/marginalia/dating/DatingSessionObject.java index 39eed0a6..89d0215d 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/dating/DatingSessionObject.java +++ b/services-satellite/dating-service/src/main/java/nu/marginalia/dating/DatingSessionObject.java @@ -1,10 +1,11 @@ -package nu.marginalia.wmsa.edge.dating; +package nu.marginalia.dating; -import nu.marginalia.wmsa.edge.dbcommon.EdgeDataStoreDao; -import nu.marginalia.wmsa.edge.dbcommon.EdgeDomainBlacklist; -import nu.marginalia.wmsa.edge.model.EdgeDomain; -import nu.marginalia.wmsa.edge.model.id.EdgeId; -import nu.marginalia.wmsa.edge.search.model.BrowseResult; +import nu.marginalia.browse.DbBrowseDomainsRandom; +import nu.marginalia.browse.DbBrowseDomainsSimilarCosine; +import nu.marginalia.browse.model.BrowseResult; +import nu.marginalia.model.EdgeDomain; +import nu.marginalia.model.dbcommon.EdgeDomainBlacklist; +import nu.marginalia.model.id.EdgeId; import java.util.LinkedList; @@ -21,15 +22,15 @@ public class DatingSessionObject { return current; } - public BrowseResult next(EdgeDataStoreDao dao, EdgeDomainBlacklist blacklist) { + public BrowseResult next(DbBrowseDomainsRandom random, EdgeDomainBlacklist blacklist) { if (queue.isEmpty()) { - dao.getRandomDomains(25, blacklist, 0).forEach(queue::addLast); + random.getRandomDomains(25, blacklist, 0).forEach(queue::addLast); } return queue.pollFirst(); } - public BrowseResult nextSimilar(EdgeId id, EdgeDataStoreDao dao, EdgeDomainBlacklist blacklist) { - dao.getDomainNeighborsAdjacent(id, blacklist, 25).forEach(queue::addFirst); + public BrowseResult 
nextSimilar(EdgeId id, DbBrowseDomainsSimilarCosine adjacent, EdgeDomainBlacklist blacklist) { + adjacent.getDomainNeighborsAdjacentCosine(id, blacklist, 25).forEach(queue::addFirst); while (queue.size() > MAX_QUEUE_SIZE) { queue.removeLast(); diff --git a/services-satellite/dating-service/src/main/resources/static/dating/index.html b/services-satellite/dating-service/src/main/resources/static/dating/index.html new file mode 100644 index 00000000..e69de29b diff --git a/services-satellite/dating-service/src/main/resources/static/dating/robots.txt b/services-satellite/dating-service/src/main/resources/static/dating/robots.txt new file mode 100644 index 00000000..e69de29b diff --git a/marginalia_nu/src/main/resources/templates/dating/dating-view.hdb b/services-satellite/dating-service/src/main/resources/templates/dating/dating-view.hdb similarity index 100% rename from marginalia_nu/src/main/resources/templates/dating/dating-view.hdb rename to services-satellite/dating-service/src/main/resources/templates/dating/dating-view.hdb diff --git a/services-satellite/explorer-service/build.gradle b/services-satellite/explorer-service/build.gradle new file mode 100644 index 00000000..94f8305c --- /dev/null +++ b/services-satellite/explorer-service/build.gradle @@ -0,0 +1,61 @@ +plugins { + id 'java' + id "io.freefair.lombok" version "5.3.3.3" + id 'application' + id 'com.palantir.docker' version '0.34.0' + id 'jvm-test-suite' +} + +application { + mainClass = 'nu.marginalia.explorer.ExplorerMain' + applicationName = 'explorer-service' +} + +apply from: "$rootProject.projectDir/docker-service.gradle" + +java { + toolchain { + languageVersion.set(JavaLanguageVersion.of(17)) + } +} +dependencies { + implementation project(':third-party') + implementation project(':protocol') + implementation project(':common:model') + implementation project(':common:service') + implementation project(':common:service-discovery') + implementation project(':common:service-client') + + 
implementation project(':features:renderer') + implementation project(':features:random-websites') + + + implementation libs.lombok + annotationProcessor libs.lombok + implementation libs.bundles.slf4j + + implementation libs.prometheus + implementation libs.notnull + implementation libs.guice + implementation libs.rxjava + implementation libs.spark + implementation libs.opencsv + implementation libs.trove + implementation libs.fastutil + implementation libs.bundles.gson + implementation libs.bundles.mariadb + + testImplementation libs.bundles.slf4j.test + testImplementation libs.bundles.junit + testImplementation libs.mockito +} + +test { + useJUnitPlatform() +} + +task fastTests(type: Test) { + useJUnitPlatform { + excludeTags "slow" + } +} \ No newline at end of file diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/explorer/ExplorerMain.java b/services-satellite/explorer-service/src/main/java/nu/marginalia/explorer/ExplorerMain.java similarity index 57% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/explorer/ExplorerMain.java rename to services-satellite/explorer-service/src/main/java/nu/marginalia/explorer/ExplorerMain.java index cf6a0d9b..4566fa62 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/explorer/ExplorerMain.java +++ b/services-satellite/explorer-service/src/main/java/nu/marginalia/explorer/ExplorerMain.java @@ -1,13 +1,14 @@ -package nu.marginalia.wmsa.edge.explorer; +package nu.marginalia.explorer; import com.google.inject.Guice; import com.google.inject.Inject; import com.google.inject.Injector; -import nu.marginalia.wmsa.configuration.MainClass; -import nu.marginalia.wmsa.configuration.ServiceDescriptor; -import nu.marginalia.wmsa.configuration.module.ConfigurationModule; -import nu.marginalia.wmsa.configuration.module.DatabaseModule; -import nu.marginalia.wmsa.configuration.server.Initialization; +import nu.marginalia.service.MainClass; +import nu.marginalia.service.SearchServiceDescriptors; 
+import nu.marginalia.service.id.ServiceId; +import nu.marginalia.service.module.ConfigurationModule; +import nu.marginalia.service.module.DatabaseModule; +import nu.marginalia.service.server.Initialization; import spark.Spark; public class ExplorerMain extends MainClass { @@ -19,12 +20,12 @@ public class ExplorerMain extends MainClass { } public static void main(String... args) { - init(ServiceDescriptor.EXPLORER, args); + init(ServiceId.Explorer, args); Spark.staticFileLocation("/static/explore/"); Injector injector = Guice.createInjector( - new ConfigurationModule(), + new ConfigurationModule(SearchServiceDescriptors.descriptors, ServiceId.Explorer), new DatabaseModule() ); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/explorer/ExplorerService.java b/services-satellite/explorer-service/src/main/java/nu/marginalia/explorer/ExplorerService.java similarity index 95% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/explorer/ExplorerService.java rename to services-satellite/explorer-service/src/main/java/nu/marginalia/explorer/ExplorerService.java index 38ec7b6b..8f967bb1 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/explorer/ExplorerService.java +++ b/services-satellite/explorer-service/src/main/java/nu/marginalia/explorer/ExplorerService.java @@ -1,15 +1,15 @@ -package nu.marginalia.wmsa.edge.explorer; +package nu.marginalia.explorer; import com.google.inject.Inject; import com.google.inject.name.Named; import com.zaxxer.hikari.HikariDataSource; import lombok.SneakyThrows; -import nu.marginalia.wmsa.configuration.server.Initialization; -import nu.marginalia.wmsa.configuration.server.MetricsServer; -import nu.marginalia.wmsa.configuration.server.Service; -import nu.marginalia.wmsa.renderer.mustache.MustacheRenderer; -import nu.marginalia.wmsa.renderer.mustache.RendererFactory; -import nu.marginalia.wmsa.resource_store.StaticResources; +import nu.marginalia.renderer.MustacheRenderer; +import 
nu.marginalia.renderer.RendererFactory; +import nu.marginalia.service.server.Initialization; +import nu.marginalia.service.server.MetricsServer; +import nu.marginalia.service.server.Service; +import nu.marginalia.service.server.StaticResources; import org.jetbrains.annotations.NotNull; import spark.Request; import spark.Response; @@ -54,18 +54,21 @@ public class ExplorerService extends Service { super(ip, port, initialization, metricsServer); renderer = rendererFactory.renderer("explorer/explorer"); + this.dataSource = dataSource; this.staticResources = staticResources; + Spark.get("/public/", this::serveIndex, this::render); Spark.get("/public/search", this::search, this::render); Spark.get("/public/:resource", this::serveStatic); - } private Object serveStatic(Request request, Response response) { String resource = request.params("resource"); + staticResources.serveStatic("explore", resource, request, response); + return ""; } diff --git a/services-satellite/explorer-service/src/main/resources/static/explore/style.css b/services-satellite/explorer-service/src/main/resources/static/explore/style.css new file mode 100644 index 00000000..e69de29b diff --git a/marginalia_nu/src/main/resources/templates/explorer/explorer-about.hdb b/services-satellite/explorer-service/src/main/resources/templates/explorer/explorer-about.hdb similarity index 100% rename from marginalia_nu/src/main/resources/templates/explorer/explorer-about.hdb rename to services-satellite/explorer-service/src/main/resources/templates/explorer/explorer-about.hdb diff --git a/marginalia_nu/src/main/resources/templates/explorer/explorer-messages.hdb b/services-satellite/explorer-service/src/main/resources/templates/explorer/explorer-messages.hdb similarity index 100% rename from marginalia_nu/src/main/resources/templates/explorer/explorer-messages.hdb rename to services-satellite/explorer-service/src/main/resources/templates/explorer/explorer-messages.hdb diff --git 
a/marginalia_nu/src/main/resources/templates/explorer/explorer-results.hdb b/services-satellite/explorer-service/src/main/resources/templates/explorer/explorer-results.hdb similarity index 100% rename from marginalia_nu/src/main/resources/templates/explorer/explorer-results.hdb rename to services-satellite/explorer-service/src/main/resources/templates/explorer/explorer-results.hdb diff --git a/marginalia_nu/src/main/resources/templates/explorer/explorer-search.hdb b/services-satellite/explorer-service/src/main/resources/templates/explorer/explorer-search.hdb similarity index 100% rename from marginalia_nu/src/main/resources/templates/explorer/explorer-search.hdb rename to services-satellite/explorer-service/src/main/resources/templates/explorer/explorer-search.hdb diff --git a/marginalia_nu/src/main/resources/templates/explorer/explorer.hdb b/services-satellite/explorer-service/src/main/resources/templates/explorer/explorer.hdb similarity index 100% rename from marginalia_nu/src/main/resources/templates/explorer/explorer.hdb rename to services-satellite/explorer-service/src/main/resources/templates/explorer/explorer.hdb diff --git a/services-satellite/readme.md b/services-satellite/readme.md new file mode 100644 index 00000000..f276bd5b --- /dev/null +++ b/services-satellite/readme.md @@ -0,0 +1,7 @@ +# Satellite Services + +The satellite services offer non-essential functionality. 
+ +* The [api-service](api-service/) offers a public API +* The [dating-service](dating-service/) is [explore.marginalia.nu](https://explore.marginalia.nu/) +* The [explorer-service](explorer-service/) is [explore2.marginalia.nu](https://explore2.marginalia.nu/) diff --git a/settings.gradle b/settings.gradle index 149ff1ea..8f00c50b 100644 --- a/settings.gradle +++ b/settings.gradle @@ -1,5 +1,161 @@ -rootProject.name = 'wmsa' +rootProject.name = 'marginalia.nu' -include 'marginalia_nu' -include 'third_party' -include 'protocol' \ No newline at end of file +include 'services-core:index-service' +include 'services-core:assistant-service' +include 'services-core:search-service' + +include 'services-satellite:api-service' +include 'services-satellite:dating-service' +include 'services-satellite:explorer-service' + +include 'libraries:array' +include 'libraries:btree' +include 'libraries:misc' +include 'libraries:language-processing' + +include 'features:screenshots' +include 'features:random-websites' +include 'features:domain-ranking' +include 'features:renderer' +include 'features:query-parser' + +include 'api:search-api' +include 'api:index-api' +include 'api:assistant-api' + +include 'index:lexicon' +include 'index:index-journal' +include 'index:index-query' +include 'index:index-forward' +include 'index:index-reverse' + +include 'common:service-discovery' +include 'common:service-client' +include 'common:service' +include 'common:config' +include 'common:model' + +include 'crawl:crawl-job-extractor-process' +include 'crawl:crawling-process' +include 'crawl:crawling-model' +include 'crawl:converting-process' +include 'crawl:converting-model' +include 'crawl:loading-process' +include 'crawl:common' +include 'crawl:experimental' + +include 'third-party' +include 'protocol' +include 'other:memex' +include 'other:wmsa_old' + +include 'tools:screenshot' + +dependencyResolutionManagement { + + repositories { + mavenLocal() + maven { url 
"https://artifactory.cronapp.io/public-release/" } + maven { url "https://repo1.maven.org/maven2/" } + maven { url "https://www2.ph.ed.ac.uk/maven2/" } + maven { url "https://jitpack.io/" } + exclusiveContent { + forRepository { + maven { + url = uri("https://jitpack.io") + } + } + filter { + // Only use JitPack for the `gson-record-type-adapter-factory` library + includeModule("com.github.Marcono1234", "gson-record-type-adapter-factory") + } + } + } + + versionCatalogs { + libs { + library('lombok', 'org.projectlombok', 'lombok').version('1.18.24') + library('mariadb-client', 'org.mariadb.jdbc', 'mariadb-java-client').version('3.0.6') + library('hikaricp', 'com.zaxxer:HikariCP:5.0.1') + + library('spark', 'com.sparkjava', 'spark-core').version('2.9.3') + library('guice', 'com.google.inject', 'guice').version('5.1.0') + library('guava', 'com.google.guava', 'guava').version('31.1-jre') + + library('rxjava', 'io.reactivex.rxjava3', 'rxjava').version('3.1.5') + + library('prometheus', 'io.prometheus', 'simpleclient').version('0.16.0') + library('prometheus-servlet', 'io.prometheus', 'simpleclient_servlet').version('0.16.0') + library('prometheus-server', 'io.prometheus', 'simpleclient_httpserver').version('0.16.0') + library('prometheus-hotspot', 'io.prometheus', 'simpleclient_hotspot').version('0.16.0') + + library('slf4j.api', 'org.slf4j', 'slf4j-api').version('1.7.36') + library('slf4j.jdk14', 'org.slf4j', 'slf4j-jdk14').version('2.0.3') + + library('log4j.api', 'org.apache.logging.log4j', 'log4j-api').version('2.17.2') + library('log4j.core', 'org.apache.logging.log4j', 'log4j-core').version('2.17.2') + library('log4j.slf4j', 'org.apache.logging.log4j', 'log4j-slf4j-impl').version('2.17.2') + + library('notnull','org.jetbrains','annotations').version('24.0.0') + + library('trove', 'net.sf.trove4j', 'trove4j').version('3.0.3') + library('fastutil', 'it.unimi.dsi', 'fastutil').version('8.5.8') + + library('okhttp3','com.squareup.okhttp3','okhttp').version('4.10.0') 
+ + library('httpcomponents.core','org.apache.httpcomponents','httpcore').version('4.4.15') + library('httpcomponents.client','org.apache.httpcomponents','httpclient').version('4.5.13') + library('commons.net', 'commons-net','commons-net').version('3.9.0') + library('commons.lang3', 'org.apache.commons','commons-lang3').version('3.12.0') + library('commons.compress','org.apache.commons','commons-compress').version('1.21') + library('commons.io','commons-io','commons-io').version('2.11.0') + + library('ffi','com.github.jnr','jnr-ffi').version('2.2.12') + library('databind','com.fasterxml.jackson.core','jackson-databind').version('2.13.2.1') + + library('crawlercommons', 'com.github.crawler-commons', 'crawler-commons').version('1.2') + + library('stanford.corenlp','edu.stanford.nlp','stanford-corenlp').version('4.4.0') + library('opennlp','org.apache.opennlp','opennlp-tools').version('1.9.4') + + library('roaringbitmap','org.roaringbitmap','RoaringBitmap').version('0.9.32') + library('opencsv','com.opencsv','opencsv').version('5.6') + library('bucket4j','com.github.vladimir-bukhtoyarov','bucket4j-core').version('7.5.0') + + library('protobuf','com.google.protobuf','protobuf-java').version('3.0.0') + library('gson','com.google.code.gson','gson').version('2.9.0') + library('gson-type-adapter','com.github.Marcono1234','gson-record-type-adapter-factory').version('0.2.0') + + library('zstd','com.github.luben','zstd-jni').version('1.5.2-2') + library('lz4','org.lz4','lz4-java').version('1.8.0') + + library('jsoup','org.jsoup','jsoup').version('1.15.3') + library('snakeyaml','org.yaml','snakeyaml').version('1.30') + + library('junit.jupiter','org.junit.jupiter','junit-jupiter-api').version('5.8.2') + library('junit.jupiter.engine','org.junit.jupiter','junit-jupiter-engine').version('') + library('mockito','org.mockito','mockito-junit-jupiter').version('4.5.1') + + library('selenium.chrome','org.seleniumhq.selenium','selenium-chrome-driver').version('4.5.3') + 
library('selenium.java','org.seleniumhq.selenium','selenium-java').version('4.5.3') + + library('handlebars','com.github.jknack','handlebars').version('4.3.1') + library('handlebars.markdown','com.github.jknack','handlebars-markdown').version('4.2.1') + + bundle('slf4j', ['slf4j.api', 'log4j.api', 'log4j.core', 'log4j.slf4j']) + bundle('slf4j.test', ['slf4j.jdk14']) + bundle('prometheus', ['prometheus', 'prometheus-servlet', 'prometheus-server', 'prometheus-hotspot']) + bundle('mariadb', ['mariadb-client', 'hikaricp']) + bundle('nlp', ['stanford.corenlp', 'opennlp']) + bundle('selenium', ['selenium.chrome', 'selenium.java']) + bundle('handlebars', ['handlebars', 'handlebars.markdown']) + + bundle('gson', ['gson', 'gson-type-adapter']) + bundle('httpcomponents', ['httpcomponents.core', 'httpcomponents.client']) + + bundle('junit', ['junit.jupiter', 'junit.jupiter.engine']) + } + + + } +} \ No newline at end of file diff --git a/third_party/README.md b/third-party/README.md similarity index 92% rename from third_party/README.md rename to third-party/README.md index bd5f5c85..ccca7382 100644 --- a/third_party/README.md +++ b/third-party/README.md @@ -11,5 +11,6 @@ or lack an artifact, or to override some default that is inappropriate for the t * [OpenZIM](https://github.com/openzim/libzim) - GPL-2.0 * [XZ for Java](https://tukaani.org/xz/) - Public Domain * [GSON](https://github.com/google/gson) - Apache-2.0 +* [SymSpell](https://github.com/wolfgarbe/symspell) - LGPL-3.0 * Stanford OpenNLP - Apache-2.0 * OpenJDK - GPL-2.0 (packaged under jdkoverride) \ No newline at end of file diff --git a/third-party/build.gradle b/third-party/build.gradle new file mode 100644 index 00000000..4128ac2b --- /dev/null +++ b/third-party/build.gradle @@ -0,0 +1,31 @@ +plugins { + id 'java' +} + +java { + toolchain { + languageVersion.set(JavaLanguageVersion.of(17)) + } +} + +dependencies { + implementation libs.bundles.nlp + implementation libs.zstd + implementation libs.commons.compress 
+ implementation libs.ffi + implementation libs.databind + implementation libs.bundles.gson + + implementation 'org.apache.opennlp:opennlp-tools:1.9.4' + implementation 'edu.stanford.nlp:stanford-corenlp:4.4.0' + + implementation 'com.github.luben:zstd-jni:1.5.2-2' + implementation 'org.apache.commons:commons-compress:1.21' + implementation 'com.github.jnr:jnr-ffi:2.2.12' + implementation 'com.fasterxml.jackson.core:jackson-databind:2.13.2.1' + implementation 'com.google.code.gson:gson:2.9.0' +} + +test { + useJUnitPlatform() +} diff --git a/third_party/src/main/java/ca/rmen/porterstemmer/PorterStemmer.java b/third-party/src/main/java/ca/rmen/porterstemmer/PorterStemmer.java similarity index 100% rename from third_party/src/main/java/ca/rmen/porterstemmer/PorterStemmer.java rename to third-party/src/main/java/ca/rmen/porterstemmer/PorterStemmer.java diff --git a/third_party/src/main/java/com/github/datquocnguyen/FWObject.java b/third-party/src/main/java/com/github/datquocnguyen/FWObject.java similarity index 100% rename from third_party/src/main/java/com/github/datquocnguyen/FWObject.java rename to third-party/src/main/java/com/github/datquocnguyen/FWObject.java diff --git a/third_party/src/main/java/com/github/datquocnguyen/InitialTagger.java b/third-party/src/main/java/com/github/datquocnguyen/InitialTagger.java similarity index 100% rename from third_party/src/main/java/com/github/datquocnguyen/InitialTagger.java rename to third-party/src/main/java/com/github/datquocnguyen/InitialTagger.java diff --git a/third_party/src/main/java/com/github/datquocnguyen/Node.java b/third-party/src/main/java/com/github/datquocnguyen/Node.java similarity index 100% rename from third_party/src/main/java/com/github/datquocnguyen/Node.java rename to third-party/src/main/java/com/github/datquocnguyen/Node.java diff --git a/third_party/src/main/java/com/github/datquocnguyen/RDRPOSTagger.java b/third-party/src/main/java/com/github/datquocnguyen/RDRPOSTagger.java similarity index 100% 
rename from third_party/src/main/java/com/github/datquocnguyen/RDRPOSTagger.java rename to third-party/src/main/java/com/github/datquocnguyen/RDRPOSTagger.java diff --git a/third_party/src/main/java/com/github/datquocnguyen/Utils.java b/third-party/src/main/java/com/github/datquocnguyen/Utils.java similarity index 100% rename from third_party/src/main/java/com/github/datquocnguyen/Utils.java rename to third-party/src/main/java/com/github/datquocnguyen/Utils.java diff --git a/third_party/src/main/java/com/github/datquocnguyen/WordTag.java b/third-party/src/main/java/com/github/datquocnguyen/WordTag.java similarity index 100% rename from third_party/src/main/java/com/github/datquocnguyen/WordTag.java rename to third-party/src/main/java/com/github/datquocnguyen/WordTag.java diff --git a/third_party/src/main/java/com/google/gson/stream/JsonReader.java b/third-party/src/main/java/com/google/gson/stream/JsonReader.java similarity index 100% rename from third_party/src/main/java/com/google/gson/stream/JsonReader.java rename to third-party/src/main/java/com/google/gson/stream/JsonReader.java diff --git a/third_party/src/main/java/com/upserve/uppend/blobs/NativeIO.java b/third-party/src/main/java/com/upserve/uppend/blobs/NativeIO.java similarity index 100% rename from third_party/src/main/java/com/upserve/uppend/blobs/NativeIO.java rename to third-party/src/main/java/com/upserve/uppend/blobs/NativeIO.java diff --git a/third_party/src/main/java/jdkoverride/LargeLineBufferedReader.java b/third-party/src/main/java/jdkoverride/LargeLineBufferedReader.java similarity index 100% rename from third_party/src/main/java/jdkoverride/LargeLineBufferedReader.java rename to third-party/src/main/java/jdkoverride/LargeLineBufferedReader.java diff --git a/third_party/src/main/java/opennlp/tools/sentdetect/DefaultSDContextGenerator.java b/third-party/src/main/java/opennlp/tools/sentdetect/DefaultSDContextGenerator.java similarity index 99% rename from 
third_party/src/main/java/opennlp/tools/sentdetect/DefaultSDContextGenerator.java rename to third-party/src/main/java/opennlp/tools/sentdetect/DefaultSDContextGenerator.java index 5f87a0b0..5b01869d 100644 --- a/third_party/src/main/java/opennlp/tools/sentdetect/DefaultSDContextGenerator.java +++ b/third-party/src/main/java/opennlp/tools/sentdetect/DefaultSDContextGenerator.java @@ -177,6 +177,7 @@ public class DefaultSDContextGenerator implements SDContextGenerator { * * @deprecated use {@link #collectFeatures(String, String, String, String, Character)} instead. */ + @Deprecated protected void collectFeatures(String prefix, String suffix, String previous, String next) { collectFeatures(prefix, suffix, previous, next, null); } diff --git a/third_party/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java b/third-party/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java similarity index 98% rename from third_party/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java rename to third-party/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java index ad43bbe8..32112efc 100644 --- a/third_party/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java +++ b/third-party/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java @@ -90,6 +90,7 @@ public class SentenceDetectorME implements SentenceDetector { * @deprecated Use a {@link SentenceDetectorFactory} to extend * SentenceDetector functionality. 
*/ + @Deprecated public SentenceDetectorME(SentenceModel model, Factory factory) { this.model = model.getMaxentModel(); // if the model has custom EOS characters set, use this to get the context @@ -137,14 +138,14 @@ public class SentenceDetectorME implements SentenceDetector { } private int getFirstWS(String s, int pos) { - while (pos < s.length() && !StringUtil.isWhitespace(s.charAt(pos))) - pos++; + for (; pos < s.length() && !Character.isWhitespace(s.charAt(pos)); pos++); + return pos; } private int getFirstNonWS(String s, int pos) { - while (pos < s.length() && StringUtil.isWhitespace(s.charAt(pos))) - pos++; + for (; pos < s.length() && Character.isWhitespace(s.charAt(pos)); pos++); + return pos; } @@ -297,6 +298,7 @@ public class SentenceDetectorME implements SentenceDetector { * {@link #train(String, ObjectStream, SentenceDetectorFactory, TrainingParameters)} * and pass in af {@link SentenceDetectorFactory}. */ + @Deprecated public static SentenceModel train(String languageCode, ObjectStream samples, boolean useTokenEnd, Dictionary abbreviations, TrainingParameters mlParams) throws IOException { diff --git a/third_party/src/main/java/org/openzim/ZIMTypes/ArticleEntry.java b/third-party/src/main/java/org/openzim/ZIMTypes/ArticleEntry.java similarity index 100% rename from third_party/src/main/java/org/openzim/ZIMTypes/ArticleEntry.java rename to third-party/src/main/java/org/openzim/ZIMTypes/ArticleEntry.java diff --git a/third_party/src/main/java/org/openzim/ZIMTypes/DirectoryEntry.java b/third-party/src/main/java/org/openzim/ZIMTypes/DirectoryEntry.java similarity index 100% rename from third_party/src/main/java/org/openzim/ZIMTypes/DirectoryEntry.java rename to third-party/src/main/java/org/openzim/ZIMTypes/DirectoryEntry.java diff --git a/third_party/src/main/java/org/openzim/ZIMTypes/RedirectEntry.java b/third-party/src/main/java/org/openzim/ZIMTypes/RedirectEntry.java similarity index 100% rename from 
third_party/src/main/java/org/openzim/ZIMTypes/RedirectEntry.java rename to third-party/src/main/java/org/openzim/ZIMTypes/RedirectEntry.java diff --git a/third_party/src/main/java/org/openzim/ZIMTypes/ZIMFile.java b/third-party/src/main/java/org/openzim/ZIMTypes/ZIMFile.java similarity index 100% rename from third_party/src/main/java/org/openzim/ZIMTypes/ZIMFile.java rename to third-party/src/main/java/org/openzim/ZIMTypes/ZIMFile.java diff --git a/third_party/src/main/java/org/openzim/ZIMTypes/ZIMReader.java b/third-party/src/main/java/org/openzim/ZIMTypes/ZIMReader.java similarity index 98% rename from third_party/src/main/java/org/openzim/ZIMTypes/ZIMReader.java rename to third-party/src/main/java/org/openzim/ZIMTypes/ZIMReader.java index 3e3bd58f..1f001d36 100644 --- a/third_party/src/main/java/org/openzim/ZIMTypes/ZIMReader.java +++ b/third-party/src/main/java/org/openzim/ZIMTypes/ZIMReader.java @@ -20,9 +20,6 @@ package org.openzim.ZIMTypes; import com.github.luben.zstd.RecyclingBufferPool; import com.github.luben.zstd.ZstdInputStream; -import lombok.AllArgsConstructor; -import lombok.Getter; -import org.jetbrains.annotations.NotNull; import org.openzim.util.RandomAcessFileZIMInputStream; import org.openzim.util.Utilities; import org.tukaani.xz.SingleXZInputStream; @@ -204,13 +201,17 @@ public class ZIMReader { } - @Getter @AllArgsConstructor static class DataKey implements Comparable { public final long cluster; public final long blob; + DataKey(long cluster, long blob) { + this.cluster = cluster; + this.blob = blob; + } + @Override - public int compareTo(@NotNull DataKey o) { + public int compareTo(DataKey o) { if (o.cluster != cluster) { return (int)(cluster - o.cluster); } diff --git a/third_party/src/main/java/org/openzim/util/RandomAcessFileZIMInputStream.java b/third-party/src/main/java/org/openzim/util/RandomAcessFileZIMInputStream.java similarity index 100% rename from third_party/src/main/java/org/openzim/util/RandomAcessFileZIMInputStream.java 
rename to third-party/src/main/java/org/openzim/util/RandomAcessFileZIMInputStream.java diff --git a/third_party/src/main/java/org/openzim/util/Utilities.java b/third-party/src/main/java/org/openzim/util/Utilities.java similarity index 100% rename from third_party/src/main/java/org/openzim/util/Utilities.java rename to third-party/src/main/java/org/openzim/util/Utilities.java diff --git a/third_party/src/main/java/org/tukaani/xz/BlockInputStream.java b/third-party/src/main/java/org/tukaani/xz/BlockInputStream.java similarity index 100% rename from third_party/src/main/java/org/tukaani/xz/BlockInputStream.java rename to third-party/src/main/java/org/tukaani/xz/BlockInputStream.java diff --git a/third_party/src/main/java/org/tukaani/xz/BlockOutputStream.java b/third-party/src/main/java/org/tukaani/xz/BlockOutputStream.java similarity index 100% rename from third_party/src/main/java/org/tukaani/xz/BlockOutputStream.java rename to third-party/src/main/java/org/tukaani/xz/BlockOutputStream.java diff --git a/third_party/src/main/java/org/tukaani/xz/CorruptedInputException.java b/third-party/src/main/java/org/tukaani/xz/CorruptedInputException.java similarity index 100% rename from third_party/src/main/java/org/tukaani/xz/CorruptedInputException.java rename to third-party/src/main/java/org/tukaani/xz/CorruptedInputException.java diff --git a/third_party/src/main/java/org/tukaani/xz/CountingInputStream.java b/third-party/src/main/java/org/tukaani/xz/CountingInputStream.java similarity index 100% rename from third_party/src/main/java/org/tukaani/xz/CountingInputStream.java rename to third-party/src/main/java/org/tukaani/xz/CountingInputStream.java diff --git a/third_party/src/main/java/org/tukaani/xz/CountingOutputStream.java b/third-party/src/main/java/org/tukaani/xz/CountingOutputStream.java similarity index 100% rename from third_party/src/main/java/org/tukaani/xz/CountingOutputStream.java rename to third-party/src/main/java/org/tukaani/xz/CountingOutputStream.java diff 
--git a/third_party/src/main/java/org/tukaani/xz/DeltaCoder.java b/third-party/src/main/java/org/tukaani/xz/DeltaCoder.java similarity index 100% rename from third_party/src/main/java/org/tukaani/xz/DeltaCoder.java rename to third-party/src/main/java/org/tukaani/xz/DeltaCoder.java diff --git a/third_party/src/main/java/org/tukaani/xz/DeltaDecoder.java b/third-party/src/main/java/org/tukaani/xz/DeltaDecoder.java similarity index 100% rename from third_party/src/main/java/org/tukaani/xz/DeltaDecoder.java rename to third-party/src/main/java/org/tukaani/xz/DeltaDecoder.java diff --git a/third_party/src/main/java/org/tukaani/xz/DeltaInputStream.java b/third-party/src/main/java/org/tukaani/xz/DeltaInputStream.java similarity index 100% rename from third_party/src/main/java/org/tukaani/xz/DeltaInputStream.java rename to third-party/src/main/java/org/tukaani/xz/DeltaInputStream.java diff --git a/third_party/src/main/java/org/tukaani/xz/FilterCoder.java b/third-party/src/main/java/org/tukaani/xz/FilterCoder.java similarity index 100% rename from third_party/src/main/java/org/tukaani/xz/FilterCoder.java rename to third-party/src/main/java/org/tukaani/xz/FilterCoder.java diff --git a/third_party/src/main/java/org/tukaani/xz/FilterDecoder.java b/third-party/src/main/java/org/tukaani/xz/FilterDecoder.java similarity index 100% rename from third_party/src/main/java/org/tukaani/xz/FilterDecoder.java rename to third-party/src/main/java/org/tukaani/xz/FilterDecoder.java diff --git a/third_party/src/main/java/org/tukaani/xz/FilterEncoder.java b/third-party/src/main/java/org/tukaani/xz/FilterEncoder.java similarity index 100% rename from third_party/src/main/java/org/tukaani/xz/FilterEncoder.java rename to third-party/src/main/java/org/tukaani/xz/FilterEncoder.java diff --git a/third_party/src/main/java/org/tukaani/xz/FilterOptions.java b/third-party/src/main/java/org/tukaani/xz/FilterOptions.java similarity index 100% rename from 
third_party/src/main/java/org/tukaani/xz/FilterOptions.java rename to third-party/src/main/java/org/tukaani/xz/FilterOptions.java diff --git a/third_party/src/main/java/org/tukaani/xz/FinishableOutputStream.java b/third-party/src/main/java/org/tukaani/xz/FinishableOutputStream.java similarity index 100% rename from third_party/src/main/java/org/tukaani/xz/FinishableOutputStream.java rename to third-party/src/main/java/org/tukaani/xz/FinishableOutputStream.java diff --git a/third_party/src/main/java/org/tukaani/xz/IndexIndicatorException.java b/third-party/src/main/java/org/tukaani/xz/IndexIndicatorException.java similarity index 100% rename from third_party/src/main/java/org/tukaani/xz/IndexIndicatorException.java rename to third-party/src/main/java/org/tukaani/xz/IndexIndicatorException.java diff --git a/third_party/src/main/java/org/tukaani/xz/LZMA2Coder.java b/third-party/src/main/java/org/tukaani/xz/LZMA2Coder.java similarity index 100% rename from third_party/src/main/java/org/tukaani/xz/LZMA2Coder.java rename to third-party/src/main/java/org/tukaani/xz/LZMA2Coder.java diff --git a/third_party/src/main/java/org/tukaani/xz/LZMA2Decoder.java b/third-party/src/main/java/org/tukaani/xz/LZMA2Decoder.java similarity index 100% rename from third_party/src/main/java/org/tukaani/xz/LZMA2Decoder.java rename to third-party/src/main/java/org/tukaani/xz/LZMA2Decoder.java diff --git a/third_party/src/main/java/org/tukaani/xz/LZMA2Encoder.java b/third-party/src/main/java/org/tukaani/xz/LZMA2Encoder.java similarity index 100% rename from third_party/src/main/java/org/tukaani/xz/LZMA2Encoder.java rename to third-party/src/main/java/org/tukaani/xz/LZMA2Encoder.java diff --git a/third_party/src/main/java/org/tukaani/xz/LZMA2InputStream.java b/third-party/src/main/java/org/tukaani/xz/LZMA2InputStream.java similarity index 100% rename from third_party/src/main/java/org/tukaani/xz/LZMA2InputStream.java rename to third-party/src/main/java/org/tukaani/xz/LZMA2InputStream.java diff 
--git a/third_party/src/main/java/org/tukaani/xz/LZMA2Options.java b/third-party/src/main/java/org/tukaani/xz/LZMA2Options.java similarity index 100% rename from third_party/src/main/java/org/tukaani/xz/LZMA2Options.java rename to third-party/src/main/java/org/tukaani/xz/LZMA2Options.java diff --git a/third_party/src/main/java/org/tukaani/xz/LZMA2OutputStream.java b/third-party/src/main/java/org/tukaani/xz/LZMA2OutputStream.java similarity index 100% rename from third_party/src/main/java/org/tukaani/xz/LZMA2OutputStream.java rename to third-party/src/main/java/org/tukaani/xz/LZMA2OutputStream.java diff --git a/third_party/src/main/java/org/tukaani/xz/MemoryLimitException.java b/third-party/src/main/java/org/tukaani/xz/MemoryLimitException.java similarity index 100% rename from third_party/src/main/java/org/tukaani/xz/MemoryLimitException.java rename to third-party/src/main/java/org/tukaani/xz/MemoryLimitException.java diff --git a/third_party/src/main/java/org/tukaani/xz/RawCoder.java b/third-party/src/main/java/org/tukaani/xz/RawCoder.java similarity index 100% rename from third_party/src/main/java/org/tukaani/xz/RawCoder.java rename to third-party/src/main/java/org/tukaani/xz/RawCoder.java diff --git a/third_party/src/main/java/org/tukaani/xz/SingleXZInputStream.java b/third-party/src/main/java/org/tukaani/xz/SingleXZInputStream.java similarity index 100% rename from third_party/src/main/java/org/tukaani/xz/SingleXZInputStream.java rename to third-party/src/main/java/org/tukaani/xz/SingleXZInputStream.java diff --git a/third_party/src/main/java/org/tukaani/xz/UnsupportedOptionsException.java b/third-party/src/main/java/org/tukaani/xz/UnsupportedOptionsException.java similarity index 100% rename from third_party/src/main/java/org/tukaani/xz/UnsupportedOptionsException.java rename to third-party/src/main/java/org/tukaani/xz/UnsupportedOptionsException.java diff --git a/third_party/src/main/java/org/tukaani/xz/XZ.java 
b/third-party/src/main/java/org/tukaani/xz/XZ.java similarity index 100% rename from third_party/src/main/java/org/tukaani/xz/XZ.java rename to third-party/src/main/java/org/tukaani/xz/XZ.java diff --git a/third_party/src/main/java/org/tukaani/xz/XZFormatException.java b/third-party/src/main/java/org/tukaani/xz/XZFormatException.java similarity index 100% rename from third_party/src/main/java/org/tukaani/xz/XZFormatException.java rename to third-party/src/main/java/org/tukaani/xz/XZFormatException.java diff --git a/third_party/src/main/java/org/tukaani/xz/XZIOException.java b/third-party/src/main/java/org/tukaani/xz/XZIOException.java similarity index 100% rename from third_party/src/main/java/org/tukaani/xz/XZIOException.java rename to third-party/src/main/java/org/tukaani/xz/XZIOException.java diff --git a/third_party/src/main/java/org/tukaani/xz/XZInputStream.java b/third-party/src/main/java/org/tukaani/xz/XZInputStream.java similarity index 100% rename from third_party/src/main/java/org/tukaani/xz/XZInputStream.java rename to third-party/src/main/java/org/tukaani/xz/XZInputStream.java diff --git a/third_party/src/main/java/org/tukaani/xz/XZOutputStream.java b/third-party/src/main/java/org/tukaani/xz/XZOutputStream.java similarity index 100% rename from third_party/src/main/java/org/tukaani/xz/XZOutputStream.java rename to third-party/src/main/java/org/tukaani/xz/XZOutputStream.java diff --git a/third_party/src/main/java/org/tukaani/xz/check/CRC32.java b/third-party/src/main/java/org/tukaani/xz/check/CRC32.java similarity index 100% rename from third_party/src/main/java/org/tukaani/xz/check/CRC32.java rename to third-party/src/main/java/org/tukaani/xz/check/CRC32.java diff --git a/third_party/src/main/java/org/tukaani/xz/check/CRC64.java b/third-party/src/main/java/org/tukaani/xz/check/CRC64.java similarity index 100% rename from third_party/src/main/java/org/tukaani/xz/check/CRC64.java rename to third-party/src/main/java/org/tukaani/xz/check/CRC64.java diff 
--git a/third_party/src/main/java/org/tukaani/xz/check/Check.java b/third-party/src/main/java/org/tukaani/xz/check/Check.java similarity index 100% rename from third_party/src/main/java/org/tukaani/xz/check/Check.java rename to third-party/src/main/java/org/tukaani/xz/check/Check.java diff --git a/third_party/src/main/java/org/tukaani/xz/check/None.java b/third-party/src/main/java/org/tukaani/xz/check/None.java similarity index 100% rename from third_party/src/main/java/org/tukaani/xz/check/None.java rename to third-party/src/main/java/org/tukaani/xz/check/None.java diff --git a/third_party/src/main/java/org/tukaani/xz/check/SHA256.java b/third-party/src/main/java/org/tukaani/xz/check/SHA256.java similarity index 100% rename from third_party/src/main/java/org/tukaani/xz/check/SHA256.java rename to third-party/src/main/java/org/tukaani/xz/check/SHA256.java diff --git a/third_party/src/main/java/org/tukaani/xz/common/DecoderUtil.java b/third-party/src/main/java/org/tukaani/xz/common/DecoderUtil.java similarity index 100% rename from third_party/src/main/java/org/tukaani/xz/common/DecoderUtil.java rename to third-party/src/main/java/org/tukaani/xz/common/DecoderUtil.java diff --git a/third_party/src/main/java/org/tukaani/xz/common/EncoderUtil.java b/third-party/src/main/java/org/tukaani/xz/common/EncoderUtil.java similarity index 100% rename from third_party/src/main/java/org/tukaani/xz/common/EncoderUtil.java rename to third-party/src/main/java/org/tukaani/xz/common/EncoderUtil.java diff --git a/third_party/src/main/java/org/tukaani/xz/common/StreamFlags.java b/third-party/src/main/java/org/tukaani/xz/common/StreamFlags.java similarity index 100% rename from third_party/src/main/java/org/tukaani/xz/common/StreamFlags.java rename to third-party/src/main/java/org/tukaani/xz/common/StreamFlags.java diff --git a/third_party/src/main/java/org/tukaani/xz/common/Util.java b/third-party/src/main/java/org/tukaani/xz/common/Util.java similarity index 100% rename from 
third_party/src/main/java/org/tukaani/xz/common/Util.java rename to third-party/src/main/java/org/tukaani/xz/common/Util.java diff --git a/third_party/src/main/java/org/tukaani/xz/delta/DeltaCoder.java b/third-party/src/main/java/org/tukaani/xz/delta/DeltaCoder.java similarity index 100% rename from third_party/src/main/java/org/tukaani/xz/delta/DeltaCoder.java rename to third-party/src/main/java/org/tukaani/xz/delta/DeltaCoder.java diff --git a/third_party/src/main/java/org/tukaani/xz/delta/DeltaDecoder.java b/third-party/src/main/java/org/tukaani/xz/delta/DeltaDecoder.java similarity index 100% rename from third_party/src/main/java/org/tukaani/xz/delta/DeltaDecoder.java rename to third-party/src/main/java/org/tukaani/xz/delta/DeltaDecoder.java diff --git a/third_party/src/main/java/org/tukaani/xz/index/IndexBase.java b/third-party/src/main/java/org/tukaani/xz/index/IndexBase.java similarity index 100% rename from third_party/src/main/java/org/tukaani/xz/index/IndexBase.java rename to third-party/src/main/java/org/tukaani/xz/index/IndexBase.java diff --git a/third_party/src/main/java/org/tukaani/xz/index/IndexEncoder.java b/third-party/src/main/java/org/tukaani/xz/index/IndexEncoder.java similarity index 100% rename from third_party/src/main/java/org/tukaani/xz/index/IndexEncoder.java rename to third-party/src/main/java/org/tukaani/xz/index/IndexEncoder.java diff --git a/third_party/src/main/java/org/tukaani/xz/index/IndexHash.java b/third-party/src/main/java/org/tukaani/xz/index/IndexHash.java similarity index 100% rename from third_party/src/main/java/org/tukaani/xz/index/IndexHash.java rename to third-party/src/main/java/org/tukaani/xz/index/IndexHash.java diff --git a/third_party/src/main/java/org/tukaani/xz/index/IndexRecord.java b/third-party/src/main/java/org/tukaani/xz/index/IndexRecord.java similarity index 100% rename from third_party/src/main/java/org/tukaani/xz/index/IndexRecord.java rename to 
third-party/src/main/java/org/tukaani/xz/index/IndexRecord.java diff --git a/third_party/src/main/java/org/tukaani/xz/lz/LZDecoder.java b/third-party/src/main/java/org/tukaani/xz/lz/LZDecoder.java similarity index 100% rename from third_party/src/main/java/org/tukaani/xz/lz/LZDecoder.java rename to third-party/src/main/java/org/tukaani/xz/lz/LZDecoder.java diff --git a/third_party/src/main/java/org/tukaani/xz/lzma/LZMACoder.java b/third-party/src/main/java/org/tukaani/xz/lzma/LZMACoder.java similarity index 100% rename from third_party/src/main/java/org/tukaani/xz/lzma/LZMACoder.java rename to third-party/src/main/java/org/tukaani/xz/lzma/LZMACoder.java diff --git a/third_party/src/main/java/org/tukaani/xz/lzma/LZMADecoder.java b/third-party/src/main/java/org/tukaani/xz/lzma/LZMADecoder.java similarity index 100% rename from third_party/src/main/java/org/tukaani/xz/lzma/LZMADecoder.java rename to third-party/src/main/java/org/tukaani/xz/lzma/LZMADecoder.java diff --git a/third_party/src/main/java/org/tukaani/xz/lzma/State.java b/third-party/src/main/java/org/tukaani/xz/lzma/State.java similarity index 100% rename from third_party/src/main/java/org/tukaani/xz/lzma/State.java rename to third-party/src/main/java/org/tukaani/xz/lzma/State.java diff --git a/third_party/src/main/java/org/tukaani/xz/package-info.java b/third-party/src/main/java/org/tukaani/xz/package-info.java similarity index 100% rename from third_party/src/main/java/org/tukaani/xz/package-info.java rename to third-party/src/main/java/org/tukaani/xz/package-info.java diff --git a/third_party/src/main/java/org/tukaani/xz/rangecoder/RangeCoder.java b/third-party/src/main/java/org/tukaani/xz/rangecoder/RangeCoder.java similarity index 100% rename from third_party/src/main/java/org/tukaani/xz/rangecoder/RangeCoder.java rename to third-party/src/main/java/org/tukaani/xz/rangecoder/RangeCoder.java diff --git a/third_party/src/main/java/org/tukaani/xz/rangecoder/RangeDecoder.java 
b/third-party/src/main/java/org/tukaani/xz/rangecoder/RangeDecoder.java similarity index 100% rename from third_party/src/main/java/org/tukaani/xz/rangecoder/RangeDecoder.java rename to third-party/src/main/java/org/tukaani/xz/rangecoder/RangeDecoder.java diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/dict/SymSpell.java b/third-party/src/main/java/symspell/SymSpell.java similarity index 99% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/dict/SymSpell.java rename to third-party/src/main/java/symspell/SymSpell.java index 2f98254a..45bac274 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/dict/SymSpell.java +++ b/third-party/src/main/java/symspell/SymSpell.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.assistant.dict; +package symspell; import java.io.BufferedReader; diff --git a/tools/screenshot/build.gradle b/tools/screenshot/build.gradle new file mode 100644 index 00000000..f1a2c21a --- /dev/null +++ b/tools/screenshot/build.gradle @@ -0,0 +1,46 @@ +plugins { + id 'java' + id "io.freefair.lombok" version "5.3.3.3" + + id 'jvm-test-suite' +} + +java { + toolchain { + languageVersion.set(JavaLanguageVersion.of(17)) + } +} + +dependencies { + implementation project(':third-party') + implementation project(':protocol') + implementation project(':common:model') + implementation project(':features:screenshots') + implementation project(':common:service') + + implementation libs.lombok + annotationProcessor libs.lombok + implementation libs.bundles.slf4j + implementation libs.bundles.selenium + implementation libs.bundles.mariadb + implementation libs.notnull + implementation libs.commons.compress + implementation libs.commons.io + implementation libs.guice + + testImplementation libs.bundles.slf4j.test + testImplementation libs.bundles.junit + testImplementation libs.mockito + +} + + +test { + useJUnitPlatform() +} + +task fastTests(type: Test) { + useJUnitPlatform { + excludeTags 
"slow" + } +} \ No newline at end of file diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/ScreenshotCaptureToolMain.java b/tools/screenshot/src/main/java/nu/marginalia/screenshot/ScreenshotCaptureToolMain.java similarity index 97% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/ScreenshotCaptureToolMain.java rename to tools/screenshot/src/main/java/nu/marginalia/screenshot/ScreenshotCaptureToolMain.java index 8ddd1b9d..26b83336 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/ScreenshotCaptureToolMain.java +++ b/tools/screenshot/src/main/java/nu/marginalia/screenshot/ScreenshotCaptureToolMain.java @@ -1,8 +1,8 @@ -package nu.marginalia.wmsa.edge.tools; +package nu.marginalia.screenshot.tool; import com.zaxxer.hikari.HikariDataSource; -import nu.marginalia.wmsa.configuration.module.DatabaseModule; -import nu.marginalia.wmsa.edge.model.EdgeDomain; +import nu.marginalia.model.EdgeDomain; +import nu.marginalia.service.module.DatabaseModule; import org.jetbrains.annotations.NotNull; import org.openqa.selenium.OutputType; import org.openqa.selenium.PageLoadStrategy; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/screenshot/ScreenshotLoaderMain.java b/tools/screenshot/src/main/java/nu/marginalia/screenshot/ScreenshotLoaderMain.java similarity index 92% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/screenshot/ScreenshotLoaderMain.java rename to tools/screenshot/src/main/java/nu/marginalia/screenshot/ScreenshotLoaderMain.java index a69420f4..7ca7c320 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/screenshot/ScreenshotLoaderMain.java +++ b/tools/screenshot/src/main/java/nu/marginalia/screenshot/ScreenshotLoaderMain.java @@ -1,6 +1,7 @@ -package nu.marginalia.wmsa.edge.assistant.screenshot; +package nu.marginalia.screenshot.tool; -import nu.marginalia.wmsa.configuration.module.DatabaseModule; + +import 
nu.marginalia.service.module.DatabaseModule; import org.apache.commons.compress.archivers.tar.TarArchiveEntry; import org.apache.commons.compress.archivers.tar.TarArchiveInputStream; import org.mariadb.jdbc.Driver;