Create first E2E-test with TestContainers

parent b45f68fedd · commit cd3cae0ad5

39 changed files with 961 additions and 255 deletions
@@ -3,6 +3,7 @@ plugins {
     id "io.freefair.lombok" version "5.3.3.3"

     id "me.champeau.jmh" version "0.6.6"
+    id "de.undercouch.download" version "5.1.0"
 }

 repositories {
@@ -24,6 +25,19 @@ repositories {
     }
 }

+sourceSets {
+    e2eTest {
+        java {
+            java {
+                compileClasspath += main.output + test.output
+                runtimeClasspath += main.output + test.output
+                srcDir file('src/e2e/java')
+            }
+            resources.srcDir file('src/e2e/resources')
+        }
+    }
+}
+
 java {
     toolchain {
         languageVersion.set(JavaLanguageVersion.of(17))
@@ -33,16 +47,9 @@ java {
 dependencies {
     implementation project(':third_party')

     implementation 'junit:junit:4.13.2'
-    testImplementation 'org.junit.jupiter:junit-jupiter-api:5.8.2'
-    testRuntimeOnly 'org.junit.jupiter:junit-jupiter-engine'
-
-    implementation 'org.projectlombok:lombok:1.18.22'
-    annotationProcessor 'org.projectlombok:lombok:1.18.22'
-
-    testCompileOnly 'org.projectlombok:lombok:1.18.22'
-    testImplementation 'org.projectlombok:lombok:1.18.22'
-    testAnnotationProcessor 'org.projectlombok:lombok:1.18.22'
+    implementation 'org.projectlombok:lombok:1.18.24'
+    annotationProcessor 'org.projectlombok:lombok:1.18.24'

     implementation 'com.github.jknack:handlebars:4.3.0'
     implementation 'com.github.jknack:handlebars-markdown:4.2.1'
@@ -63,7 +70,7 @@ dependencies {

     implementation 'com.google.guava:guava:31.1-jre'
     implementation 'com.google.inject:guice:5.1.0'
-    implementation 'com.github.jnr:jnr-ffi:2.1.1'
+    implementation 'com.github.jnr:jnr-ffi:2.2.12'
     implementation 'org.apache.httpcomponents:httpcore:4.4.15'
     implementation 'org.apache.httpcomponents:httpclient:4.5.13'
     implementation 'com.github.ThatJavaNerd:JRAW:1.1.0'
@@ -74,29 +81,23 @@ dependencies {
     implementation 'org.jsoup:jsoup:1.14.3'
     implementation group: 'com.github.crawler-commons', name: 'crawler-commons', version: '1.2'

-    implementation 'org.mariadb.jdbc:mariadb-java-client:3.0.3'
+    implementation 'org.mariadb.jdbc:mariadb-java-client:3.0.4'
     implementation group: 'net.sf.trove4j', name: 'trove4j', version: '3.0.3'

     implementation 'com.zaxxer:HikariCP:5.0.1'

-    implementation 'org.apache.opennlp:opennlp-tools:1.9.4'
-    implementation 'org.apache.opennlp:opennlp-tools:1.9.3'
-    implementation 'io.prometheus:simpleclient:0.15.0'
-    implementation 'io.prometheus:simpleclient_servlet:0.15.0'
-    implementation 'io.prometheus:simpleclient_httpserver:0.15.0'
-    implementation 'io.prometheus:simpleclient_hotspot:0.15.0'
-    implementation 'com.fasterxml.jackson.core:jackson-databind:2.13.2.1'
+    implementation 'org.apache.opennlp:opennlp-tools:1.9.4'
     implementation 'io.prometheus:simpleclient:0.15.0'
     implementation 'io.prometheus:simpleclient_servlet:0.15.0'
     implementation 'io.prometheus:simpleclient_httpserver:0.15.0'
     implementation 'io.prometheus:simpleclient_hotspot:0.15.0'
-    implementation 'com.fasterxml.jackson.core:jackson-databind:2.13.2.1'
+    implementation 'com.fasterxml.jackson.core:jackson-databind:2.13.3'

     implementation group: 'org.yaml', name: 'snakeyaml', version: '1.30'

     implementation 'com.syncthemall:boilerpipe:1.2.2'
     implementation 'com.github.luben:zstd-jni:1.5.2-2'
-    implementation 'com.github.vladimir-bukhtoyarov:bucket4j-core:7.3.0'
+    implementation 'com.github.vladimir-bukhtoyarov:bucket4j-core:7.5.0'
     implementation 'de.rototor.jeuclid:jeuclid-core:3.1.14'

     implementation 'org.imgscalr:imgscalr-lib:4.2'
@@ -111,10 +112,33 @@ dependencies {
     implementation 'edu.stanford.nlp:stanford-corenlp:4.4.0'

     implementation group: 'it.unimi.dsi', name: 'fastutil', version: '8.5.8'
-    implementation 'org.roaringbitmap:RoaringBitmap:[0.6,)'
+    implementation 'org.roaringbitmap:RoaringBitmap:0.9.27'
     implementation group: 'mysql', name: 'mysql-connector-java', version: '8.0.29'

     implementation 'com.github.Marcono1234:gson-record-type-adapter-factory:0.2.0'

+    testImplementation 'org.junit.jupiter:junit-jupiter-api:5.8.2'
+    testRuntimeOnly 'org.junit.jupiter:junit-jupiter-engine'
+    testCompileOnly 'org.projectlombok:lombok:1.18.24'
+    testImplementation 'org.projectlombok:lombok:1.18.24'
+    testAnnotationProcessor 'org.projectlombok:lombok:1.18.24'
+
+    e2eTestImplementation 'org.junit.jupiter:junit-jupiter-api:5.8.2'
+    e2eTestRuntimeOnly 'org.junit.jupiter:junit-jupiter-engine'
+    e2eTestImplementation 'org.projectlombok:lombok:1.18.24'
+    e2eTestAnnotationProcessor 'org.projectlombok:lombok:1.18.22'
+    e2eTestImplementation 'org.testcontainers:mariadb:1.17.1'
+    e2eTestImplementation 'org.testcontainers:nginx:1.17.1'
+    e2eTestImplementation 'org.testcontainers:testcontainers:1.17.1'
+    e2eTestImplementation "org.testcontainers:junit-jupiter:1.17.1"
+    e2eTestImplementation "org.testcontainers:selenium:1.17.1"
+    e2eTestImplementation 'org.seleniumhq.selenium:selenium-remote-driver:4.1.4'
+    e2eTestImplementation 'org.seleniumhq.selenium:selenium-chrome-driver:4.1.4'
 }

+configurations {
+    e2eTestImplementation.extendsFrom(testImplementation)
+}
+
 test {
@@ -136,4 +160,51 @@ task dbTest(type: Test) {
     }
 }

+task e2eTest(type: Test) {
+    maxParallelForks = 1
+    forkEvery = 1
+    maxHeapSize = "8G"
+    dependsOn ':shadowJar'
+    dependsOn 'downloadTestData'
+    dependsOn 'downloadRDRModelData'
+    dependsOn 'downloadSentenceModelData'
+    dependsOn 'downloadTokenModelData'
+    dependsOn 'downloadTermFreqData'
+
+    classpath = sourceSets.e2eTest.runtimeClasspath
+    testClassesDirs = sourceSets.e2eTest.output.classesDirs
+    useJUnitPlatform {
+        includeTags "e2e"
+    }
+}
+
+task downloadTestData(type: Download) {
+    src 'http://hammurabi.acc.umu.se/mirror/kiwix.org/zim/wikipedia/wikipedia_en_100_nopic_2022-05.zim'
+    dest file('data/test/wikipedia_en_100_nopic.zim')
+    overwrite false
+}
+
+task downloadRDRModelData(type: Download) {
+    src (['https://raw.githubusercontent.com/datquocnguyen/RDRPOSTagger/master/Models/POS/English.DICT',
+          'https://raw.githubusercontent.com/datquocnguyen/RDRPOSTagger/master/Models/POS/English.RDR'])
+    dest file('data/models/')
+    overwrite false
+}
+
+task downloadSentenceModelData(type: Download) {
+    src 'https://dlcdn.apache.org/opennlp/models/ud-models-1.0/opennlp-en-ud-ewt-sentence-1.0-1.9.3.bin'
+    dest file('data/models/opennlp-sentence.bin')
+    overwrite false
+}
+
+task downloadTokenModelData(type: Download) {
+    src 'https://dlcdn.apache.org/opennlp/models/ud-models-1.0/opennlp-en-ud-ewt-tokens-1.0-1.9.3.bin'
+    dest file('data/models/opennlp-tokens.bin')
+    overwrite false
+}
+
+task downloadTermFreqData(type: Copy) {
+    // TODO: Need hosting for this file
+    from '/var/lib/wmsa/model/tfreq-new-algo3.bin'
+    into 'data/models'
+}
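Note: the e2eTest task above selects tests by JUnit 5 tag filtering, so a class only runs under this task (presumably invoked as `./gradlew e2eTest`) if it carries the matching tag. A minimal sketch of what the task will pick up (hypothetical class name, for illustration only, not part of this commit):

```java
import org.junit.jupiter.api.Tag;
import org.junit.jupiter.api.Test;

@Tag("e2e")              // matches includeTags "e2e" in the e2eTest task above
class ExampleE2ETest {   // hypothetical name, not in this commit
    @Test
    void smoke() { /* ... */ }
}
```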
marginalia_nu/data/.gitignore (new file, vendored)
@@ -0,0 +1 @@
*

marginalia_nu/data/models/.gitignore (new file, vendored)
@@ -0,0 +1 @@
*

marginalia_nu/data/test/.gitignore (new file, vendored)
@@ -0,0 +1 @@
*
@@ -0,0 +1,197 @@
package nu.marginalia.wmsa.edge;

import nu.marginalia.wmsa.configuration.ServiceDescriptor;
import nu.marginalia.wmsa.edge.crawling.CrawlJobExtractorMain;
import org.jsoup.Jsoup;
import org.junit.jupiter.api.Tag;
import org.junit.jupiter.api.Test;
import org.openqa.selenium.By;
import org.openqa.selenium.chrome.ChromeOptions;
import org.openzim.ZIMTypes.ZIMFile;
import org.openzim.ZIMTypes.ZIMReader;
import org.slf4j.LoggerFactory;
import org.testcontainers.containers.*;
import org.testcontainers.containers.output.Slf4jLogConsumer;
import org.testcontainers.containers.wait.strategy.Wait;
import org.testcontainers.junit.jupiter.Container;
import org.testcontainers.junit.jupiter.Testcontainers;
import org.testcontainers.utility.MountableFile;

import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.time.Duration;
import java.util.ArrayList;
import java.util.List;

import static nu.marginalia.wmsa.configuration.ServiceDescriptor.*;
import static org.testcontainers.containers.BrowserWebDriverContainer.VncRecordingMode.RECORD_ALL;

@Tag("e2e")
@Testcontainers
public class EdgeSearchE2ETest {
    Network network = Network.newNetwork();

    @Container
    public GenericContainer<?> mariaDB = new MariaDBContainer<>("mariadb")
            .withDatabaseName("WMSA_prod")
            .withUsername("wmsa")
            .withPassword("wmsa")
            .withInitScript("sql/edge-crawler-cache.sql")
            .withNetwork(network)
            .withNetworkAliases("mariadb");

    @Container
    public GenericContainer<?> searchContainer = forService(EDGE_SEARCH);
    @Container
    public GenericContainer<?> assistantContainer = forService(EDGE_ASSISTANT);
    @Container
    public GenericContainer<?> indexContainer = forService(EDGE_INDEX);

    @Container
    public NginxContainer<?> mockWikipedia = new NginxContainer<>("nginx:stable")
            .dependsOn(searchContainer)
            .withLogConsumer(new Slf4jLogConsumer(LoggerFactory.getLogger("wikipedia")))
            .withFileSystemBind(getWikipediaFiles(), "/usr/share/nginx/html/", BindMode.READ_ONLY)
            .withNetwork(network)
            .withNetworkAliases("wikipedia");

    @Container
    public BrowserWebDriverContainer<?> chrome = new BrowserWebDriverContainer<>()
            .withNetwork(network)
            .withCapabilities(new ChromeOptions());

    @Container
    public GenericContainer<?> crawlerContainer = new GenericContainer<>("openjdk:17-alpine")
            .dependsOn(mockWikipedia)
            .dependsOn(indexContainer)
            .withNetwork(network)
            .withLogConsumer(new Slf4jLogConsumer(LoggerFactory.getLogger("crawler")))
            .withFileSystemBind(modelsPath(), "/var/lib/wmsa/model", BindMode.READ_ONLY)
            .withCopyFileToContainer(jarFile(), "/WMSA.jar")
            .withCopyFileToContainer(MountableFile.forClasspathResource("crawl.sh"), "/crawl.sh")
            .withFileSystemBind(getCrawlPath().toString(), "/crawl/", BindMode.READ_WRITE)
            .withCommand("sh", "crawl.sh")
            .waitingFor(Wait.forLogMessage(".*ALL DONE.*", 1).withStartupTimeout(Duration.ofMinutes(10)));

    @Container
    public NginxContainer<?> proxyNginx = new NginxContainer<>("nginx:stable")
            .dependsOn(searchContainer)
            .dependsOn(crawlerContainer)
            .withLogConsumer(new Slf4jLogConsumer(LoggerFactory.getLogger("nginx")))
            .withCopyFileToContainer(MountableFile.forClasspathResource("nginx/search.conf"), "/etc/nginx/conf.d/default.conf")
            .withNetwork(network)
            .withNetworkAliases("proxyNginx");

    public GenericContainer<?> forService(ServiceDescriptor service) {
        return new GenericContainer<>("openjdk:17-alpine")
                .dependsOn(mariaDB)
                .withCopyFileToContainer(jarFile(), "/WMSA.jar")
                .withCopyFileToContainer(MountableFile.forClasspathResource("init.sh"), "/init.sh")
                .withExposedPorts(service.port)
                .withFileSystemBind(modelsPath(), "/var/lib/wmsa/model", BindMode.READ_ONLY)
                .withNetwork(network)
                .withNetworkAliases(service.name)
                .withLogConsumer(new Slf4jLogConsumer(LoggerFactory.getLogger(service.name)))
                .withCommand("sh", "init.sh", service.name)
                .waitingFor(Wait.forHttp("/internal/ping")
                        .forPort(service.port)
                        .withReadTimeout(Duration.ofSeconds(15)))
                ;
    }

    public static MountableFile jarFile() {
        Path cwd = Path.of(System.getProperty("user.dir"));

        cwd = cwd.resolve("..");
        var jarFile = cwd.resolve("build/libs/wmsa-SNAPSHOT-all.jar");
        if (!Files.exists(jarFile)) {
            System.err.println("Could not find jarFile " + jarFile);
            throw new RuntimeException();
        }
        else {
            System.out.println("jar file = " + jarFile);
        }
        return MountableFile.forHostPath(jarFile);
    }

    public static String modelsPath() {
        Path modelsPath = Path.of(System.getProperty("user.dir")).resolve("data/models");
        if (!Files.isDirectory(modelsPath)) {
            System.err.println("Could not find models, looked in " + modelsPath.toAbsolutePath());
            throw new RuntimeException();
        }
        return modelsPath.toString();
    }

    private Path getCrawlPath() {
        return Path.of(System.getProperty("user.dir")).resolve("build/tmp/crawl");
    }

    private String getWikipediaFiles() {
        Path wikipediaFiles = Path.of(System.getProperty("user.dir")).resolve("build/tmp/wikipedia");
        Path crawlFiles = getCrawlPath();
        Path zimFile = Path.of(System.getProperty("user.dir")).resolve("data/test/wikipedia_en_100_nopic.zim");

        List<String> urls = new ArrayList<>();
        try {
            Files.deleteIfExists(wikipediaFiles);
            Files.createDirectories(wikipediaFiles);
            Files.createDirectories(crawlFiles);

            Files.writeString(crawlFiles.resolve("crawl.plan"), """
                    jobSpec: "/crawl/crawl.spec"
                    crawl:
                      dir: "/crawl/crawl"
                      logName: "crawl.log"
                    process:
                      dir: "/crawl/process"
                      logName: "process.log"
                    """);

            Files.createDirectories(crawlFiles.resolve("crawl"));
            Files.createDirectories(crawlFiles.resolve("process"));
            Files.deleteIfExists(crawlFiles.resolve("process").resolve("process.log"));
            Files.deleteIfExists(crawlFiles.resolve("crawl").resolve("crawl.log"));

            var zr = new ZIMReader(new ZIMFile(zimFile.toString()));
            zr.forEachArticles((url, art) -> {
                urls.add("http://wikipedia/" + url + ".html");

                if (art != null) {
                    try {
                        var doc = Jsoup.parse(art);
                        doc.getElementsByTag("script").remove();
                        Files.writeString(wikipediaFiles.resolve(url+".html"), doc.html());
                    } catch (IOException e) {
                        throw new RuntimeException(e);
                    }
                }
            }, pred -> true);
            urls.forEach(System.out::println);
            Files.writeString(wikipediaFiles.resolve("index.html"), "<html/>");
            CrawlJobExtractorMain.writeSpec(crawlFiles.resolve("crawl.spec"), "wikipedia", urls);
        }
        catch (IOException ex) {
            ex.printStackTrace();
        }
        return wikipediaFiles.toString();
    }

    @Test
    public void run() {
        var driver = chrome.getWebDriver();

        driver.get("http://proxyNginx/");
        System.out.println(driver.getTitle());
        System.out.println(driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML"));

        driver.get("http://proxyNginx/search?query=bird&profile=corpo");
        System.out.println(driver.getTitle());
        System.out.println(driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML"));
    }
}
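Note on lifecycle: the @Container fields in this class are instance fields, so the Testcontainers JUnit extension provisions the whole container fleet once per test method. With the single run() test that is equivalent to once per class, but if more tests are added, static fields would avoid re-provisioning. A sketch of the static variant (an assumption about future use, not what this commit does):

```java
// Hypothetical variant: a static @Container is started once per test class
// instead of once per test method.
@Container
private static final MariaDBContainer<?> MARIADB = new MariaDBContainer<>("mariadb")
        .withDatabaseName("WMSA_prod")
        .withUsername("wmsa")
        .withPassword("wmsa");
```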
marginalia_nu/src/e2e/resources/crawl.sh (new file)
@@ -0,0 +1,78 @@
#!/bin/bash
mkdir -p /var/lib/wmsa/conf/

cat > /var/lib/wmsa/db.properties <<EOF
db.user=wmsa
db.pass=wmsa
db.conn=jdbc:mariadb://mariadb:3306/WMSA_prod?rewriteBatchedStatements=true
EOF

cat > /var/lib/wmsa/conf/hosts <<EOF
# service-name host-name
resource-store resource-store
data-store data-store
renderer renderer
auth auth
api api
smhi-scraper smhi-scraper
podcast-scraper podcast-scraper
edge-crawler edge-crawler
edge-index edge-index
edge-director edge-director
edge-search edge-search
edge-archive edge-archive
edge-assistant edge-assistant
memex memex
dating dating
EOF


cat crawl/crawl.plan
cat << EOF
[figlet-style "CRAWL" banner; ASCII-art alignment lost in extraction]
EOF
java -DdefaultCrawlDelay=1 -cp WMSA.jar nu.marginalia.wmsa.edge.crawling.CrawlerMain crawl/crawl.plan

cat <<EOF
[figlet-style "CONVERT" banner; ASCII-art alignment lost in extraction]
EOF
java -cp WMSA.jar nu.marginalia.wmsa.edge.converting.ConverterMain crawl/crawl.plan

cat <<EOF
[figlet-style "LOAD" banner; ASCII-art alignment lost in extraction]
EOF
java -Dkeyword-index=0 -cp WMSA.jar nu.marginalia.wmsa.edge.converting.LoaderMain crawl/crawl.plan

chmod -R 777 crawl/

cat <<EOF
[figlet-style "TRIGGER" banner; ASCII-art alignment lost in extraction]
EOF
java -cp WMSA.jar nu.marginalia.wmsa.edge.converting.ReindexTriggerMain edge-index

echo "ALL DONE"
marginalia_nu/src/e2e/resources/init.sh (new file)
@@ -0,0 +1,61 @@
#!/bin/bash

mkdir -p /var/lib/wmsa/conf
mkdir -p /var/lib/wmsa/index/write
mkdir -p /var/lib/wmsa/index/read
mkdir -p /backup/work/index-tmp

mkdir -p /var/log/wmsa
cat > /var/lib/wmsa/suggestions.txt <<EOF
state
three
while
used
university
can
united
under
known
season
many
year
EOF

cat > /var/lib/wmsa/db.properties <<EOF
db.user=wmsa
db.pass=wmsa
db.conn=jdbc:mariadb://mariadb:3306/WMSA_prod?rewriteBatchedStatements=true
EOF

cat > /var/lib/wmsa/conf/ranking-settings.yaml <<EOF
---
retro:
- "%"
small:
- "%"
academia:
- "%edu"
standard:
- "%"
EOF

cat > /var/lib/wmsa/conf/hosts <<EOF
# service-name host-name
resource-store resource-store
data-store data-store
renderer renderer
auth auth
api api
smhi-scraper smhi-scraper
podcast-scraper podcast-scraper
edge-crawler edge-crawler
edge-index edge-index
edge-director edge-director
edge-search edge-search
edge-archive edge-archive
edge-assistant edge-assistant
memex memex
dating dating
EOF

java -Dsmall-ram=TRUE -Dservice-host=0.0.0.0 -jar /WMSA.jar start $1
marginalia_nu/src/e2e/resources/log4j2.properties (new file)
@@ -0,0 +1,15 @@
status = info

appender.console.type = Console
appender.console.name = LogToConsole
appender.console.layout.type = PatternLayout
appender.console.layout.pattern = %highlight{%-5level}{FATAL=red, ERROR=red, WARN=yellow} %c{1}- %msg%n

logger.console.name = nu.marginalia
logger.console.level = debug
logger.console.additivity = false
logger.console.appenderRef.rolling.ref = LogToConsole

rootLogger.level = info
rootLogger.appenderRef.console.ref = LogToConsole
marginalia_nu/src/e2e/resources/nginx/search.conf (new file)
@@ -0,0 +1,25 @@
server {
    listen 80;
    listen [::]:80;
    server_name nginx;

    location /search {
        if ( $request_method = POST ) {
            return 444;
        }

        proxy_set_header X-Context $remote_addr-$connection;
        proxy_set_header X-Public "1";
        proxy_set_header X-Extern-Url $scheme://$host$request_uri;
        proxy_set_header X-Extern-Domain $scheme://$host;
        proxy_set_header X-User-Agent $http_user_agent;

        proxy_pass http://edge-search:5023/public/search;
        tcp_nodelay on;
    }

    location / {
        proxy_pass http://edge-search:5023/;
        tcp_nodelay on;
    }
}
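This configuration is what ties the test together: the Chrome container drives http://proxyNginx/, and nginx forwards both / and /search to the edge-search service on port 5023, so the driver.get(...) calls in EdgeSearchE2ETest above exercise the full proxy-to-service path.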
@@ -49,7 +49,7 @@ public abstract class AbstractClient implements AutoCloseable {
     private final Thread livenessMonitor;

     public AbstractClient(String host, int port, int timeout) {
-        logger.info("Creating client for {}", getClass().getSimpleName());
+        logger.info("Creating client for {}[{}:{}]", getClass().getSimpleName(), host, port);

         this.timeout = timeout;
         client = new OkHttpClient.Builder()
@@ -15,7 +15,7 @@ public class AbstractDynamicClient extends AbstractClient {
     private final AbortingScheduler scheduler;

     public AbstractDynamicClient(@Nonnull ServiceDescriptor service) {
-        super("localhost", service.port, 10);
+        super(service.getHost(), service.port, 10);

         this.service = service;
         this.scheduler = new AbortingScheduler(name());
@@ -0,0 +1,45 @@
package nu.marginalia.wmsa.configuration;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.HashMap;
import java.util.Map;

/** Mappings file between ServiceDescriptor.name and host
 *
 * */
public class HostsFile {
    private final Map<ServiceDescriptor, String> hostsMap = new HashMap<>(ServiceDescriptor.values().length);

    public HostsFile(Path fileName) throws IOException {
        var lines = Files.readAllLines(fileName);
        for (var line : lines) {
            if (line.startsWith("#") || line.isBlank()) {
                continue;
            }
            String[] parts = line.strip().split(" ");
            if (parts.length != 2) throw new IllegalArgumentException("Invalid hosts file entry " + line);
            String descriptorName = parts[0];
            String hostName = parts[1];

            try {
                hostsMap.put(ServiceDescriptor.byName(descriptorName), hostName);
            }
            catch (IllegalArgumentException ex) {
                throw new IllegalArgumentException("ServiceDescriptor " + descriptorName + " invalid");
            }
        }
    }

    public HostsFile() {
        for (var sd : ServiceDescriptor.values()) {
            hostsMap.put(sd, "localhost");
        }
    }

    public String getHost(ServiceDescriptor sd) {
        return hostsMap.get(sd);
    }
}
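Usage, as far as the class above shows (a minimal sketch, not code from this commit): the no-argument constructor is the fallback used when no hosts file is present, mapping every service to localhost.

```java
// Minimal usage sketch based on the HostsFile class above.
HostsFile defaults = new HostsFile();                 // every service -> "localhost"
String host = defaults.getHost(ServiceDescriptor.EDGE_INDEX);

// Or parse a concrete file in the "service-name host-name" format:
HostsFile parsed = new HostsFile(Path.of("/var/lib/wmsa/conf/hosts"));
```

Note that the parser splits on a single space, so each entry must separate the two columns with exactly one space, which is the format the conf/hosts heredocs in crawl.sh and init.sh above emit.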
@@ -21,6 +21,9 @@ import nu.marginalia.wmsa.resource_store.ResourceStoreMain;
 import nu.marginalia.wmsa.smhi.scraper.SmhiScraperMain;
 import org.apache.logging.log4j.core.lookup.MainMapLookup;

+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
 import java.util.Map;
 import java.util.stream.Collectors;
 import java.util.stream.Stream;
|
@ -49,13 +52,21 @@ public enum ServiceDescriptor {
|
|||
TEST_1("test-1", 0, null),
|
||||
TEST_2("test-2", 0, null);
|
||||
|
||||
private static HostsFile hostsFile;
|
||||
public synchronized String getHost() {
|
||||
if (hostsFile == null) {
|
||||
hostsFile = WmsaHome.getHostsFile();
|
||||
}
|
||||
return hostsFile.getHost(this);
|
||||
}
|
||||
|
||||
public static ServiceDescriptor byName(String name) {
|
||||
for (var v : values()) {
|
||||
if (v.name.equals(name)) {
|
||||
return v;
|
||||
}
|
||||
}
|
||||
throw new IllegalArgumentException(name);
|
||||
throw new IllegalArgumentException("Invalid ServiceDescriptor " + name);
|
||||
}
|
||||
public final String name;
|
||||
public final Class<?> mainClass;
|
||||
|
|
|
@@ -1,16 +1,31 @@
 package nu.marginalia.wmsa.configuration;

+import java.io.IOException;
 import java.nio.file.Files;
 import java.nio.file.Path;

 public class WmsaHome {
     private static final String DEFAULT = "/var/lib/wmsa";

-    public static Path get() {
+    public static Path getHomePath() {
         var ret = Path.of(System.getProperty("WMSA_HOME", DEFAULT));
         if (!Files.isDirectory(ret)) {
             throw new IllegalStateException("Could not find WMSA_HOME, either set environment variable or ensure " + DEFAULT + " exists");
         }
         return ret;
     }

+    public static HostsFile getHostsFile() {
+        Path hostsFile = getHomePath().resolve("conf/hosts");
+        if (Files.isRegularFile(hostsFile)) {
+            try {
+                return new HostsFile(hostsFile);
+            } catch (IOException e) {
+                throw new RuntimeException("Failed to load hosts file " + hostsFile, e);
+            }
+        }
+        else {
+            return new HostsFile();
+        }
+    }
 }
@@ -35,7 +35,7 @@ public class DatabaseModule extends AbstractModule {
     }

     private Properties loadDbProperties() {
-        Path propDir = WmsaHome.get().resolve("db.properties");
+        Path propDir = WmsaHome.getHomePath().resolve("db.properties");
         if (!Files.isRegularFile(propDir)) {
             throw new IllegalStateException("Database properties file " + propDir + " does not exist");
         }
@@ -33,7 +33,7 @@ public class LoaderMain {
     private static final Logger logger = LoggerFactory.getLogger(LoaderMain.class);
     private final LoaderFactory loaderFactory;
     private final EdgeIndexClient indexClient;
-    private final boolean running = true;
+    private volatile boolean running = true;

     final Thread processorThread = new Thread(this::processor, "Processor Thread");
@@ -82,8 +82,11 @@ public class LoaderMain {
             load(entry.path(), entry.cnt());
         });

+        running = false;
+        processorThread.join();
+        indexClient.close();
+
         System.exit(0);
     }

     private volatile static int loadTotal;
@@ -0,0 +1,81 @@
package nu.marginalia.wmsa.edge.converting;

import nu.marginalia.wmsa.configuration.ServiceDescriptor;
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
import okhttp3.MediaType;
import okhttp3.OkHttpClient;
import okhttp3.Request;
import okhttp3.RequestBody;
import okio.BufferedSink;
import org.jetbrains.annotations.Nullable;

import java.io.IOException;
import java.net.URL;
import java.nio.charset.Charset;
import java.sql.SQLException;
import java.util.concurrent.TimeUnit;

import static nu.marginalia.wmsa.edge.index.EdgeIndexService.DYNAMIC_BUCKET_LENGTH;

public class ReindexTriggerMain {

    public static void main(String... args) throws IOException, SQLException {
        var db = new DatabaseModule();
        var client = new OkHttpClient.Builder()
                .connectTimeout(100, TimeUnit.MILLISECONDS)
                .readTimeout(15, TimeUnit.MINUTES)
                .retryOnConnectionFailure(true)
                .followRedirects(true)
                .build();

        try (var ds = db.provideConnection(); var conn = ds.getConnection(); var stmt = conn.createStatement()) {
            var rs = stmt.executeQuery("SELECT ID, URL_PART, STATE, INDEXED FROM EC_DOMAIN LIMIT 100");
            while (rs.next()) {
                System.out.printf("%d %s %s %d\n",
                        rs.getInt(1),
                        rs.getString(2),
                        rs.getString(3),
                        rs.getInt(4));
            }

            rs = stmt.executeQuery("SELECT ID, DOMAIN_ID, URL, VISITED, STATE FROM EC_URL LIMIT 100");
            while (rs.next()) {
                System.out.printf("%d %d %s %d %s\n",
                        rs.getInt(1),
                        rs.getInt(2),
                        rs.getString(3),
                        rs.getInt(4),
                        rs.getString(5));
            }

            stmt.executeUpdate("INSERT IGNORE INTO DOMAIN_METADATA(ID,GOOD_URLS,KNOWN_URLS,VISITED_URLS) SELECT ID,0,0,0 FROM EC_DOMAIN WHERE INDEXED>0");
            stmt.executeUpdate("UPDATE DOMAIN_METADATA INNER JOIN (SELECT DOMAIN_ID,COUNT(*) CNT FROM EC_URL WHERE VISITED AND STATE='ok' GROUP BY DOMAIN_ID) T ON T.DOMAIN_ID=ID SET GOOD_URLS=CNT");
            stmt.executeUpdate("UPDATE DOMAIN_METADATA INNER JOIN (SELECT DOMAIN_ID,COUNT(*) CNT FROM EC_URL GROUP BY DOMAIN_ID) T ON T.DOMAIN_ID=ID SET KNOWN_URLS=CNT");
            stmt.executeUpdate("UPDATE DOMAIN_METADATA INNER JOIN (SELECT DOMAIN_ID,COUNT(*) CNT FROM EC_URL WHERE VISITED GROUP BY DOMAIN_ID) T ON T.DOMAIN_ID=ID SET VISITED_URLS=CNT");
        }

        var rb = new RequestBody() {

            @Nullable
            @Override
            public MediaType contentType() {
                return MediaType.parse("text/plain");
            }

            @Override
            public void writeTo(BufferedSink sink) throws IOException {
                sink.writeString("NOOP", Charset.defaultCharset());
            }
        };

        client.newCall(new Request.Builder().post(rb).url(new URL("http", args[0], ServiceDescriptor.EDGE_INDEX.port, "/ops/repartition")).build()).execute();
        client.newCall(new Request.Builder().post(rb).url(new URL("http", args[0], ServiceDescriptor.EDGE_INDEX.port, "/ops/preconvert")).build()).execute();
        for (int i = 0; i < DYNAMIC_BUCKET_LENGTH+1; i++) {
            client.newCall(new Request.Builder().post(rb).url(new URL("http", args[0], ServiceDescriptor.EDGE_INDEX.port, "/ops/reindex/" + i)).build()).execute();
        }
    }
}
@@ -25,6 +25,8 @@ public class IndexLoadKeywords implements Runnable {
     private final Thread runThread;
     private volatile boolean canceled = false;

+    private static final int index = Integer.getInteger("keyword-index", 1);
+
     @Inject
     public IndexLoadKeywords(EdgeIndexClient client) {
         this.client = client;
@@ -37,7 +39,7 @@ public class IndexLoadKeywords implements Runnable {
         while (!canceled) {
             var data = insertQueue.poll(1, TimeUnit.SECONDS);
             if (data != null) {
-                client.putWords(Context.internal(), new EdgeId<>(data.domainId), new EdgeId<>(data.urlId), -5., data.wordSet, 1).blockingSubscribe();
+                client.putWords(Context.internal(), new EdgeId<>(data.domainId), new EdgeId<>(data.urlId), -5., data.wordSet, index).blockingSubscribe();
             }
         }
     }
@@ -79,15 +79,21 @@ public class DocumentProcessor {
         ret.url = new EdgeUrl(crawledDocument.url);
         ret.state = crawlerStatusToUrlState(crawledDocument.crawlerStatus, crawledDocument.httpStatus);

-        if (ret.state == EdgeUrlState.OK && isAcceptedContentType(crawledDocument)) {
-            var detailsWords = createDetails(crawledDomain, crawledDocument);
+        if (ret.state == EdgeUrlState.OK) {

-            if (detailsWords.details().quality < minDocumentQuality) {
-                throw new DisqualifiedException(DisqualificationReason.QUALITY);
+            if (isAcceptedContentType(crawledDocument)) {
+                var detailsWords = createDetails(crawledDomain, crawledDocument);
+
+                if (detailsWords.details().quality < minDocumentQuality) {
+                    throw new DisqualifiedException(DisqualificationReason.QUALITY);
+                }
+
+                ret.details = detailsWords.details();
+                ret.words = detailsWords.words();
+            }
+            else {
+                throw new DisqualifiedException(DisqualificationReason.CONTENT_TYPE);
             }
-
-            ret.details = detailsWords.details();
-            ret.words = detailsWords.words();
         }
         else {
             throw new DisqualifiedException(DisqualificationReason.STATUS);

@@ -95,7 +101,7 @@ public class DocumentProcessor {
         }
         catch (DisqualifiedException ex) {
             ret.state = EdgeUrlState.DISQUALIFIED;
-            logger.debug("Disqualified {}: {}", ret.url, ex.reason);
+            logger.info("Disqualified {}: {}", ret.url, ex.reason);
         }
         catch (Exception ex) {
             ret.state = EdgeUrlState.DISQUALIFIED;
@@ -73,7 +73,7 @@ public class CrawlJobExtractorMain {
     private final EdgeDomainBlacklistImpl blacklist;

     private final Connection conn;
-    private final HashFunction hasher = Hashing.murmur3_128(0);
+    private static final HashFunction hasher = Hashing.murmur3_128(0);

     public static void main(String... args) throws SQLException, IOException {
         Driver driver = new Driver();

@@ -97,6 +97,19 @@ public class CrawlJobExtractorMain {
         }
     }

+    public static void writeSpec(Path outFile, String domain, List<String> urls) throws IOException {
+        Gson gson = new GsonBuilder().create();
+
+        try (var out = new PrintWriter(new ZstdOutputStream(new BufferedOutputStream(new FileOutputStream(outFile.toFile()))))) {
+            var job = new CrawlingSpecification();
+            job.crawlDepth = urls.size();
+            job.domain = domain;
+            job.id = createId(new EdgeDomain(domain));
+            job.urls = urls;
+            out.println(gson.toJson(job));
+        }
+    }
+
     private record DomainWithId(String domainName, int id) {}

     private Stream<CrawlingSpecification> extractDomains() {

@@ -186,11 +199,11 @@ public class CrawlJobExtractorMain {
         return spec;
     }

-    private String createId(DomainWithId domainWithId) {
+    private static String createId(DomainWithId domainWithId) {
         return hasher.hashUnencodedChars(domainWithId.domainName).toString();
     }

-    private String createId(EdgeDomain domain) {
+    private static String createId(EdgeDomain domain) {
         return hasher.hashUnencodedChars(domain.toString()).toString();
     }
@@ -79,6 +79,9 @@ public class CrawlerMain implements AutoCloseable {
         try (var crawler = new CrawlerMain(plan)) {
             crawler.run();
         }
+
+        // TODO (2022-05-24): Some thread isn't set to daemon mode, need to explicitly harakiri the process, find why?
+        System.exit(0);
     }

     private CrawledDomain fetchDomain(CrawlingSpecification specification) {
@@ -20,7 +20,7 @@ import java.time.LocalDateTime;
 import java.util.*;

 public class CrawlerRetreiver {
-    private static final long DEFAULT_CRAWL_DELAY_MS = 1000;
+    private static final long DEFAULT_CRAWL_DELAY_MS = Long.getLong("defaultCrawlDelay", 1000);
     private final LinkedList<EdgeUrl> queue = new LinkedList<>();
     private final HttpFetcher fetcher;
     private final HashSet<EdgeUrl> visited;
@@ -3,6 +3,7 @@ package nu.marginalia.wmsa.edge.index;

 import com.google.inject.Inject;
 import nu.marginalia.wmsa.edge.index.model.IndexBlock;
+import nu.marginalia.wmsa.edge.index.service.index.ConversionUnnecessaryException;


 public class EdgeIndexControl {

@@ -19,11 +20,15 @@ public class EdgeIndexControl {
         System.gc();

         for (IndexBlock block : IndexBlock.values()) {
-            servicesFactory.getIndexConverter(id, block);
-
-            System.runFinalization();
-            System.gc();
+            try {
+                servicesFactory.getIndexConverter(id, block);
+
+                System.runFinalization();
+                System.gc();
+            }
+            catch (ConversionUnnecessaryException unnecessary) {
+
+            }
         }

         System.runFinalization();
@@ -13,12 +13,18 @@ public class EdgeIndexModule extends AbstractModule {

     public void configure() {
-        bind(Long.class).annotatedWith(Names.named("edge-dictionary-hash-map-size")).toInstance(1L << 31);
+        if (Boolean.getBoolean("small-ram")) {
+            bind(Long.class).annotatedWith(Names.named("edge-dictionary-hash-map-size")).toInstance(1L << 27);
+        }
+        else {
+            bind(Long.class).annotatedWith(Names.named("edge-dictionary-hash-map-size")).toInstance(1L << 31);
+        }
     }

     @Provides
     public RankingSettings rankingSettings() {
-        Path dir = WmsaHome.get().resolve("conf/ranking-settings.yaml");
+        Path dir = WmsaHome.getHomePath().resolve("conf/ranking-settings.yaml");
         return RankingSettings.from(dir);
     }
@@ -88,8 +88,8 @@ public class IndexServicesFactory {
         return new DictionaryReader(getDictionaryWriter());

     }
-    @SneakyThrows
-    public SearchIndexConverter getIndexConverter(int id, IndexBlock block) {
+
+    public SearchIndexConverter getIndexConverter(int id, IndexBlock block) throws ConversionUnnecessaryException {
         return new SearchIndexConverter(block, id, tmpFileDir,
             preconverterOutputFile.get(id),
             indexWriteWordsFile.get(id, block.id),
@@ -146,14 +146,17 @@ public class IndexServicesFactory {
     public Callable<Boolean> switchFilesJob(int id) {
         return () -> {
             for (int block = 0; block < IndexBlock.values().length; block++) {
-                Files.move(
-                    indexWriteWordsFile.get(id, block).toPath(),
-                    indexReadWordsFile.get(id, block).toPath(),
-                    StandardCopyOption.REPLACE_EXISTING);
-                Files.move(
-                    indexWriteUrlsFile.get(id, block).toPath(),
-                    indexReadUrlsFile.get(id, block).toPath(),
-                    StandardCopyOption.REPLACE_EXISTING);
+                if (Files.exists(indexWriteWordsFile.get(id, block).toPath()) &&
+                    Files.exists(indexWriteUrlsFile.get(id, block).toPath())) {
+                    Files.move(
+                        indexWriteWordsFile.get(id, block).toPath(),
+                        indexReadWordsFile.get(id, block).toPath(),
+                        StandardCopyOption.REPLACE_EXISTING);
+                    Files.move(
+                        indexWriteUrlsFile.get(id, block).toPath(),
+                        indexReadUrlsFile.get(id, block).toPath(),
+                        StandardCopyOption.REPLACE_EXISTING);
+                }
             }
             return true;
         };
@@ -0,0 +1,10 @@
package nu.marginalia.wmsa.edge.index.service.index;

public class ConversionUnnecessaryException extends Exception {
    public ConversionUnnecessaryException() {

    }

    @Override
    public StackTraceElement[] getStackTrace() { return new StackTraceElement[0]; }
}
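The getStackTrace() override above returns an empty array because the exception is used purely for control flow. A common alternative idiom (an assumption, not what this commit does) suppresses stack trace capture at construction time, which also avoids the cost of filling the trace in:

```java
// Hypothetical alternative: disable stack trace capture entirely via the
// protected Exception(message, cause, enableSuppression, writableStackTrace)
// constructor, making the control-flow exception cheap to throw.
public class ConversionUnnecessaryException extends Exception {
    public ConversionUnnecessaryException() {
        super(null, null, false, false);
    }
}
```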
@@ -61,6 +61,7 @@ public class SearchIndexConverter {
         @Named("edge-index-write-urls-file") File outputFileUrls,
         SearchIndexPartitioner partitioner,
         EdgeDomainBlacklist blacklist)
+        throws ConversionUnnecessaryException
     {
         this.block = block;
         this.bucketId = bucketId;

@@ -77,16 +78,21 @@ public class SearchIndexConverter {
         this.fileLength = raf.readLong();
         this.wordCount = raf.readInt();

+        if (fileLength <= FILE_HEADER_SIZE) {
+            throw new ConversionUnnecessaryException();
+        }
+
         var inputChannel = raf.getChannel();

         ByteBuffer buffer = ByteBuffer.allocateDirect(10_000);

-        urlsFileSize = getUrlsSize(buffer, raf);
+        urlsFileSize = getUrlsSize(buffer, inputChannel);

         var tmpUrlsFile = Files.createTempFile(tmpFileDir, "urls-sorted", ".dat");

-        urlsTmpFileChannel = new RandomAccessFile(tmpUrlsFile.toFile(), "rw").getChannel();
+        var urlsTmpFileRaf = new RandomAccessFile(tmpUrlsFile.toFile(), "rw");
+        urlsTmpFileChannel = urlsTmpFileRaf.getChannel();
         urlsTmpFileMap = new MultimapFileLong(urlsTmpFileRaf, FileChannel.MapMode.READ_WRITE, urlsFileSize, 8*1024*1024, false);
         urlTmpFileSorter = urlsTmpFileMap.createSorter(tmpFileDir, 1024*1024*256);
@@ -114,6 +120,139 @@ public class SearchIndexConverter {
     }

+    private long getUrlsSize(ByteBuffer buffer, FileChannel channel) throws IOException {
+        channel.position(FILE_HEADER_SIZE);
+
+        var reader = new IndexReader(buffer, channel) {
+            public long size;
+
+            @Override
+            public void eachWord(long urlId, int wordId) {
+                size++;
+            }
+        };
+
+        reader.read();
+
+        logger.info("Blacklist filtered {} URLs", reader.filtered);
+        logger.debug("URLs Size {} Mb", channel.position()/(1024*1024));
+
+        return reader.size;
+    }
+
+    private void createUrlTable(Path tmpFileDir, ByteBuffer buffer, RandomAccessFile raf, long[] wordIndexTable) throws IOException {
+        logger.debug("Table size = {}", wordIndexTable.length);
+        int[] wordIndex = new int[wordIndexTable.length];
+        raf.seek(FILE_HEADER_SIZE);
+
+        var channel = raf.getChannel();
+
+        try (RandomWriteFunnel rwf = new RandomWriteFunnel(tmpFileDir, urlsFileSize, 10_000_000)) {
+            var reader = new IndexReader(buffer, channel) {
+                @Override
+                public void eachWord(long urlId, int wordId) throws IOException {
+                    if (wordId >= wordIndex.length)
+                        return;
+
+                    if (wordId != 0) {
+                        if (!(wordIndexTable[wordId - 1] + wordIndex[wordId] <= wordIndexTable[wordId])) {
+                            logger.error("Crazy state: wordId={}, index={}, lower={}, upper={}",
+                                    wordId,
+                                    wordIndex[wordId],
+                                    wordIndexTable[wordId - 1],
+                                    wordIndexTable[wordId]);
+                            throw new IllegalStateException();
+                        }
+                    }
+                    if (wordId > 0) {
+                        rwf.put(wordIndexTable[wordId - 1] + wordIndex[wordId]++, translateUrl(urlId));
+                    } else {
+                        rwf.put(wordIndex[wordId]++, translateUrl(urlId));
+                    }
+                }
+            };
+
+            reader.read();
+
+            rwf.write(urlsTmpFileChannel);
+        }
+
+        urlsTmpFileChannel.force(false);
+
+        logger.debug("URL TMP Table: {} Mb", channel.position()/(1024*1024));
+
+        if (wordIndexTable.length > 0) {
+            logger.debug("Sorting urls table");
+            sortUrls(wordIndexTable);
+            urlsTmpFileMap.force();
+        }
+        else {
+            logger.warn("urls table empty -- nothing to sort");
+        }
+
+        long idx = 0;
+
+        try (var urlsFileMap = MultimapFileLong.forOutput(urlsFile.toPath(), 1024)) {
+            var writer = new BTreeWriter(urlsFileMap, urlsBTreeContext);
+
+            if (wordIndexTable[0] != 0) {
+                int start = 0;
+                int end = (int) wordIndexTable[0];
+
+                idx += writer.write(idx, (int) wordIndexTable[0],
+                        offset -> urlsFileMap.transferFromFileChannel(urlsTmpFileChannel, offset, start, end));
+            }
+
+            for (int i = 1; i < wordIndexTable.length; i++) {
+                if (wordIndexTable[i] != wordIndexTable[i - 1]) {
+                    long start = wordIndexTable[i-1];
+                    long end = wordIndexTable[i];
+
+                    idx += writer.write(idx, (int) (end-start),
+                            offset -> urlsFileMap.transferFromFileChannel(urlsTmpFileChannel, offset, start, end));
+                }
+            }
+        } catch (Exception e) {
+            e.printStackTrace();
+        }
+    }
+
+    @SneakyThrows
+    private void sortUrls(long[] wordIndices) {
+        urlTmpFileSorter.sort( 0, (int) wordIndices[0]);
+
+        for (int i = 1; i < wordIndices.length; i++) {
+            urlTmpFileSorter.sort(wordIndices[i-1], (int) (wordIndices[i] - wordIndices[i-1]));
+        }
+    }
+
+    private long[] createWordIndexTable(File outputFileWords, FileChannel inputChannel) throws Exception {
+        inputChannel.position(FILE_HEADER_SIZE);
+
+        logger.debug("Table size = {}", wordCount);
+        WordsTableWriter wordsTableWriter = new WordsTableWriter(wordCount);
+        ByteBuffer buffer = ByteBuffer.allocateDirect(8*SearchIndexWriterImpl.MAX_BLOCK_SIZE);
+
+        logger.debug("Reading words");
+
+        var reader = new IndexReader(buffer, inputChannel) {
+            @Override
+            public void eachWord(long urlId, int wordId) {
+                wordsTableWriter.acceptWord(wordId);
+            }
+        };
+        reader.read();
+
+        logger.debug("Rearranging table");
+
+        inputChannel.position(FILE_HEADER_SIZE);
+
+        wordsTableWriter.write(outputFileWords);
+
+        return wordsTableWriter.getTable();
+    }
+
     @RequiredArgsConstructor
     private class IndexReader {
         private final ByteBuffer buffer;
@@ -193,7 +332,7 @@ public class SearchIndexConverter {
         public void eachUrl(Lock lock, int count, long urlId) throws IOException {
             for (int i = 0; i < count; i++) {
                 int wordId = buffer.getInt();
-                if (acceptWord(lock, urlId, wordId, i, block.id)) {
+                if (acceptWord(lock, urlId)) {
                     eachWord(urlId, wordId);
                 }
             }
@@ -201,183 +340,16 @@ public class SearchIndexConverter {
         public void eachWord(long urlId, int wordId) throws IOException {

         }
-    }

-    private long getUrlsSize(ByteBuffer buffer, RandomAccessFile raf) throws IOException {
-        raf.seek(FILE_HEADER_SIZE);
-
-        var channel = raf.getChannel();
-
-        var reader = new IndexReader(buffer, channel) {
-            public long size;
-
-            @Override
-            public void eachWord(long urlId, int wordId) {
-                size++;
-            }
-        };
-
-        reader.read();
-
-        logger.info("Blacklist filtered {} URLs", reader.filtered);
-        logger.debug("URLs Size {} Mb", channel.position()/(1024*1024));
-
-        return reader.size;
-    }
-
-    private void createUrlTable(Path tmpFileDir, ByteBuffer buffer, RandomAccessFile raf, long[] wordIndexTable) throws IOException {
-        logger.debug("Table size = {}", wordIndexTable.length);
-        int[] wordIndex = new int[wordIndexTable.length];
-        raf.seek(FILE_HEADER_SIZE);
-
-        var channel = raf.getChannel();
-
-        try (RandomWriteFunnel rwf = new RandomWriteFunnel(tmpFileDir, urlsFileSize, 10_000_000)) {
-            var reader = new IndexReader(buffer, channel) {
-                @Override
-                public void eachWord(long urlId, int wordId) throws IOException {
-                    if (wordId >= wordIndex.length)
-                        return;
-
-                    if (wordId != 0) {
-                        if (!(wordIndexTable[wordId - 1] + wordIndex[wordId] <= wordIndexTable[wordId])) {
-                            logger.error("Crazy state: wordId={}, index={}, lower={}, upper={}",
-                                    wordId,
-                                    wordIndex[wordId],
-                                    wordIndexTable[wordId - 1],
-                                    wordIndexTable[wordId]);
-                            throw new IllegalStateException();
-                        }
-                    }
-                    if (wordId > 0) {
-                        rwf.put(wordIndexTable[wordId - 1] + wordIndex[wordId]++, translateUrl(urlId));
-                    } else {
-                        rwf.put(wordIndex[wordId]++, translateUrl(urlId));
-                    }
-                }
-            };
-
-            reader.read();
-
-            rwf.write(urlsTmpFileChannel);
-        }
-
-        urlsTmpFileChannel.force(false);
-
-        logger.debug("URL TMP Table: {} Mb", channel.position()/(1024*1024));
-
-        if (wordIndexTable.length > 0) {
-            logger.debug("Sorting urls table");
-            sortUrls(wordIndexTable);
-            urlsTmpFileMap.force();
-        }
-        else {
-            logger.warn("urls table empty -- nothing to sort");
-        }
-
-        long idx = 0;
-
-        var copyBuffer = ByteBuffer.allocateDirect(4096);
-        try (var urlsFileMap = MultimapFileLong.forOutput(urlsFile.toPath(), 1024)) {
-            var writer = new BTreeWriter(urlsFileMap, urlsBTreeContext);
-
-            if (wordIndexTable[0] != 0) {
-                int start = 0;
-                int end = (int) wordIndexTable[0];
-
-                idx += writer.write(idx, (int) wordIndexTable[0],
-                        offset -> urlsFileMap.transferFromFileChannel(urlsTmpFileChannel, offset, start, end));
-            }
-
-            for (int i = 1; i < wordIndexTable.length; i++) {
-                if (wordIndexTable[i] != wordIndexTable[i - 1]) {
-                    long start = wordIndexTable[i-1];
-                    long end = wordIndexTable[i];
-
-                    idx += writer.write(idx, (int) (end-start),
-                            offset -> urlsFileMap.transferFromFileChannel(urlsTmpFileChannel, offset, start, end));
-                }
-            }
-        } catch (Exception e) {
-            e.printStackTrace();
-        }
-
-        logger.warn("BTrees generated");
-    }
-
-    public void transfer(ByteBuffer buffer, MultimapFileLong dest, FileChannel sourceChannel, long destOffset, long sourceStart, long sourceEnd) throws IOException {
-        int tbw = 0;
-
-        buffer.limit(Math.min(buffer.capacity(), (int)(sourceEnd - sourceStart)*8));
-        while (sourceEnd - sourceStart - tbw > buffer.limit()/8) {
-            int bw = 0;
-            while (buffer.position() < buffer.limit()) {
-                int r = sourceChannel.read(buffer, sourceStart*8 + bw);
-                if (r < 0) {
-                    throw new IOException("");
-                }
-                bw += r;
-            }
-            buffer.flip();
-            dest.write(buffer.asLongBuffer(), destOffset + tbw);
-            tbw += bw/8;
-            buffer.clear();
-            buffer.limit(Math.min(buffer.capacity(), (int)(sourceEnd*8 - sourceStart*8 - tbw)));
-        }
-        buffer.clear();
-        buffer.limit((int)(sourceEnd - (sourceStart + tbw))*8);
-        int bw = 0;
-        while (bw < buffer.limit()) {
-            bw += sourceChannel.read(buffer, sourceStart + bw);
-        }
-        buffer.flip();
-        dest.write(buffer.asLongBuffer(), destOffset + tbw);
-    }
-
-    @SneakyThrows
-    private void sortUrls(long[] wordIndices) {
-        urlTmpFileSorter.sort( 0, (int) wordIndices[0]);
-
-        for (int i = 1; i < wordIndices.length; i++) {
-            urlTmpFileSorter.sort(wordIndices[i-1], (int) (wordIndices[i] - wordIndices[i-1]));
-        }
-    }
-
-    private long[] createWordIndexTable(File outputFileWords, FileChannel inputChannel) throws Exception {
-        inputChannel.position(FILE_HEADER_SIZE);
-
-        logger.debug("Table size = {}", wordCount);
-        WordsTableWriter wordsTableWriter = new WordsTableWriter(wordCount);
-        ByteBuffer buffer = ByteBuffer.allocateDirect(8*SearchIndexWriterImpl.MAX_BLOCK_SIZE);
-
-        logger.debug("Reading words");
-
-        var reader = new IndexReader(buffer, inputChannel) {
-            @Override
-            public void eachWord(long urlId, int wordId) {
-                wordsTableWriter.acceptWord(wordId);
-            }
-        };
-        reader.read();
-
-        logger.debug("Rearranging table");
-
-        inputChannel.position(FILE_HEADER_SIZE);
-
-        wordsTableWriter.write(outputFileWords);
-
-        return wordsTableWriter.getTable();
-    }
-
-    boolean acceptWord(Lock lock, long urlId, int wordId, int wordIdx, int block) {
-        int domainId = (int) (urlId >>> 32L);
-
-        if (!partitioner.filterUnsafe(lock, domainId, bucketId)) {
-            return false;
-        }
-
-        return true;
-    }
+        boolean acceptWord(Lock lock, long urlId) {
+            int domainId = (int) (urlId >>> 32L);
+
+            if (!partitioner.filterUnsafe(lock, domainId, bucketId)) {
+                return false;
+            }
+
+            return true;
+        }
+    }

 }
@@ -18,7 +18,14 @@ DROP VIEW IF EXISTS EC_URL_PART_HASH;

 DROP TABLE IF EXISTS EC_URL_WORD;
 DROP TABLE IF EXISTS EC_DICTIONARY;
+DROP TABLE IF EXISTS DOMAIN_METADATA;
+
+CREATE TABLE IF NOT EXISTS DOMAIN_METADATA (
+    ID INT PRIMARY KEY,
+    KNOWN_URLS INT DEFAULT 0,
+    VISITED_URLS INT DEFAULT 0,
+    GOOD_URLS INT DEFAULT 0
+);

 CREATE TABLE IF NOT EXISTS EC_TOP_DOMAIN (
     ID INT PRIMARY KEY AUTO_INCREMENT,
@@ -1,8 +0,0 @@
-import org.junit.jupiter.api.Test;
-
-public class EmptyTest {
-    @Test
-    public void test() {
-
-    }
-}
@@ -1,5 +1,6 @@
 package nu.marginalia.util;

+import nu.marginalia.wmsa.configuration.WmsaHome;
 import nu.marginalia.wmsa.edge.crawler.domain.language.conf.LanguageModels;

 import java.nio.file.Files;

@@ -7,10 +8,9 @@ import java.nio.file.Path;
 import java.util.Optional;

 public class TestLanguageModels {
-    private static final Path LANGUAGE_MODELS_DEFAULT = Path.of("/home/vlofgren/Work/ngrams/");
+    private static final Path LANGUAGE_MODELS_DEFAULT = WmsaHome.getHomePath().resolve("model");

-    public static LanguageModels getLanguageModels() {
+    public static Path getLanguageModelsPath() {
         final Path languageModelsHome = Optional.ofNullable(System.getenv("LANGUAGE_MODELS_HOME"))
             .map(Path::of)
             .orElse(LANGUAGE_MODELS_DEFAULT);

@@ -18,14 +18,20 @@ public class TestLanguageModels {
         if (!Files.isDirectory(languageModelsHome)) {
             throw new IllegalStateException("Could not find $LANGUAGE_MODELS_HOME, see doc/language-models.md");
         }
+        return languageModelsHome;
+    }
+
+    public static LanguageModels getLanguageModels() {
+
+        var languageModelsHome = getLanguageModelsPath();

         return new LanguageModels(
             languageModelsHome.resolve("ngrams-generous-emstr.bin"),
             languageModelsHome.resolve("tfreq-generous-emstr.bin"),
-            languageModelsHome.resolve("opennlp-en-ud-ewt-sentence-1.0-1.9.3.bin"),
+            languageModelsHome.resolve("opennlp-sentence.bin"),
             languageModelsHome.resolve("English.RDR"),
             languageModelsHome.resolve("English.DICT"),
-            languageModelsHome.resolve("opennlp-tok.bin")
+            languageModelsHome.resolve("opennlp-tokens.bin")
         );
     }
 }
@@ -18,8 +18,13 @@ public class TestUtil {

     @SneakyThrows
     public static HikariDataSource getConnection() {
+        return getConnection("jdbc:mysql://localhost:3306/WMSA_test");
+    }
+
+    @SneakyThrows
+    public static HikariDataSource getConnection(String connString) {
         HikariConfig config = new HikariConfig();
-        config.setJdbcUrl("jdbc:mysql://localhost:3306/WMSA_test");
+        config.setJdbcUrl(connString);
         config.setUsername("wmsa");
         config.setPassword("wmsa");
         config.setMaximumPoolSize(16);

@@ -29,6 +34,7 @@ public class TestUtil {

         return new HikariDataSource(config);
     }
+
     @SneakyThrows
     public static void evalScript(HikariDataSource hds, String scriptFile) {
@@ -0,0 +1,69 @@
package nu.marginalia.wmsa.configuration;

import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeEach;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;

import static org.junit.jupiter.api.Assertions.assertThrows;

class HostsFileTest {
    Path tempFile;

    @BeforeEach
    public void setUp() throws IOException {
        tempFile = Files.createTempFile(getClass().getSimpleName(), ".tmp");
    }

    @AfterEach
    public void tearDown() throws IOException {
        tempFile = Files.createTempFile(getClass().getSimpleName(), ".tmp");
    }

    @Test
    public void testParseSunnyDay() throws IOException {
        Files.writeString(tempFile, """
                # Comment
                edge-index 192.168.0.1
                edge-search 192.168.1.1

                auth 127.0.0.55


                """);
        var hf = new HostsFile(tempFile);

        Assertions.assertEquals("192.168.0.1", hf.getHost(ServiceDescriptor.EDGE_INDEX));
    }

    @Test
    public void testTooLong() throws IOException {
        Files.writeString(tempFile, """
                edge-index 192.168.0.1 this is where my homie lives
                """);

        assertThrows(IllegalArgumentException.class, () -> new HostsFile(tempFile));
    }

    @Test
    public void testTooShort() throws IOException {
        Files.writeString(tempFile, """
                edge-index
                """);

        assertThrows(IllegalArgumentException.class, () -> new HostsFile(tempFile));
    }

    @Test
    public void testBadName() throws IOException {
        Files.writeString(tempFile, """
                garum-factory 127.0.0.1
                """);

        assertThrows(IllegalArgumentException.class, () -> new HostsFile(tempFile));
    }
}
@@ -42,7 +42,7 @@ class DictionaryWriterTest {
         System.out.println(hitsTotal);
     }
     */
-    @Test @Disabled
+    @Test @Disabled @SneakyThrows
     public void convert() {
         new SearchIndexConverter(IndexBlock.Title, 0, Path.of("/tmp"),
             new File("/home/vlofgren/page-index-0.dat"),
@@ -1,5 +1,6 @@
 package nu.marginalia.wmsa.edge.index.service;

+import lombok.SneakyThrows;
 import nu.marginalia.wmsa.edge.index.model.IndexBlock;
 import nu.marginalia.wmsa.edge.index.service.index.SearchIndexConverter;
 import nu.marginalia.wmsa.edge.index.service.query.SearchIndexPartitioner;

@@ -16,7 +17,7 @@ class SearchIndexConverterTest {

     private final Logger logger = LoggerFactory.getLogger(getClass());

-    @Test @Disabled
+    @Test @Disabled @SneakyThrows
     public void test() {
         // File dictFile = new File("/home/vlofgren/dictionary.dat");
         File inFile = new File("/home/vlofgren/Work/converter/3/page-index.dat");
@@ -63,7 +63,7 @@ class SearchIndexWriterTest {
         return reader.findWord(block, budget, lv->true, dictionaryWriter.getReadOnly(word)).stream().toArray();
     }

-    @Test
+    @Test @SneakyThrows
     void put() throws IOException {
         writer.put(new EdgeId<>(0), new EdgeId<>(1), IndexBlock.Words, Arrays.asList("Hello", "Salvete", "everyone!", "This", "is", "Bob"));
         writer.put(new EdgeId<>(0), new EdgeId<>(2), IndexBlock.Words, Arrays.asList("Salvete", "omnes!", "Bob", "sum", "Hello"));
@@ -3,7 +3,7 @@ package nu.marginalia.wmsa.edge.search.query;
 import nu.marginalia.util.TestLanguageModels;
 import nu.marginalia.wmsa.edge.assistant.dict.NGramDict;
 import nu.marginalia.wmsa.edge.crawler.domain.language.conf.LanguageModels;
-import org.junit.BeforeClass;
+import org.junit.jupiter.api.BeforeAll;
 import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Test;

@@ -17,7 +17,7 @@ class BodyQueryParserTest {
     private static EnglishDictionary englishDictionary;
     private static final LanguageModels lm = TestLanguageModels.getLanguageModels();

-    @BeforeClass
+    @BeforeAll
     public static void init() {
         dict = new NGramDict(lm);
         englishDictionary = new EnglishDictionary(dict);
@@ -12,8 +12,8 @@ import nu.marginalia.wmsa.memex.system.MemexFileSystemModifiedTimes;
 import nu.marginalia.wmsa.memex.system.MemexFileWriter;
 import nu.marginalia.wmsa.memex.system.MemexGitRepo;
 import nu.marginalia.wmsa.memex.system.MemexSourceFileSystem;
-import org.junit.BeforeClass;
 import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.BeforeAll;
 import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Test;
 import org.mockito.Mockito;

@@ -40,7 +40,7 @@ class GemtextChangeTest {

     static final Logger logger = LoggerFactory.getLogger(GemtextChangeTest.class);

-    @BeforeClass
+    @BeforeAll
     public static void init() {

         RxJavaPlugins.setErrorHandler(e -> {
@@ -14,8 +14,8 @@ import nu.marginalia.wmsa.memex.system.MemexFileSystemModifiedTimes;
 import nu.marginalia.wmsa.memex.system.MemexFileWriter;
 import nu.marginalia.wmsa.memex.system.MemexGitRepo;
 import nu.marginalia.wmsa.memex.system.MemexSourceFileSystem;
-import org.junit.BeforeClass;
 import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.BeforeAll;
 import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Test;
 import org.mockito.Mockito;

@@ -47,7 +47,7 @@ class GemtextTaskUpdateTest {

     static final Logger logger = LoggerFactory.getLogger(GemtextTaskUpdateTest.class);

-    @BeforeClass
+    @BeforeAll
     public static void init() {

         RxJavaPlugins.setErrorHandler(e -> {
@@ -11,8 +11,8 @@ import nu.marginalia.wmsa.memex.system.MemexFileSystemModifiedTimes;
 import nu.marginalia.wmsa.memex.system.MemexFileWriter;
 import nu.marginalia.wmsa.memex.system.MemexGitRepo;
 import nu.marginalia.wmsa.memex.system.MemexSourceFileSystem;
-import org.junit.BeforeClass;
 import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.BeforeAll;
 import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Test;
 import org.mockito.Mockito;

@@ -38,7 +38,7 @@ class GemtextTombstoneUpdateCaclulatorTest {

     static final Logger logger = LoggerFactory.getLogger(GemtextTombstoneUpdateCaclulatorTest.class);

-    @BeforeClass
+    @BeforeAll
     public static void init() {

         RxJavaPlugins.setErrorHandler(e -> {