Create first E2E-test with TestContainers

This commit is contained in:
Viktor Lofgren 2022-05-25 18:02:19 +02:00
parent b45f68fedd
commit cd3cae0ad5
39 changed files with 961 additions and 255 deletions

View file

@ -3,6 +3,7 @@ plugins {
id "io.freefair.lombok" version "5.3.3.3"
id "me.champeau.jmh" version "0.6.6"
id "de.undercouch.download" version "5.1.0"
}
repositories {
@ -24,6 +25,19 @@ repositories {
}
}
sourceSets {
e2eTest {
java {
java {
compileClasspath += main.output + test.output
runtimeClasspath += main.output + test.output
srcDir file('src/e2e/java')
}
resources.srcDir file('src/e2e/resources')
}
}
}
java {
toolchain {
languageVersion.set(JavaLanguageVersion.of(17))
@ -33,16 +47,9 @@ java {
dependencies {
implementation project(':third_party')
implementation 'junit:junit:4.13.2'
testImplementation 'org.junit.jupiter:junit-jupiter-api:5.8.2'
testRuntimeOnly 'org.junit.jupiter:junit-jupiter-engine'
implementation 'org.projectlombok:lombok:1.18.22'
annotationProcessor 'org.projectlombok:lombok:1.18.22'
testCompileOnly 'org.projectlombok:lombok:1.18.22'
testImplementation 'org.projectlombok:lombok:1.18.22'
testAnnotationProcessor 'org.projectlombok:lombok:1.18.22'
implementation 'org.projectlombok:lombok:1.18.24'
annotationProcessor 'org.projectlombok:lombok:1.18.24'
implementation 'com.github.jknack:handlebars:4.3.0'
implementation 'com.github.jknack:handlebars-markdown:4.2.1'
@ -63,7 +70,7 @@ dependencies {
implementation 'com.google.guava:guava:31.1-jre'
implementation 'com.google.inject:guice:5.1.0'
implementation 'com.github.jnr:jnr-ffi:2.1.1'
implementation 'com.github.jnr:jnr-ffi:2.2.12'
implementation 'org.apache.httpcomponents:httpcore:4.4.15'
implementation 'org.apache.httpcomponents:httpclient:4.5.13'
implementation 'com.github.ThatJavaNerd:JRAW:1.1.0'
@ -74,29 +81,23 @@ dependencies {
implementation 'org.jsoup:jsoup:1.14.3'
implementation group: 'com.github.crawler-commons', name: 'crawler-commons', version: '1.2'
implementation 'org.mariadb.jdbc:mariadb-java-client:3.0.3'
implementation 'org.mariadb.jdbc:mariadb-java-client:3.0.4'
implementation group: 'net.sf.trove4j', name: 'trove4j', version: '3.0.3'
implementation 'com.zaxxer:HikariCP:5.0.1'
implementation 'org.apache.opennlp:opennlp-tools:1.9.4'
implementation 'org.apache.opennlp:opennlp-tools:1.9.3'
implementation 'io.prometheus:simpleclient:0.15.0'
implementation 'io.prometheus:simpleclient_servlet:0.15.0'
implementation 'io.prometheus:simpleclient_httpserver:0.15.0'
implementation 'io.prometheus:simpleclient_hotspot:0.15.0'
implementation 'com.fasterxml.jackson.core:jackson-databind:2.13.2.1'
implementation 'org.apache.opennlp:opennlp-tools:1.9.4'
implementation 'io.prometheus:simpleclient:0.15.0'
implementation 'io.prometheus:simpleclient_servlet:0.15.0'
implementation 'io.prometheus:simpleclient_httpserver:0.15.0'
implementation 'io.prometheus:simpleclient_hotspot:0.15.0'
implementation 'com.fasterxml.jackson.core:jackson-databind:2.13.2.1'
implementation 'com.fasterxml.jackson.core:jackson-databind:2.13.3'
implementation group: 'org.yaml', name: 'snakeyaml', version: '1.30'
implementation 'com.syncthemall:boilerpipe:1.2.2'
implementation 'com.github.luben:zstd-jni:1.5.2-2'
implementation 'com.github.vladimir-bukhtoyarov:bucket4j-core:7.3.0'
implementation 'com.github.vladimir-bukhtoyarov:bucket4j-core:7.5.0'
implementation 'de.rototor.jeuclid:jeuclid-core:3.1.14'
implementation 'org.imgscalr:imgscalr-lib:4.2'
@ -111,10 +112,33 @@ dependencies {
implementation 'edu.stanford.nlp:stanford-corenlp:4.4.0'
implementation group: 'it.unimi.dsi', name: 'fastutil', version: '8.5.8'
implementation 'org.roaringbitmap:RoaringBitmap:[0.6,)'
implementation 'org.roaringbitmap:RoaringBitmap:0.9.27'
implementation group: 'mysql', name: 'mysql-connector-java', version: '8.0.29'
implementation 'com.github.Marcono1234:gson-record-type-adapter-factory:0.2.0'
testImplementation 'org.junit.jupiter:junit-jupiter-api:5.8.2'
testRuntimeOnly 'org.junit.jupiter:junit-jupiter-engine'
testCompileOnly 'org.projectlombok:lombok:1.18.24'
testImplementation 'org.projectlombok:lombok:1.18.24'
testAnnotationProcessor 'org.projectlombok:lombok:1.18.24'
e2eTestImplementation 'org.junit.jupiter:junit-jupiter-api:5.8.2'
e2eTestRuntimeOnly 'org.junit.jupiter:junit-jupiter-engine'
e2eTestImplementation 'org.projectlombok:lombok:1.18.24'
e2eTestAnnotationProcessor 'org.projectlombok:lombok:1.18.22'
e2eTestImplementation 'org.testcontainers:mariadb:1.17.1'
e2eTestImplementation 'org.testcontainers:nginx:1.17.1'
e2eTestImplementation 'org.testcontainers:testcontainers:1.17.1'
e2eTestImplementation "org.testcontainers:junit-jupiter:1.17.1"
e2eTestImplementation "org.testcontainers:selenium:1.17.1"
e2eTestImplementation 'org.seleniumhq.selenium:selenium-remote-driver:4.1.4'
e2eTestImplementation 'org.seleniumhq.selenium:selenium-chrome-driver:4.1.4'
}
configurations {
e2eTestImplementation.extendsFrom(testImplementation)
}
test {
@ -136,4 +160,51 @@ task dbTest(type: Test) {
}
}
// Runs the TestContainers-based end-to-end suite in src/e2e.
// Forked one test per JVM since each test class stands up a full docker environment.
task e2eTest(type: Test) {
maxParallelForks = 1
forkEvery = 1
maxHeapSize = "8G"
// The e2e containers run the fat jar and need the model/test data fetched below.
dependsOn ':shadowJar'
dependsOn 'downloadTestData'
dependsOn 'downloadRDRModelData'
dependsOn 'downloadSentenceModelData'
dependsOn 'downloadTokenModelData'
dependsOn 'downloadTermFreqData'
classpath = sourceSets.e2eTest.runtimeClasspath
testClassesDirs = sourceSets.e2eTest.output.classesDirs
// Only classes tagged @Tag("e2e") are picked up.
useJUnitPlatform {
includeTags "e2e"
}
}
// Small wikipedia ZIM dump served as the crawl target by the e2e test.
task downloadTestData(type: Download) {
src 'http://hammurabi.acc.umu.se/mirror/kiwix.org/zim/wikipedia/wikipedia_en_100_nopic_2022-05.zim'
dest file('data/test/wikipedia_en_100_nopic.zim')
overwrite false
}
// RDRPOSTagger part-of-speech model files.
task downloadRDRModelData(type: Download) {
src (['https://raw.githubusercontent.com/datquocnguyen/RDRPOSTagger/master/Models/POS/English.DICT',
'https://raw.githubusercontent.com/datquocnguyen/RDRPOSTagger/master/Models/POS/English.RDR'])
dest file('data/models/')
overwrite false
}
// OpenNLP sentence-detection model.
task downloadSentenceModelData(type: Download) {
src 'https://dlcdn.apache.org/opennlp/models/ud-models-1.0/opennlp-en-ud-ewt-sentence-1.0-1.9.3.bin'
dest file('data/models/opennlp-sentence.bin')
overwrite false
}
// OpenNLP tokenizer model.
task downloadTokenModelData(type: Download) {
src 'https://dlcdn.apache.org/opennlp/models/ud-models-1.0/opennlp-en-ud-ewt-tokens-1.0-1.9.3.bin'
dest file('data/models/opennlp-tokens.bin')
overwrite false
}
// Term frequency data; copied from a local path since there is no public hosting yet.
task downloadTermFreqData(type: Copy) {
// TODO: Need hosting for this file
from '/var/lib/wmsa/model/tfreq-new-algo3.bin'
into 'data/models'
}

1
marginalia_nu/data/.gitignore vendored Normal file
View file

@ -0,0 +1 @@
*

1
marginalia_nu/data/models/.gitignore vendored Normal file
View file

@ -0,0 +1 @@
*

1
marginalia_nu/data/test/.gitignore vendored Normal file
View file

@ -0,0 +1 @@
*

View file

@ -0,0 +1,197 @@
package nu.marginalia.wmsa.edge;
import nu.marginalia.wmsa.configuration.ServiceDescriptor;
import nu.marginalia.wmsa.edge.crawling.CrawlJobExtractorMain;
import org.jsoup.Jsoup;
import org.junit.jupiter.api.Tag;
import org.junit.jupiter.api.Test;
import org.openqa.selenium.By;
import org.openqa.selenium.chrome.ChromeOptions;
import org.openzim.ZIMTypes.ZIMFile;
import org.openzim.ZIMTypes.ZIMReader;
import org.slf4j.LoggerFactory;
import org.testcontainers.containers.*;
import org.testcontainers.containers.output.Slf4jLogConsumer;
import org.testcontainers.containers.wait.strategy.Wait;
import org.testcontainers.junit.jupiter.Container;
import org.testcontainers.junit.jupiter.Testcontainers;
import org.testcontainers.utility.MountableFile;
import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.time.Duration;
import java.util.ArrayList;
import java.util.List;
import static nu.marginalia.wmsa.configuration.ServiceDescriptor.*;
import static org.testcontainers.containers.BrowserWebDriverContainer.VncRecordingMode.RECORD_ALL;
/** End-to-end test that stands up the full edge-search stack in docker via TestContainers:
 * mariadb, the index/search/assistant services, an nginx serving a local wikipedia dump
 * as the crawl target, a crawler container that crawls/converts/loads the data, an nginx
 * reverse proxy fronting the search service, and a remote-controlled chrome browser.
 */
@Tag("e2e")
@Testcontainers
public class EdgeSearchE2ETest {
// Shared docker network; containers reach each other via the aliases assigned below.
Network network = Network.newNetwork();

// Database used by all services; schema is loaded from the classpath init script.
@Container
public GenericContainer<?> mariaDB = new MariaDBContainer<>("mariadb")
.withDatabaseName("WMSA_prod")
.withUsername("wmsa")
.withPassword("wmsa")
.withInitScript("sql/edge-crawler-cache.sql")
.withNetwork(network)
.withNetworkAliases("mariadb");

// The three WMSA services under test, all started from the same fat jar (see forService()).
@Container
public GenericContainer<?> searchContainer = forService(EDGE_SEARCH);
@Container
public GenericContainer<?> assistantContainer = forService(EDGE_ASSISTANT);
@Container
public GenericContainer<?> indexContainer = forService(EDGE_INDEX);

// Serves the extracted wikipedia articles for the crawler to fetch,
// reachable as http://wikipedia/ on the shared network.
@Container
public NginxContainer<?> mockWikipedia = new NginxContainer<>("nginx:stable")
.dependsOn(searchContainer)
.withLogConsumer(new Slf4jLogConsumer(LoggerFactory.getLogger("wikipedia")))
.withFileSystemBind(getWikipediaFiles(), "/usr/share/nginx/html/", BindMode.READ_ONLY)
.withNetwork(network)
.withNetworkAliases("wikipedia");

// Browser driving the search UI from inside the docker network.
@Container
public BrowserWebDriverContainer<?> chrome = new BrowserWebDriverContainer<>()
.withNetwork(network)
.withCapabilities(new ChromeOptions());

// Runs crawl.sh (crawl -> convert -> load -> reindex) against the mock wikipedia;
// readiness is signaled by the script printing ALL DONE (up to 10 minutes).
@Container
public GenericContainer<?> crawlerContainer = new GenericContainer<>("openjdk:17-alpine")
.dependsOn(mockWikipedia)
.dependsOn(indexContainer)
.withNetwork(network)
.withLogConsumer(new Slf4jLogConsumer(LoggerFactory.getLogger("crawler")))
.withFileSystemBind(modelsPath(), "/var/lib/wmsa/model", BindMode.READ_ONLY)
.withCopyFileToContainer(jarFile(), "/WMSA.jar")
.withCopyFileToContainer(MountableFile.forClasspathResource("crawl.sh"), "/crawl.sh")
.withFileSystemBind(getCrawlPath().toString(), "/crawl/", BindMode.READ_WRITE)
.withCommand("sh", "crawl.sh")
.waitingFor(Wait.forLogMessage(".*ALL DONE.*", 1).withStartupTimeout(Duration.ofMinutes(10)));

// Reverse proxy in front of edge-search; the test browser talks to http://proxyNginx/.
@Container
public NginxContainer<?> proxyNginx = new NginxContainer<>("nginx:stable")
.dependsOn(searchContainer)
.dependsOn(crawlerContainer)
.withLogConsumer(new Slf4jLogConsumer(LoggerFactory.getLogger("nginx")))
.withCopyFileToContainer(MountableFile.forClasspathResource("nginx/search.conf"), "/etc/nginx/conf.d/default.conf")
.withNetwork(network)
.withNetworkAliases("proxyNginx");
;

/** Build a container running the given WMSA service from the fat jar via init.sh;
 * considered ready once /internal/ping answers on the service port. */
public GenericContainer<?> forService(ServiceDescriptor service) {
return new GenericContainer<>("openjdk:17-alpine")
.dependsOn(mariaDB)
.withCopyFileToContainer(jarFile(), "/WMSA.jar")
.withCopyFileToContainer(MountableFile.forClasspathResource("init.sh"), "/init.sh")
.withExposedPorts(service.port)
.withFileSystemBind(modelsPath(), "/var/lib/wmsa/model", BindMode.READ_ONLY)
.withNetwork(network)
.withNetworkAliases(service.name)
.withLogConsumer(new Slf4jLogConsumer(LoggerFactory.getLogger(service.name)))
.withCommand("sh", "init.sh", service.name)
.waitingFor(Wait.forHttp("/internal/ping")
.forPort(service.port)
.withReadTimeout(Duration.ofSeconds(15)))
;
}

/** Locate the shadow jar in the parent project's build dir (built by ':shadowJar'). */
public static MountableFile jarFile() {
Path cwd = Path.of(System.getProperty("user.dir"));
cwd = cwd.resolve("..");
var jarFile = cwd.resolve("build/libs/wmsa-SNAPSHOT-all.jar");
if (!Files.exists(jarFile)) {
System.err.println("Could not find jarFile " + jarFile);
throw new RuntimeException();
}
else {
System.out.println("jar file = " + jarFile);
}
return MountableFile.forHostPath(jarFile);
}

/** Host path of the language model files (downloaded by the gradle e2eTest task). */
public static String modelsPath() {
Path modelsPath = Path.of(System.getProperty("user.dir")).resolve("data/models");
if (!Files.isDirectory(modelsPath)) {
System.err.println("Could not find models, looked in " + modelsPath.toAbsolutePath());
throw new RuntimeException();
}
return modelsPath.toString();
}

// Working directory bound into the crawler container as /crawl/.
private Path getCrawlPath() {
return Path.of(System.getProperty("user.dir")).resolve("build/tmp/crawl");
}

/** Extract the test wikipedia ZIM dump into plain html files for nginx to serve,
 * and write the crawl plan + crawl spec consumed by the crawler container.
 * Returns the host directory to bind into the wikipedia container. */
private String getWikipediaFiles() {
Path wikipediaFiles = Path.of(System.getProperty("user.dir")).resolve("build/tmp/wikipedia");
Path crawlFiles = getCrawlPath();
Path zimFile = Path.of(System.getProperty("user.dir")).resolve("data/test/wikipedia_en_100_nopic.zim");
List<String> urls = new ArrayList<>();
try {
// NOTE(review): deleteIfExists throws on a non-empty directory, and that IOException
// is swallowed by the catch below -- confirm this is intended as best-effort cleanup.
Files.deleteIfExists(wikipediaFiles);
Files.createDirectories(wikipediaFiles);
Files.createDirectories(crawlFiles);
// Crawl plan read by CrawlerMain/ConverterMain/LoaderMain inside the container.
Files.writeString(crawlFiles.resolve("crawl.plan"), """
jobSpec: "/crawl/crawl.spec"
crawl:
dir: "/crawl/crawl"
logName: "crawl.log"
process:
dir: "/crawl/process"
logName: "process.log"
""");
Files.createDirectories(crawlFiles.resolve("crawl"));
Files.createDirectories(crawlFiles.resolve("process"));
// Remove stale logs from a previous run so the pipeline starts fresh.
Files.deleteIfExists(crawlFiles.resolve("process").resolve("process.log"));
Files.deleteIfExists(crawlFiles.resolve("crawl").resolve("crawl.log"));
var zr = new ZIMReader(new ZIMFile(zimFile.toString()));
zr.forEachArticles((url, art) -> {
urls.add("http://wikipedia/" + url + ".html");
if (art != null) {
try {
// Strip scripts so the served pages are static html.
var doc = Jsoup.parse(art);
doc.getElementsByTag("script").remove();
Files.writeString(wikipediaFiles.resolve(url+".html"), doc.html());
} catch (IOException e) {
throw new RuntimeException(e);
}
}
}, pred -> true);
urls.forEach(System.out::println);
Files.writeString(wikipediaFiles.resolve("index.html"), "<html/>");
CrawlJobExtractorMain.writeSpec(crawlFiles.resolve("crawl.spec"), "wikipedia", urls);
}
catch (IOException ex) {
ex.printStackTrace();
}
return wikipediaFiles.toString();
}

/** Smoke test: load the front page and run a search through the full stack.
 * NOTE(review): output is only printed, nothing is asserted -- confirm intended. */
@Test
public void run() {
var driver = chrome.getWebDriver();
driver.get("http://proxyNginx/");
System.out.println(driver.getTitle());
System.out.println(driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML"));
driver.get("http://proxyNginx/search?query=bird&profile=corpo");
System.out.println(driver.getTitle());
System.out.println(driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML"));
}
}

View file

@ -0,0 +1,78 @@
#!/bin/bash
# Entry point for the crawler container in the E2E test: runs the full pipeline
# (crawl -> convert -> load -> trigger reindex) against the mock wikipedia,
# then prints ALL DONE, which the test's wait strategy watches for.
mkdir -p /var/lib/wmsa/conf/
# Point the tools at the mariadb container on the shared docker network.
cat > /var/lib/wmsa/db.properties <<EOF
db.user=wmsa
db.pass=wmsa
db.conn=jdbc:mariadb://mariadb:3306/WMSA_prod?rewriteBatchedStatements=true
EOF
# Map each service name to its container network alias.
cat > /var/lib/wmsa/conf/hosts <<EOF
# service-name host-name
resource-store resource-store
data-store data-store
renderer renderer
auth auth
api api
smhi-scraper smhi-scraper
podcast-scraper podcast-scraper
edge-crawler edge-crawler
edge-index edge-index
edge-director edge-director
edge-search edge-search
edge-archive edge-archive
edge-assistant edge-assistant
memex memex
dating dating
EOF
# Echo the crawl plan for debugging.
cat crawl/crawl.plan
# Banner: CRAWL
cat << EOF
#### ##### ## # # #
# # # # # # # # #
# # # # # # # #
# ##### ###### # ## # #
# # # # # # ## ## #
#### # # # # # # ######
EOF
# Short crawl delay so the tiny test corpus is fetched quickly.
java -DdefaultCrawlDelay=1 -cp WMSA.jar nu.marginalia.wmsa.edge.crawling.CrawlerMain crawl/crawl.plan
# Banner: CONVERT
cat <<EOF
#### #### # # # # ###### ##### #####
# # # # ## # # # # # # #
# # # # # # # # ##### # # #
# # # # # # # # # ##### #
# # # # # ## # # # # # #
#### #### # # ## ###### # # #
EOF
java -cp WMSA.jar nu.marginalia.wmsa.edge.converting.ConverterMain crawl/crawl.plan
# Banner: LOAD
cat <<EOF
# #### ## #####
# # # # # # #
# # # # # # #
# # # ###### # #
# # # # # # #
###### #### # # #####
EOF
# keyword-index=0 targets the first keyword index bucket.
java -Dkeyword-index=0 -cp WMSA.jar nu.marginalia.wmsa.edge.converting.LoaderMain crawl/crawl.plan
# The crawl dir is a host bind mount; open up permissions so the host can clean it.
chmod -R 777 crawl/
# Banner: TRIGGER
cat <<EOF
##### ##### # #### #### ###### #####
# # # # # # # # # # #
# # # # # # ##### # #
# ##### # # ### # ### # #####
# # # # # # # # # # #
# # # # #### #### ###### # #
EOF
# Kick off repartition/preconvert/reindex on the index service.
java -cp WMSA.jar nu.marginalia.wmsa.edge.converting.ReindexTriggerMain edge-index
echo "ALL DONE"

View file

@ -0,0 +1,61 @@
#!/bin/bash
# Entry point for a WMSA service container in the E2E test: writes the runtime
# configuration expected under /var/lib/wmsa, then starts the service named by $1
# from the fat jar.
mkdir -p /var/lib/wmsa/conf
mkdir -p /var/lib/wmsa/index/write
mkdir -p /var/lib/wmsa/index/read
mkdir -p /backup/work/index-tmp
mkdir -p /var/log/wmsa
# Minimal word list backing the assistant's search suggestions.
cat > /var/lib/wmsa/suggestions.txt <<EOF
state
three
while
used
university
can
united
under
known
season
many
year
EOF
# Point the service at the mariadb container on the shared docker network.
cat > /var/lib/wmsa/db.properties <<EOF
db.user=wmsa
db.pass=wmsa
db.conn=jdbc:mariadb://mariadb:3306/WMSA_prod?rewriteBatchedStatements=true
EOF
# Domain ranking settings per profile.
# NOTE(review): '%' appears to act as a match-everything pattern -- confirm semantics.
cat > /var/lib/wmsa/conf/ranking-settings.yaml <<EOF
---
retro:
- "%"
small:
- "%"
academia:
- "%edu"
standard:
- "%"
EOF
# Map each service name to its container network alias.
cat > /var/lib/wmsa/conf/hosts <<EOF
# service-name host-name
resource-store resource-store
data-store data-store
renderer renderer
auth auth
api api
smhi-scraper smhi-scraper
podcast-scraper podcast-scraper
edge-crawler edge-crawler
edge-index edge-index
edge-director edge-director
edge-search edge-search
edge-archive edge-archive
edge-assistant edge-assistant
memex memex
dating dating
EOF
# small-ram shrinks the in-memory dictionary; bind 0.0.0.0 so the exposed
# service port is reachable from outside the container.
java -Dsmall-ram=TRUE -Dservice-host=0.0.0.0 -jar /WMSA.jar start $1

View file

@ -0,0 +1,15 @@
# Log4j2 configuration for the E2E containers: everything goes to the console
# so TestContainers' log consumers can capture it.
status = info
# Console appender with colorized log levels.
appender.console.type = Console
appender.console.name = LogToConsole
appender.console.layout.type = PatternLayout
appender.console.layout.pattern = %highlight{%-5level}{FATAL=red, ERROR=red, WARN=yellow} %c{1}- %msg%n
# Application packages log at debug, without double-logging via the root logger.
logger.console.name = nu.marginalia
logger.console.level = debug
logger.console.additivity = false
logger.console.appenderRef.rolling.ref = LogToConsole
# Everything else logs at info.
rootLogger.level = info
rootLogger.appenderRef.console.ref = LogToConsole

View file

@ -0,0 +1,25 @@
# Reverse proxy fronting the edge-search service in the E2E test environment.
server {
listen 80;
listen [::]:80;
server_name nginx;
# Search endpoint: forwards client context headers to the service.
location /search {
# Reject POSTs to the search endpoint by dropping the connection (444).
if ( $request_method = POST ) {
return 444;
}
proxy_set_header X-Context $remote_addr-$connection;
proxy_set_header X-Public "1";
proxy_set_header X-Extern-Url $scheme://$host$request_uri;
proxy_set_header X-Extern-Domain $scheme://$host;
proxy_set_header X-User-Agent $http_user_agent;
proxy_pass http://edge-search:5023/public/search;
tcp_nodelay on;
}
# Everything else goes straight to the search service.
location / {
proxy_pass http://edge-search:5023/;
tcp_nodelay on;
}
}

View file

@ -49,7 +49,7 @@ public abstract class AbstractClient implements AutoCloseable {
private final Thread livenessMonitor;
public AbstractClient(String host, int port, int timeout) {
logger.info("Creating client for {}", getClass().getSimpleName());
logger.info("Creating client for {}[{}:{}]", getClass().getSimpleName(), host, port);
this.timeout = timeout;
client = new OkHttpClient.Builder()

View file

@ -15,7 +15,7 @@ public class AbstractDynamicClient extends AbstractClient {
private final AbortingScheduler scheduler;
public AbstractDynamicClient(@Nonnull ServiceDescriptor service) {
super("localhost", service.port, 10);
super(service.getHost(), service.port, 10);
this.service = service;
this.scheduler = new AbortingScheduler(name());

View file

@ -0,0 +1,45 @@
package nu.marginalia.wmsa.configuration;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.HashMap;
import java.util.Map;
/** Mappings file between ServiceDescriptor.name and host.
 *
 * Each non-comment, non-blank line of the file is a two-column entry
 * "service-name host-name". When no file is given, every service
 * defaults to localhost.
 * */
public class HostsFile {
    private final Map<ServiceDescriptor, String> hostsMap = new HashMap<>(ServiceDescriptor.values().length);

    /** Parse the given hosts file.
     *
     * @throws IOException if the file cannot be read
     * @throws IllegalArgumentException on a malformed line or an unknown service name
     */
    public HostsFile(Path fileName) throws IOException {
        var lines = Files.readAllLines(fileName);
        for (var line : lines) {
            if (line.startsWith("#") || line.isBlank()) {
                continue;
            }

            // Split on any run of whitespace: hosts files commonly align their
            // columns with tabs or multiple spaces, which a plain split(" ")
            // would misparse as extra empty columns and reject as invalid.
            String[] parts = line.strip().split("\\s+");
            if (parts.length != 2) throw new IllegalArgumentException("Invalid hosts file entry " + line);

            String descriptorName = parts[0];
            String hostName = parts[1];

            try {
                hostsMap.put(ServiceDescriptor.byName(descriptorName), hostName);
            }
            catch (IllegalArgumentException ex) {
                throw new IllegalArgumentException("ServiceDescriptor " + descriptorName + " invalid");
            }
        }
    }

    /** Default mapping: every known service runs on localhost. */
    public HostsFile() {
        for (var sd : ServiceDescriptor.values()) {
            hostsMap.put(sd, "localhost");
        }
    }

    /** @return the configured host for the service, or null if unknown */
    public String getHost(ServiceDescriptor sd) {
        return hostsMap.get(sd);
    }
}

View file

@ -21,6 +21,9 @@ import nu.marginalia.wmsa.resource_store.ResourceStoreMain;
import nu.marginalia.wmsa.smhi.scraper.SmhiScraperMain;
import org.apache.logging.log4j.core.lookup.MainMapLookup;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Map;
import java.util.stream.Collectors;
import java.util.stream.Stream;
@ -49,13 +52,21 @@ public enum ServiceDescriptor {
TEST_1("test-1", 0, null),
TEST_2("test-2", 0, null);
private static HostsFile hostsFile;
public synchronized String getHost() {
if (hostsFile == null) {
hostsFile = WmsaHome.getHostsFile();
}
return hostsFile.getHost(this);
}
public static ServiceDescriptor byName(String name) {
for (var v : values()) {
if (v.name.equals(name)) {
return v;
}
}
throw new IllegalArgumentException(name);
throw new IllegalArgumentException("Invalid ServiceDescriptor " + name);
}
public final String name;
public final Class<?> mainClass;

View file

@ -1,16 +1,31 @@
package nu.marginalia.wmsa.configuration;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
public class WmsaHome {
private static final String DEFAULT = "/var/lib/wmsa";
public static Path get() {
public static Path getHomePath() {
var ret = Path.of(System.getProperty("WMSA_HOME", DEFAULT));
if (!Files.isDirectory(ret)) {
throw new IllegalStateException("Could not find WMSA_HOME, either set environment variable or ensure " + DEFAULT + " exists");
}
return ret;
}
public static HostsFile getHostsFile() {
Path hostsFile = getHomePath().resolve("conf/hosts");
if (Files.isRegularFile(hostsFile)) {
try {
return new HostsFile(hostsFile);
} catch (IOException e) {
throw new RuntimeException("Failed to load hosts file " + hostsFile, e);
}
}
else {
return new HostsFile();
}
}
}

View file

@ -35,7 +35,7 @@ public class DatabaseModule extends AbstractModule {
}
private Properties loadDbProperties() {
Path propDir = WmsaHome.get().resolve("db.properties");
Path propDir = WmsaHome.getHomePath().resolve("db.properties");
if (!Files.isRegularFile(propDir)) {
throw new IllegalStateException("Database properties file " + propDir + " does not exist");
}

View file

@ -33,7 +33,7 @@ public class LoaderMain {
private static final Logger logger = LoggerFactory.getLogger(LoaderMain.class);
private final LoaderFactory loaderFactory;
private final EdgeIndexClient indexClient;
private final boolean running = true;
private volatile boolean running = true;
final Thread processorThread = new Thread(this::processor, "Processor Thread");
@ -82,8 +82,11 @@ public class LoaderMain {
load(entry.path(), entry.cnt());
});
running = false;
processorThread.join();
indexClient.close();
System.exit(0);
}
private volatile static int loadTotal;

View file

@ -0,0 +1,81 @@
package nu.marginalia.wmsa.edge.converting;
import nu.marginalia.wmsa.configuration.ServiceDescriptor;
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
import okhttp3.MediaType;
import okhttp3.OkHttpClient;
import okhttp3.Request;
import okhttp3.RequestBody;
import okio.BufferedSink;
import org.jetbrains.annotations.Nullable;
import java.io.IOException;
import java.net.URL;
import java.nio.charset.Charset;
import java.sql.SQLException;
import java.util.concurrent.TimeUnit;
import static nu.marginalia.wmsa.edge.index.EdgeIndexService.DYNAMIC_BUCKET_LENGTH;
/** One-shot ops tool: prints crawl statistics, refreshes the DOMAIN_METADATA
 *  counters from EC_URL, then triggers repartitioning, preconversion and a
 *  reindex of every bucket on the edge-index service whose host is args[0].
 */
public class ReindexTriggerMain {
    public static void main(String... args) throws IOException, SQLException {
        var db = new DatabaseModule();
        var client = new OkHttpClient.Builder()
                .connectTimeout(100, TimeUnit.MILLISECONDS)
                .readTimeout(15, TimeUnit.MINUTES)
                .retryOnConnectionFailure(true)
                .followRedirects(true)
                .build();

        try (var ds = db.provideConnection(); var conn = ds.getConnection(); var stmt = conn.createStatement()) {
            // Dump a sample of the crawl state for debugging the e2e run.
            var rs = stmt.executeQuery("SELECT ID, URL_PART, STATE, INDEXED FROM EC_DOMAIN LIMIT 100");
            while (rs.next()) {
                System.out.printf("%d %s %s %d\n",
                        rs.getInt(1),
                        rs.getString(2),
                        rs.getString(3),
                        rs.getInt(4));
            }
            rs = stmt.executeQuery("SELECT ID, DOMAIN_ID, URL, VISITED, STATE FROM EC_URL LIMIT 100");
            while (rs.next()) {
                System.out.printf("%d %d %s %d %s\n",
                        rs.getInt(1),
                        rs.getInt(2),
                        rs.getString(3),
                        rs.getInt(4),
                        rs.getString(5));
            }
            // Recompute per-domain URL counters used by ranking.
            stmt.executeUpdate("INSERT IGNORE INTO DOMAIN_METADATA(ID,GOOD_URLS,KNOWN_URLS,VISITED_URLS) SELECT ID,0,0,0 FROM EC_DOMAIN WHERE INDEXED>0");
            stmt.executeUpdate("UPDATE DOMAIN_METADATA INNER JOIN (SELECT DOMAIN_ID,COUNT(*) CNT FROM EC_URL WHERE VISITED AND STATE='ok' GROUP BY DOMAIN_ID) T ON T.DOMAIN_ID=ID SET GOOD_URLS=CNT");
            stmt.executeUpdate("UPDATE DOMAIN_METADATA INNER JOIN (SELECT DOMAIN_ID,COUNT(*) CNT FROM EC_URL GROUP BY DOMAIN_ID) T ON T.DOMAIN_ID=ID SET KNOWN_URLS=CNT");
            stmt.executeUpdate("UPDATE DOMAIN_METADATA INNER JOIN (SELECT DOMAIN_ID,COUNT(*) CNT FROM EC_URL WHERE VISITED GROUP BY DOMAIN_ID) T ON T.DOMAIN_ID=ID SET VISITED_URLS=CNT");
        }

        // Placeholder POST body; the ops endpoints only care about the request arriving.
        var rb = new RequestBody() {
            @Nullable
            @Override
            public MediaType contentType() {
                return MediaType.parse("text/plain");
            }

            @Override
            public void writeTo(BufferedSink sink) throws IOException {
                sink.writeString("NOOP", Charset.defaultCharset());
            }
        };

        triggerOp(client, rb, args[0], "/ops/repartition");
        triggerOp(client, rb, args[0], "/ops/preconvert");
        for (int i = 0; i < DYNAMIC_BUCKET_LENGTH + 1; i++) {
            triggerOp(client, rb, args[0], "/ops/reindex/" + i);
        }
    }

    /** POST to an edge-index ops endpoint, closing the response so the
     *  underlying connection is not leaked. */
    private static void triggerOp(OkHttpClient client, RequestBody rb, String host, String endpoint) throws IOException {
        var url = new URL("http", host, ServiceDescriptor.EDGE_INDEX.port, endpoint);
        try (var rsp = client.newCall(new Request.Builder().post(rb).url(url).build()).execute()) {
            // Response body intentionally ignored; closing releases the connection.
        }
    }
}

View file

@ -25,6 +25,8 @@ public class IndexLoadKeywords implements Runnable {
private final Thread runThread;
private volatile boolean canceled = false;
private static final int index = Integer.getInteger("keyword-index", 1);
@Inject
public IndexLoadKeywords(EdgeIndexClient client) {
this.client = client;
@ -37,7 +39,7 @@ public class IndexLoadKeywords implements Runnable {
while (!canceled) {
var data = insertQueue.poll(1, TimeUnit.SECONDS);
if (data != null) {
client.putWords(Context.internal(), new EdgeId<>(data.domainId), new EdgeId<>(data.urlId), -5., data.wordSet, 1).blockingSubscribe();
client.putWords(Context.internal(), new EdgeId<>(data.domainId), new EdgeId<>(data.urlId), -5., data.wordSet, index).blockingSubscribe();
}
}
}

View file

@ -79,15 +79,21 @@ public class DocumentProcessor {
ret.url = new EdgeUrl(crawledDocument.url);
ret.state = crawlerStatusToUrlState(crawledDocument.crawlerStatus, crawledDocument.httpStatus);
if (ret.state == EdgeUrlState.OK && isAcceptedContentType(crawledDocument)) {
var detailsWords = createDetails(crawledDomain, crawledDocument);
if (ret.state == EdgeUrlState.OK) {
if (detailsWords.details().quality < minDocumentQuality) {
throw new DisqualifiedException(DisqualificationReason.QUALITY);
if (isAcceptedContentType(crawledDocument)) {
var detailsWords = createDetails(crawledDomain, crawledDocument);
if (detailsWords.details().quality < minDocumentQuality) {
throw new DisqualifiedException(DisqualificationReason.QUALITY);
}
ret.details = detailsWords.details();
ret.words = detailsWords.words();
}
else {
throw new DisqualifiedException(DisqualificationReason.CONTENT_TYPE);
}
ret.details = detailsWords.details();
ret.words = detailsWords.words();
}
else {
throw new DisqualifiedException(DisqualificationReason.STATUS);
@ -95,7 +101,7 @@ public class DocumentProcessor {
}
catch (DisqualifiedException ex) {
ret.state = EdgeUrlState.DISQUALIFIED;
logger.debug("Disqualified {}: {}", ret.url, ex.reason);
logger.info("Disqualified {}: {}", ret.url, ex.reason);
}
catch (Exception ex) {
ret.state = EdgeUrlState.DISQUALIFIED;

View file

@ -73,7 +73,7 @@ public class CrawlJobExtractorMain {
private final EdgeDomainBlacklistImpl blacklist;
private final Connection conn;
private final HashFunction hasher = Hashing.murmur3_128(0);
private static final HashFunction hasher = Hashing.murmur3_128(0);
public static void main(String... args) throws SQLException, IOException {
Driver driver = new Driver();
@ -97,6 +97,19 @@ public class CrawlJobExtractorMain {
}
}
public static void writeSpec(Path outFile, String domain, List<String> urls) throws IOException {
Gson gson = new GsonBuilder().create();
try (var out = new PrintWriter(new ZstdOutputStream(new BufferedOutputStream(new FileOutputStream(outFile.toFile()))))) {
var job = new CrawlingSpecification();
job.crawlDepth = urls.size();
job.domain = domain;
job.id = createId(new EdgeDomain(domain));
job.urls = urls;
out.println(gson.toJson(job));
}
}
private record DomainWithId(String domainName, int id) {}
private Stream<CrawlingSpecification> extractDomains() {
@ -186,11 +199,11 @@ public class CrawlJobExtractorMain {
return spec;
}
private String createId(DomainWithId domainWithId) {
private static String createId(DomainWithId domainWithId) {
return hasher.hashUnencodedChars(domainWithId.domainName).toString();
}
private String createId(EdgeDomain domain) {
private static String createId(EdgeDomain domain) {
return hasher.hashUnencodedChars(domain.toString()).toString();
}

View file

@ -79,6 +79,9 @@ public class CrawlerMain implements AutoCloseable {
try (var crawler = new CrawlerMain(plan)) {
crawler.run();
}
// TODO (2022-05-24): Some thread isn't set to daemon mode, need to explicitly harakiri the process, find why?
System.exit(0);
}
private CrawledDomain fetchDomain(CrawlingSpecification specification) {

View file

@ -20,7 +20,7 @@ import java.time.LocalDateTime;
import java.util.*;
public class CrawlerRetreiver {
private static final long DEFAULT_CRAWL_DELAY_MS = 1000;
private static final long DEFAULT_CRAWL_DELAY_MS = Long.getLong("defaultCrawlDelay", 1000);
private final LinkedList<EdgeUrl> queue = new LinkedList<>();
private final HttpFetcher fetcher;
private final HashSet<EdgeUrl> visited;

View file

@ -3,6 +3,7 @@ package nu.marginalia.wmsa.edge.index;
import com.google.inject.Inject;
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
import nu.marginalia.wmsa.edge.index.service.index.ConversionUnnecessaryException;
public class EdgeIndexControl {
@ -19,11 +20,15 @@ public class EdgeIndexControl {
System.gc();
for (IndexBlock block : IndexBlock.values()) {
try {
servicesFactory.getIndexConverter(id, block);
servicesFactory.getIndexConverter(id, block);
System.runFinalization();
System.gc();
}
catch (ConversionUnnecessaryException unnecessary) {
System.runFinalization();
System.gc();
}
}
System.runFinalization();

View file

@ -13,12 +13,18 @@ public class EdgeIndexModule extends AbstractModule {
public void configure() {
bind(Long.class).annotatedWith(Names.named("edge-dictionary-hash-map-size")).toInstance(1L << 31);
if (Boolean.getBoolean("small-ram")) {
bind(Long.class).annotatedWith(Names.named("edge-dictionary-hash-map-size")).toInstance(1L << 27);
}
else {
bind(Long.class).annotatedWith(Names.named("edge-dictionary-hash-map-size")).toInstance(1L << 31);
}
}
@Provides
public RankingSettings rankingSettings() {
Path dir = WmsaHome.get().resolve("conf/ranking-settings.yaml");
Path dir = WmsaHome.getHomePath().resolve("conf/ranking-settings.yaml");
return RankingSettings.from(dir);
}

View file

@ -88,8 +88,8 @@ public class IndexServicesFactory {
return new DictionaryReader(getDictionaryWriter());
}
@SneakyThrows
public SearchIndexConverter getIndexConverter(int id, IndexBlock block) {
public SearchIndexConverter getIndexConverter(int id, IndexBlock block) throws ConversionUnnecessaryException {
return new SearchIndexConverter(block, id, tmpFileDir,
preconverterOutputFile.get(id),
indexWriteWordsFile.get(id, block.id),
@ -146,14 +146,17 @@ public class IndexServicesFactory {
public Callable<Boolean> switchFilesJob(int id) {
return () -> {
for (int block = 0; block < IndexBlock.values().length; block++) {
Files.move(
indexWriteWordsFile.get(id, block).toPath(),
indexReadWordsFile.get(id, block).toPath(),
StandardCopyOption.REPLACE_EXISTING);
Files.move(
indexWriteUrlsFile.get(id, block).toPath(),
indexReadUrlsFile.get(id, block).toPath(),
StandardCopyOption.REPLACE_EXISTING);
if (Files.exists(indexWriteWordsFile.get(id, block).toPath()) &&
Files.exists(indexWriteUrlsFile.get(id, block).toPath())) {
Files.move(
indexWriteWordsFile.get(id, block).toPath(),
indexReadWordsFile.get(id, block).toPath(),
StandardCopyOption.REPLACE_EXISTING);
Files.move(
indexWriteUrlsFile.get(id, block).toPath(),
indexReadUrlsFile.get(id, block).toPath(),
StandardCopyOption.REPLACE_EXISTING);
}
}
return true;
};

View file

@ -0,0 +1,10 @@
package nu.marginalia.wmsa.edge.index.service.index;
/** Signals that an index file contains no data so conversion can be skipped.
 *
 * This exception is used for control flow rather than error reporting, so
 * stack trace capture and suppression are disabled at construction time
 * (cheaper than capturing a trace and merely hiding it afterwards).
 */
public class ConversionUnnecessaryException extends Exception {
    public ConversionUnnecessaryException() {
        // message=null, cause=null, no suppression, non-writable stack trace
        super(null, null, false, false);
    }

    /** Always empty; this exception never carries a stack trace. */
    @Override
    public StackTraceElement[] getStackTrace() { return new StackTraceElement[0]; }
}

View file

@ -61,6 +61,7 @@ public class SearchIndexConverter {
@Named("edge-index-write-urls-file") File outputFileUrls,
SearchIndexPartitioner partitioner,
EdgeDomainBlacklist blacklist)
throws ConversionUnnecessaryException
{
this.block = block;
this.bucketId = bucketId;
@ -77,16 +78,21 @@ public class SearchIndexConverter {
this.fileLength = raf.readLong();
this.wordCount = raf.readInt();
if (fileLength <= FILE_HEADER_SIZE) {
throw new ConversionUnnecessaryException();
}
var inputChannel = raf.getChannel();
ByteBuffer buffer = ByteBuffer.allocateDirect(10_000);
urlsFileSize = getUrlsSize(buffer, raf);
urlsFileSize = getUrlsSize(buffer, inputChannel);
var tmpUrlsFile = Files.createTempFile(tmpFileDir, "urls-sorted", ".dat");
var urlsTmpFileRaf = new RandomAccessFile(tmpUrlsFile.toFile(), "rw");
urlsTmpFileChannel = new RandomAccessFile(tmpUrlsFile.toFile(), "rw").getChannel();
urlsTmpFileChannel = urlsTmpFileRaf.getChannel();
urlsTmpFileMap = new MultimapFileLong(urlsTmpFileRaf, FileChannel.MapMode.READ_WRITE, urlsFileSize, 8*1024*1024, false);
urlTmpFileSorter = urlsTmpFileMap.createSorter(tmpFileDir, 1024*1024*256);
@ -114,6 +120,139 @@ public class SearchIndexConverter {
}
/** Counts the (url, word) pairs in the input file; the count is the size of the URLs table. */
private long getUrlsSize(ByteBuffer buffer, FileChannel channel) throws IOException {
    channel.position(FILE_HEADER_SIZE);

    // Mutable counter captured by the anonymous reader below.
    final long[] pairCount = {0};

    var countingReader = new IndexReader(buffer, channel) {
        @Override
        public void eachWord(long urlId, int wordId) {
            pairCount[0]++;
        }
    };
    countingReader.read();

    logger.info("Blacklist filtered {} URLs", countingReader.filtered);
    logger.debug("URLs Size {} Mb", channel.position()/(1024*1024));

    return pairCount[0];
}
/**
 * Builds the sorted, BTree-backed URLs table from the raw (url, word) pairs.
 *
 * <p>Pass 1 scatters each url id into its word's slot range via a
 * {@link RandomWriteFunnel}; pass 2 sorts each per-word range in the temp file;
 * pass 3 writes one BTree per non-empty word range into the output map.
 *
 * @param tmpFileDir     directory for the funnel's temporary files
 * @param buffer         scratch read buffer shared with the IndexReader
 * @param raf            the raw input file, positioned past the header before reading
 * @param wordIndexTable cumulative end offsets per word id (monotonically non-decreasing)
 */
private void createUrlTable(Path tmpFileDir, ByteBuffer buffer, RandomAccessFile raf, long[] wordIndexTable) throws IOException {
    logger.debug("Table size = {}", wordIndexTable.length);
    // Running fill counter per word: how many urls have been placed in each word's range so far.
    int[] wordIndex = new int[wordIndexTable.length];
    raf.seek(FILE_HEADER_SIZE);

    var channel = raf.getChannel();

    try (RandomWriteFunnel rwf = new RandomWriteFunnel(tmpFileDir, urlsFileSize, 10_000_000)) {
        var reader = new IndexReader(buffer, channel) {
            @Override
            public void eachWord(long urlId, int wordId) throws IOException {
                if (wordId >= wordIndex.length)
                    return;

                // Invariant: the next slot for this word must not run past the
                // word's end offset in the cumulative table.
                if (wordId != 0) {
                    if (!(wordIndexTable[wordId - 1] + wordIndex[wordId] <= wordIndexTable[wordId])) {
                        logger.error("Crazy state: wordId={}, index={}, lower={}, upper={}",
                                wordId,
                                wordIndex[wordId],
                                wordIndexTable[wordId - 1],
                                wordIndexTable[wordId]);
                        throw new IllegalStateException();
                    }
                }

                // Word 0's range starts at offset 0; every other word starts at
                // the previous word's cumulative end.
                if (wordId > 0) {
                    rwf.put(wordIndexTable[wordId - 1] + wordIndex[wordId]++, translateUrl(urlId));
                } else {
                    rwf.put(wordIndex[wordId]++, translateUrl(urlId));
                }
            }
        };

        reader.read();
        rwf.write(urlsTmpFileChannel);
    }

    urlsTmpFileChannel.force(false);
    logger.debug("URL TMP Table: {} Mb", channel.position()/(1024*1024));

    if (wordIndexTable.length > 0) {
        logger.debug("Sorting urls table");
        sortUrls(wordIndexTable);
        urlsTmpFileMap.force();
    }
    else {
        logger.warn("urls table empty -- nothing to sort");
    }

    long idx = 0;
    try (var urlsFileMap = MultimapFileLong.forOutput(urlsFile.toPath(), 1024)) {
        var writer = new BTreeWriter(urlsFileMap, urlsBTreeContext);

        // Guard the [0] access: an empty table previously threw an
        // ArrayIndexOutOfBoundsException that was silently swallowed below.
        if (wordIndexTable.length > 0 && wordIndexTable[0] != 0) {
            int start = 0;
            int end = (int) wordIndexTable[0];

            idx += writer.write(idx, (int) wordIndexTable[0],
                    offset -> urlsFileMap.transferFromFileChannel(urlsTmpFileChannel, offset, start, end));
        }

        for (int i = 1; i < wordIndexTable.length; i++) {
            if (wordIndexTable[i] != wordIndexTable[i - 1]) {
                long start = wordIndexTable[i-1];
                long end = wordIndexTable[i];

                idx += writer.write(idx, (int) (end-start),
                        offset -> urlsFileMap.transferFromFileChannel(urlsTmpFileChannel, offset, start, end));
            }
        }
    } catch (Exception e) {
        // Best-effort as before (errors were previously only printed to stderr),
        // but route through the class logger so the failure is visible in logs.
        logger.error("Failed to write BTree urls table", e);
    }
}
/**
 * Sorts each word's url range in the temp file, in place.
 * Ranges are delimited by consecutive entries of {@code wordIndices}.
 */
@SneakyThrows
private void sortUrls(long[] wordIndices) {
    // Robustness: the original indexed [0] unconditionally and threw
    // ArrayIndexOutOfBoundsException on an empty table.
    if (wordIndices.length == 0)
        return;

    urlTmpFileSorter.sort( 0, (int) wordIndices[0]);

    for (int i = 1; i < wordIndices.length; i++) {
        urlTmpFileSorter.sort(wordIndices[i-1], (int) (wordIndices[i] - wordIndices[i-1]));
    }
}
/**
 * Two-pass construction of the words table: first tally every word occurrence,
 * then rewind the channel and emit the table file.  Returns the cumulative
 * offset table used to lay out the URLs file.
 */
private long[] createWordIndexTable(File outputFileWords, FileChannel inputChannel) throws Exception {
    inputChannel.position(FILE_HEADER_SIZE);

    logger.debug("Table size = {}", wordCount);
    var tableWriter = new WordsTableWriter(wordCount);
    var readBuffer = ByteBuffer.allocateDirect(8*SearchIndexWriterImpl.MAX_BLOCK_SIZE);

    logger.debug("Reading words");

    new IndexReader(readBuffer, inputChannel) {
        @Override
        public void eachWord(long urlId, int wordId) {
            tableWriter.acceptWord(wordId);
        }
    }.read();

    logger.debug("Rearranging table");

    // Rewind so the writer re-reads the input from the start of the data section.
    inputChannel.position(FILE_HEADER_SIZE);

    tableWriter.write(outputFileWords);

    return tableWriter.getTable();
}
@RequiredArgsConstructor
private class IndexReader {
private final ByteBuffer buffer;
@ -193,7 +332,7 @@ public class SearchIndexConverter {
public void eachUrl(Lock lock, int count, long urlId) throws IOException {
for (int i = 0; i < count; i++) {
int wordId = buffer.getInt();
if (acceptWord(lock, urlId, wordId, i, block.id)) {
if (acceptWord(lock, urlId)) {
eachWord(urlId, wordId);
}
}
@ -201,183 +340,16 @@ public class SearchIndexConverter {
public void eachWord(long urlId, int wordId) throws IOException {
}
}
private long getUrlsSize(ByteBuffer buffer, RandomAccessFile raf) throws IOException {
raf.seek(FILE_HEADER_SIZE);
boolean acceptWord(Lock lock, long urlId) {
int domainId = (int) (urlId >>> 32L);
var channel = raf.getChannel();
var reader = new IndexReader(buffer, channel) {
public long size;
@Override
public void eachWord(long urlId, int wordId) {
size++;
}
};
reader.read();
logger.info("Blacklist filtered {} URLs", reader.filtered);
logger.debug("URLs Size {} Mb", channel.position()/(1024*1024));
return reader.size;
}
private void createUrlTable(Path tmpFileDir, ByteBuffer buffer, RandomAccessFile raf, long[] wordIndexTable) throws IOException {
logger.debug("Table size = {}", wordIndexTable.length);
int[] wordIndex = new int[wordIndexTable.length];
raf.seek(FILE_HEADER_SIZE);
var channel = raf.getChannel();
try (RandomWriteFunnel rwf = new RandomWriteFunnel(tmpFileDir, urlsFileSize, 10_000_000)) {
var reader = new IndexReader(buffer, channel) {
@Override
public void eachWord(long urlId, int wordId) throws IOException {
if (wordId >= wordIndex.length)
return;
if (wordId != 0) {
if (!(wordIndexTable[wordId - 1] + wordIndex[wordId] <= wordIndexTable[wordId])) {
logger.error("Crazy state: wordId={}, index={}, lower={}, upper={}",
wordId,
wordIndex[wordId],
wordIndexTable[wordId - 1],
wordIndexTable[wordId]);
throw new IllegalStateException();
}
}
if (wordId > 0) {
rwf.put(wordIndexTable[wordId - 1] + wordIndex[wordId]++, translateUrl(urlId));
} else {
rwf.put(wordIndex[wordId]++, translateUrl(urlId));
}
}
};
reader.read();
rwf.write(urlsTmpFileChannel);
}
urlsTmpFileChannel.force(false);
logger.debug("URL TMP Table: {} Mb", channel.position()/(1024*1024));
if (wordIndexTable.length > 0) {
logger.debug("Sorting urls table");
sortUrls(wordIndexTable);
urlsTmpFileMap.force();
}
else {
logger.warn("urls table empty -- nothing to sort");
}
long idx = 0;
var copyBuffer = ByteBuffer.allocateDirect(4096);
try (var urlsFileMap = MultimapFileLong.forOutput(urlsFile.toPath(), 1024)) {
var writer = new BTreeWriter(urlsFileMap, urlsBTreeContext);
if (wordIndexTable[0] != 0) {
int start = 0;
int end = (int) wordIndexTable[0];
idx += writer.write(idx, (int) wordIndexTable[0],
offset -> urlsFileMap.transferFromFileChannel(urlsTmpFileChannel, offset, start, end));
if (!partitioner.filterUnsafe(lock, domainId, bucketId)) {
return false;
}
for (int i = 1; i < wordIndexTable.length; i++) {
if (wordIndexTable[i] != wordIndexTable[i - 1]) {
long start = wordIndexTable[i-1];
long end = wordIndexTable[i];
idx += writer.write(idx, (int) (end-start),
offset -> urlsFileMap.transferFromFileChannel(urlsTmpFileChannel, offset, start, end));
}
}
} catch (Exception e) {
e.printStackTrace();
return true;
}
logger.warn("BTrees generated");
}
// Copies longs [sourceStart, sourceEnd) from sourceChannel into dest at destOffset,
// chunked through the supplied buffer.  tbw/bw count transferred/read units.
//
// NOTE(review): the offset arithmetic mixes byte and long units and looks
// inconsistent -- the first loop reads at sourceStart*8 + bw (bytes) but never
// advances by tbw, while the tail loop reads at sourceStart + bw without the *8
// scaling.  Presumably only exercised with transfers that fit one buffer;
// verify before reuse.  (This method was removed in the accompanying commit.)
public void transfer(ByteBuffer buffer, MultimapFileLong dest, FileChannel sourceChannel, long destOffset, long sourceStart, long sourceEnd) throws IOException {
    int tbw = 0; // total longs written so far
    buffer.limit(Math.min(buffer.capacity(), (int)(sourceEnd - sourceStart)*8));

    // Full-buffer chunks while more than one buffer's worth of longs remain.
    while (sourceEnd - sourceStart - tbw > buffer.limit()/8) {
        int bw = 0; // bytes read into the buffer this chunk
        while (buffer.position() < buffer.limit()) {
            int r = sourceChannel.read(buffer, sourceStart*8 + bw);
            if (r < 0) {
                throw new IOException("");
            }
            bw += r;
        }
        buffer.flip();
        dest.write(buffer.asLongBuffer(), destOffset + tbw);
        tbw += bw/8;

        buffer.clear();
        // NOTE(review): tbw is in longs but compared against byte offsets here -- confirm.
        buffer.limit(Math.min(buffer.capacity(), (int)(sourceEnd*8 - sourceStart*8 - tbw)));
    }

    // Tail: whatever remains fits in a single buffer.
    buffer.clear();
    buffer.limit((int)(sourceEnd - (sourceStart + tbw))*8);

    int bw = 0;
    while (bw < buffer.limit()) {
        bw += sourceChannel.read(buffer, sourceStart + bw);
    }

    buffer.flip();
    dest.write(buffer.asLongBuffer(), destOffset + tbw);
}
@SneakyThrows
private void sortUrls(long[] wordIndices) {
urlTmpFileSorter.sort( 0, (int) wordIndices[0]);
for (int i = 1; i < wordIndices.length; i++) {
urlTmpFileSorter.sort(wordIndices[i-1], (int) (wordIndices[i] - wordIndices[i-1]));
}
}
private long[] createWordIndexTable(File outputFileWords, FileChannel inputChannel) throws Exception {
inputChannel.position(FILE_HEADER_SIZE);
logger.debug("Table size = {}", wordCount);
WordsTableWriter wordsTableWriter = new WordsTableWriter(wordCount);
ByteBuffer buffer = ByteBuffer.allocateDirect(8*SearchIndexWriterImpl.MAX_BLOCK_SIZE);
logger.debug("Reading words");
var reader = new IndexReader(buffer, inputChannel) {
@Override
public void eachWord(long urlId, int wordId) {
wordsTableWriter.acceptWord(wordId);
}
};
reader.read();
logger.debug("Rearranging table");
inputChannel.position(FILE_HEADER_SIZE);
wordsTableWriter.write(outputFileWords);
return wordsTableWriter.getTable();
}
// Admits a word occurrence only when the url's domain is partitioned into this
// bucket.  The domain id occupies the high 32 bits of the combined url id.
// (wordId/wordIdx/block are unused but kept for caller compatibility.)
boolean acceptWord(Lock lock, long urlId, int wordId, int wordIdx, int block) {
    final int domainId = (int) (urlId >>> 32L);
    return partitioner.filterUnsafe(lock, domainId, bucketId);
}
}

View file

@ -18,7 +18,14 @@ DROP VIEW IF EXISTS EC_URL_PART_HASH;
DROP TABLE IF EXISTS EC_URL_WORD;
DROP TABLE IF EXISTS EC_DICTIONARY;
DROP TABLE IF EXISTS DOMAIN_METADATA;
-- Per-domain crawl statistics: counts of discovered, fetched, and
-- quality-accepted URLs.  ID matches the corresponding domain row's id.
CREATE TABLE IF NOT EXISTS DOMAIN_METADATA (
    ID INT PRIMARY KEY,
    KNOWN_URLS INT DEFAULT 0,
    VISITED_URLS INT DEFAULT 0,
    GOOD_URLS INT DEFAULT 0
);
CREATE TABLE IF NOT EXISTS EC_TOP_DOMAIN (
ID INT PRIMARY KEY AUTO_INCREMENT,

View file

@ -1,8 +0,0 @@
import org.junit.jupiter.api.Test;
// Placeholder test class (removed in this commit).
public class EmptyTest {
    @Test
    public void test() {
        // Intentionally empty: exists only so the test task has something to run.
    }
}

View file

@ -1,5 +1,6 @@
package nu.marginalia.util;
import nu.marginalia.wmsa.configuration.WmsaHome;
import nu.marginalia.wmsa.edge.crawler.domain.language.conf.LanguageModels;
import java.nio.file.Files;
@ -7,10 +8,9 @@ import java.nio.file.Path;
import java.util.Optional;
public class TestLanguageModels {
private static final Path LANGUAGE_MODELS_DEFAULT = Path.of("/home/vlofgren/Work/ngrams/");
public static LanguageModels getLanguageModels() {
private static final Path LANGUAGE_MODELS_DEFAULT = WmsaHome.getHomePath().resolve("model");
public static Path getLanguageModelsPath() {
final Path languageModelsHome = Optional.ofNullable(System.getenv("LANGUAGE_MODELS_HOME"))
.map(Path::of)
.orElse(LANGUAGE_MODELS_DEFAULT);
@ -18,14 +18,20 @@ public class TestLanguageModels {
if (!Files.isDirectory(languageModelsHome)) {
throw new IllegalStateException("Could not find $LANGUAGE_MODELS_HOME, see doc/language-models.md");
}
return languageModelsHome;
}
public static LanguageModels getLanguageModels() {
var languageModelsHome = getLanguageModelsPath();
return new LanguageModels(
languageModelsHome.resolve("ngrams-generous-emstr.bin"),
languageModelsHome.resolve("tfreq-generous-emstr.bin"),
languageModelsHome.resolve("opennlp-en-ud-ewt-sentence-1.0-1.9.3.bin"),
languageModelsHome.resolve("opennlp-sentence.bin"),
languageModelsHome.resolve("English.RDR"),
languageModelsHome.resolve("English.DICT"),
languageModelsHome.resolve("opennlp-tok.bin")
languageModelsHome.resolve("opennlp-tokens.bin")
);
}
}

View file

@ -18,8 +18,13 @@ public class TestUtil {
@SneakyThrows
public static HikariDataSource getConnection() {
return getConnection("jdbc:mysql://localhost:3306/WMSA_test");
}
@SneakyThrows
public static HikariDataSource getConnection(String connString) {
HikariConfig config = new HikariConfig();
config.setJdbcUrl("jdbc:mysql://localhost:3306/WMSA_test");
config.setJdbcUrl(connString);
config.setUsername("wmsa");
config.setPassword("wmsa");
config.setMaximumPoolSize(16);
@ -29,6 +34,7 @@ public class TestUtil {
return new HikariDataSource(config);
}
@SneakyThrows
public static void evalScript(HikariDataSource hds, String scriptFile) {

View file

@ -0,0 +1,69 @@
package nu.marginalia.wmsa.configuration;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeEach;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import static org.junit.jupiter.api.Assertions.assertThrows;
/** Tests for HostsFile parsing: one well-formed file plus three malformed-line cases. */
class HostsFileTest {
    Path tempFile;

    @BeforeEach
    public void setUp() throws IOException {
        tempFile = Files.createTempFile(getClass().getSimpleName(), ".tmp");
    }

    @AfterEach
    public void tearDown() throws IOException {
        // Bug fix: this previously called Files.createTempFile() again, leaking
        // two temp files per test instead of cleaning up the one from setUp().
        Files.deleteIfExists(tempFile);
    }

    @Test
    public void testParseSunnyDay() throws IOException {
        Files.writeString(tempFile, """
                # Comment
                edge-index 192.168.0.1
                edge-search 192.168.1.1
                auth 127.0.0.55
                """);
        var hf = new HostsFile(tempFile);
        Assertions.assertEquals("192.168.0.1", hf.getHost(ServiceDescriptor.EDGE_INDEX));
    }

    @Test
    public void testTooLong() throws IOException {
        // More than two tokens on a line must be rejected.
        Files.writeString(tempFile, """
                edge-index 192.168.0.1 this is where my homie lives
                """);
        assertThrows(IllegalArgumentException.class, () -> new HostsFile(tempFile));
    }

    @Test
    public void testTooShort() throws IOException {
        // A service name without a host must be rejected.
        Files.writeString(tempFile, """
                edge-index
                """);
        assertThrows(IllegalArgumentException.class, () -> new HostsFile(tempFile));
    }

    @Test
    public void testBadName() throws IOException {
        // An unknown service name must be rejected.
        Files.writeString(tempFile, """
                garum-factory 127.0.0.1
                """);
        assertThrows(IllegalArgumentException.class, () -> new HostsFile(tempFile));
    }
}

View file

@ -42,7 +42,7 @@ class DictionaryWriterTest {
System.out.println(hitsTotal);
}
*/
@Test @Disabled
@Test @Disabled @SneakyThrows
public void convert() {
new SearchIndexConverter(IndexBlock.Title, 0, Path.of("/tmp"),
new File("/home/vlofgren/page-index-0.dat"),

View file

@ -1,5 +1,6 @@
package nu.marginalia.wmsa.edge.index.service;
import lombok.SneakyThrows;
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
import nu.marginalia.wmsa.edge.index.service.index.SearchIndexConverter;
import nu.marginalia.wmsa.edge.index.service.query.SearchIndexPartitioner;
@ -16,7 +17,7 @@ class SearchIndexConverterTest {
private final Logger logger = LoggerFactory.getLogger(getClass());
@Test @Disabled
@Test @Disabled @SneakyThrows
public void test() {
// File dictFile = new File("/home/vlofgren/dictionary.dat");
File inFile = new File("/home/vlofgren/Work/converter/3/page-index.dat");

View file

@ -63,7 +63,7 @@ class SearchIndexWriterTest {
return reader.findWord(block, budget, lv->true, dictionaryWriter.getReadOnly(word)).stream().toArray();
}
@Test
@Test @SneakyThrows
void put() throws IOException {
writer.put(new EdgeId<>(0), new EdgeId<>(1), IndexBlock.Words, Arrays.asList("Hello", "Salvete", "everyone!", "This", "is", "Bob"));
writer.put(new EdgeId<>(0), new EdgeId<>(2), IndexBlock.Words, Arrays.asList("Salvete", "omnes!", "Bob", "sum", "Hello"));

View file

@ -3,7 +3,7 @@ package nu.marginalia.wmsa.edge.search.query;
import nu.marginalia.util.TestLanguageModels;
import nu.marginalia.wmsa.edge.assistant.dict.NGramDict;
import nu.marginalia.wmsa.edge.crawler.domain.language.conf.LanguageModels;
import org.junit.BeforeClass;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
@ -17,7 +17,7 @@ class BodyQueryParserTest {
private static EnglishDictionary englishDictionary;
private static final LanguageModels lm = TestLanguageModels.getLanguageModels();
@BeforeClass
@BeforeAll
public static void init() {
dict = new NGramDict(lm);
englishDictionary = new EnglishDictionary(dict);

View file

@ -12,8 +12,8 @@ import nu.marginalia.wmsa.memex.system.MemexFileSystemModifiedTimes;
import nu.marginalia.wmsa.memex.system.MemexFileWriter;
import nu.marginalia.wmsa.memex.system.MemexGitRepo;
import nu.marginalia.wmsa.memex.system.MemexSourceFileSystem;
import org.junit.BeforeClass;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.mockito.Mockito;
@ -40,7 +40,7 @@ class GemtextChangeTest {
static final Logger logger = LoggerFactory.getLogger(GemtextChangeTest.class);
@BeforeClass
@BeforeAll
public static void init() {
RxJavaPlugins.setErrorHandler(e -> {

View file

@ -14,8 +14,8 @@ import nu.marginalia.wmsa.memex.system.MemexFileSystemModifiedTimes;
import nu.marginalia.wmsa.memex.system.MemexFileWriter;
import nu.marginalia.wmsa.memex.system.MemexGitRepo;
import nu.marginalia.wmsa.memex.system.MemexSourceFileSystem;
import org.junit.BeforeClass;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.mockito.Mockito;
@ -47,7 +47,7 @@ class GemtextTaskUpdateTest {
static final Logger logger = LoggerFactory.getLogger(GemtextTaskUpdateTest.class);
@BeforeClass
@BeforeAll
public static void init() {
RxJavaPlugins.setErrorHandler(e -> {

View file

@ -11,8 +11,8 @@ import nu.marginalia.wmsa.memex.system.MemexFileSystemModifiedTimes;
import nu.marginalia.wmsa.memex.system.MemexFileWriter;
import nu.marginalia.wmsa.memex.system.MemexGitRepo;
import nu.marginalia.wmsa.memex.system.MemexSourceFileSystem;
import org.junit.BeforeClass;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.mockito.Mockito;
@ -38,7 +38,7 @@ class GemtextTombstoneUpdateCaclulatorTest {
static final Logger logger = LoggerFactory.getLogger(GemtextTombstoneUpdateCaclulatorTest.class);
@BeforeClass
@BeforeAll
public static void init() {
RxJavaPlugins.setErrorHandler(e -> {