Merge branch 'encyclopedia-service' into master

commit 75c4986532
37 changed files with 878 additions and 1601 deletions
@@ -0,0 +1,70 @@
package nu.marginalia.wmsa.edge;

import nu.marginalia.wmsa.configuration.ServiceDescriptor;
import org.slf4j.LoggerFactory;
import org.testcontainers.containers.BindMode;
import org.testcontainers.containers.GenericContainer;
import org.testcontainers.containers.MariaDBContainer;
import org.testcontainers.containers.Network;
import org.testcontainers.containers.output.Slf4jLogConsumer;
import org.testcontainers.containers.wait.strategy.Wait;
import org.testcontainers.utility.MountableFile;

import java.nio.file.Files;
import java.nio.file.Path;
import java.time.Duration;

public abstract class E2ETestBase {
    public Network network = Network.newNetwork();

    public MariaDBContainer<?> getMariaDBContainer() {
        return new MariaDBContainer<>("mariadb")
                .withDatabaseName("WMSA_prod")
                .withUsername("wmsa")
                .withPassword("wmsa")
                .withInitScript("sql/edge-crawler-cache.sql")
                .withNetwork(network)
                .withNetworkAliases("mariadb");
    }

    public GenericContainer<?> forService(ServiceDescriptor service, GenericContainer<?> mariaDB) {
        return new GenericContainer<>("openjdk:17-alpine")
                .dependsOn(mariaDB)
                .withCopyFileToContainer(jarFile(), "/WMSA.jar")
                .withCopyFileToContainer(MountableFile.forClasspathResource("init.sh"), "/init.sh")
                .withExposedPorts(service.port)
                .withFileSystemBind(modelsPath(), "/var/lib/wmsa/model", BindMode.READ_ONLY)
                .withNetwork(network)
                .withNetworkAliases(service.name)
                .withLogConsumer(new Slf4jLogConsumer(LoggerFactory.getLogger(service.name)))
                .withCommand("sh", "init.sh", service.name)
                .waitingFor(Wait.forHttp("/internal/ping")
                        .forPort(service.port)
                        .withReadTimeout(Duration.ofSeconds(15)))
                ;
    }

    public static MountableFile jarFile() {
        Path cwd = Path.of(System.getProperty("user.dir"));

        cwd = cwd.resolve("..");
        var jarFile = cwd.resolve("build/libs/wmsa-SNAPSHOT-all.jar");
        if (!Files.exists(jarFile)) {
            System.err.println("Could not find jarFile " + jarFile);
            throw new RuntimeException();
        }
        else {
            System.out.println("jar file = " + jarFile);
        }
        return MountableFile.forHostPath(jarFile);
    }

    public static String modelsPath() {
        Path modelsPath = Path.of(System.getProperty("user.dir")).resolve("data/models");
        if (!Files.isDirectory(modelsPath)) {
            System.err.println("Could not find models, looked in " + modelsPath.toAbsolutePath());
            throw new RuntimeException();
        }
        return modelsPath.toString();
    }
}
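For orientation before the EdgeSearchE2ETest hunks below: a minimal sketch, not part of the commit, of how a test class is expected to compose the helpers introduced above. The class name is hypothetical; getMariaDBContainer(), forService() and EDGE_SEARCH are the names this change actually adds.

package nu.marginalia.wmsa.edge;

import org.junit.jupiter.api.Tag;
import org.testcontainers.containers.GenericContainer;
import org.testcontainers.junit.jupiter.Container;
import org.testcontainers.junit.jupiter.Testcontainers;

import static nu.marginalia.wmsa.configuration.ServiceDescriptor.EDGE_SEARCH;

// Hypothetical example class, illustrative only.
@Tag("e2e")
@Testcontainers
public class ExampleE2ETest extends E2ETestBase {
    // Shared database container built from the base class helper
    @Container
    public GenericContainer<?> mariaDB = getMariaDBContainer();

    // The service container joins the shared network, mounts WMSA.jar via init.sh,
    // and waits for /internal/ping before the test body runs
    @Container
    public GenericContainer<?> searchContainer = forService(EDGE_SEARCH, mariaDB);
}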
@@ -2,7 +2,6 @@ package nu.marginalia.wmsa.edge;

import nu.marginalia.util.test.TestUtil;
import nu.marginalia.wmsa.configuration.ServiceDescriptor;
import nu.marginalia.wmsa.edge.crawling.CrawlJobExtractorMain;
import org.jsoup.Jsoup;
import org.junit.jupiter.api.Tag;

@@ -19,7 +18,6 @@ import org.testcontainers.junit.jupiter.Container;
import org.testcontainers.junit.jupiter.Testcontainers;
import org.testcontainers.utility.MountableFile;

import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;

@@ -28,28 +26,19 @@ import java.util.ArrayList;
import java.util.List;

import static nu.marginalia.wmsa.configuration.ServiceDescriptor.*;
import static org.testcontainers.containers.BrowserWebDriverContainer.VncRecordingMode.RECORD_ALL;

@Tag("e2e")
@Testcontainers
public class EdgeSearchE2ETest {
    Network network = Network.newNetwork();
public class EdgeSearchE2ETest extends E2ETestBase {
    @Container
    public GenericContainer<?> mariaDB = getMariaDBContainer();

    @Container
    public GenericContainer<?> mariaDB = new MariaDBContainer<>("mariadb")
            .withDatabaseName("WMSA_prod")
            .withUsername("wmsa")
            .withPassword("wmsa")
            .withInitScript("sql/edge-crawler-cache.sql")
            .withNetwork(network)
            .withNetworkAliases("mariadb");

    public GenericContainer<?> searchContainer = forService(EDGE_SEARCH, mariaDB);
    @Container
    public GenericContainer<?> searchContainer = forService(EDGE_SEARCH);
    public GenericContainer<?> assistantContainer = forService(EDGE_ASSISTANT, mariaDB);
    @Container
    public GenericContainer<?> assistantContainer = forService(EDGE_ASSISTANT);
    @Container
    public GenericContainer<?> indexContainer = forService(EDGE_INDEX);
    public GenericContainer<?> indexContainer = forService(EDGE_INDEX, mariaDB);

    @Container
    public NginxContainer<?> mockWikipedia = new NginxContainer<>("nginx:stable")

@@ -88,46 +77,7 @@ public class EdgeSearchE2ETest {
            .withNetwork(network)
            .withNetworkAliases("proxyNginx");
            ;
    public GenericContainer<?> forService(ServiceDescriptor service) {
        return new GenericContainer<>("openjdk:17-alpine")
                .dependsOn(mariaDB)
                .withCopyFileToContainer(jarFile(), "/WMSA.jar")
                .withCopyFileToContainer(MountableFile.forClasspathResource("init.sh"), "/init.sh")
                .withExposedPorts(service.port)
                .withFileSystemBind(modelsPath(), "/var/lib/wmsa/model", BindMode.READ_ONLY)
                .withNetwork(network)
                .withNetworkAliases(service.name)
                .withLogConsumer(new Slf4jLogConsumer(LoggerFactory.getLogger(service.name)))
                .withCommand("sh", "init.sh", service.name)
                .waitingFor(Wait.forHttp("/internal/ping")
                        .forPort(service.port)
                        .withReadTimeout(Duration.ofSeconds(15)))
                ;
    }

    public static MountableFile jarFile() {
        Path cwd = Path.of(System.getProperty("user.dir"));

        cwd = cwd.resolve("..");
        var jarFile = cwd.resolve("build/libs/wmsa-SNAPSHOT-all.jar");
        if (!Files.exists(jarFile)) {
            System.err.println("Could not find jarFile " + jarFile);
            throw new RuntimeException();
        }
        else {
            System.out.println("jar file = " + jarFile);
        }
        return MountableFile.forHostPath(jarFile);
    }

    public static String modelsPath() {
        Path modelsPath = Path.of(System.getProperty("user.dir")).resolve("data/models");
        if (!Files.isDirectory(modelsPath)) {
            System.err.println("Could not find models, looked in " + modelsPath.toAbsolutePath());
            throw new RuntimeException();
        }
        return modelsPath.toString();
    }
    public static MountableFile ipDatabasePath() {
        Path modelsPath = Path.of(System.getProperty("user.dir")).resolve("data/models/IP2LOC/IP2LOCATION-LITE-DB1.CSV");
        if (!Files.isRegularFile(modelsPath)) {
@@ -0,0 +1,151 @@
package nu.marginalia.wmsa.edge;

import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import nu.marginalia.wmsa.edge.assistant.dict.WikiArticles;
import okhttp3.OkHttpClient;
import okhttp3.Request;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Tag;
import org.junit.jupiter.api.Test;
import org.mariadb.jdbc.Driver;
import org.openqa.selenium.By;
import org.openqa.selenium.chrome.ChromeOptions;
import org.slf4j.LoggerFactory;
import org.testcontainers.containers.*;
import org.testcontainers.containers.output.Slf4jLogConsumer;
import org.testcontainers.containers.wait.strategy.Wait;
import org.testcontainers.junit.jupiter.Container;
import org.testcontainers.junit.jupiter.Testcontainers;
import org.testcontainers.utility.MountableFile;

import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.file.Path;
import java.sql.DriverManager;
import java.sql.SQLException;
import java.sql.Types;
import java.time.Duration;
import java.util.concurrent.TimeUnit;

import static nu.marginalia.wmsa.configuration.ServiceDescriptor.ENCYCLOPEDIA;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;

@Tag("e2e")
@Testcontainers
public class EncyclopediaE2ETest extends E2ETestBase {
    @Container
    public MariaDBContainer<?> mariaDB = getMariaDBContainer();

    @Container
    public GenericContainer<?> encyclopediaContainer = forService(ENCYCLOPEDIA, mariaDB);
    @Container
    public GenericContainer<?> encyclopediaLoader = new GenericContainer<>("openjdk:17")
            .dependsOn(encyclopediaContainer)
            .dependsOn(mariaDB)
            .withNetwork(network)
            .withLogConsumer(new Slf4jLogConsumer(LoggerFactory.getLogger("encyclopedia-loader")))
            .withCopyFileToContainer(jarFile(), "/WMSA.jar")
            .withCopyFileToContainer(MountableFile.forClasspathResource("load-encyclopedia.sh"), "/load-encyclopedia.sh")
            .withFileSystemBind(getModelData().toString(), "/data", BindMode.READ_ONLY)
            .withCommand("sh", "load-encyclopedia.sh")
            .waitingFor(Wait.forLogMessage(".*ALL DONE.*", 1).withStartupTimeout(Duration.ofMinutes(10)));

    @Container
    public NginxContainer<?> proxyNginx = new NginxContainer<>("nginx:stable")
            .dependsOn(encyclopediaLoader)
            .dependsOn(encyclopediaContainer)
            .withLogConsumer(new Slf4jLogConsumer(LoggerFactory.getLogger("nginx")))
            .withCopyFileToContainer(MountableFile.forClasspathResource("nginx/encyclopedia.conf"), "/etc/nginx/conf.d/default.conf")
            .withNetwork(network)
            .withNetworkAliases("proxyNginx");

    @Container
    public BrowserWebDriverContainer<?> chrome = new BrowserWebDriverContainer<>()
            .withNetwork(network)
            .withCapabilities(new ChromeOptions());

    private Gson gson = new GsonBuilder().create();
    private OkHttpClient httpClient = new OkHttpClient.Builder()
            .connectTimeout(100, TimeUnit.MILLISECONDS)
            .readTimeout(6000, TimeUnit.SECONDS)
            .retryOnConnectionFailure(true)
            .followRedirects(true)
            .build();

    private Path getModelData() {
        return Path.of(System.getProperty("user.dir")).resolve("data/test");
    }

    @Test
    public void run() throws MalformedURLException {
        new Driver();

        try (var conn = DriverManager.getConnection(mariaDB.getJdbcUrl(), "wmsa", "wmsa");
             var stmt = conn.prepareStatement("INSERT IGNORE INTO REF_WIKI_TITLE(NAME,REF_NAME) VALUES (?,?)")) {

            stmt.setString(1, "Forg");
            stmt.setString(2, "Frog");
            stmt.executeUpdate();

            stmt.setString(1, "Frog");
            stmt.setNull(2, Types.VARCHAR);
            stmt.executeUpdate();

        } catch (SQLException e) {
            throw new RuntimeException(e);
        }

        var driver = chrome.getWebDriver();

        driver.get("http://proxyNginx/wiki/Frog");
        System.out.println(driver.getTitle());
        driver.get("http://proxyNginx/wiki-search?query=Forg");
        System.out.println(driver.getTitle());

        assertTrue(get(encyclopediaContainer.getHost(),
                encyclopediaContainer.getMappedPort(ENCYCLOPEDIA.port),
                "/wiki/has?url=Frog", Boolean.class));

        assertFalse(get(encyclopediaContainer.getHost(),
                encyclopediaContainer.getMappedPort(ENCYCLOPEDIA.port),
                "/wiki/has?url=Marginalia", Boolean.class));

        assertFalse(get(encyclopediaContainer.getHost(),
                encyclopediaContainer.getMappedPort(ENCYCLOPEDIA.port),
                "/wiki/has?url=Marginalia", Boolean.class));


        var resultsForMarginalia = get(encyclopediaContainer.getHost(),
                encyclopediaContainer.getMappedPort(ENCYCLOPEDIA.port),
                "/encyclopedia/Marginalia", WikiArticles.class);
        Assertions.assertTrue(resultsForMarginalia.getEntries().isEmpty());

        var resultsForFrog = get(encyclopediaContainer.getHost(),
                encyclopediaContainer.getMappedPort(ENCYCLOPEDIA.port),
                "/encyclopedia/Frog", WikiArticles.class);
        Assertions.assertFalse(resultsForFrog.getEntries().isEmpty());

        var resultsForFoRg = get(encyclopediaContainer.getHost(),
                encyclopediaContainer.getMappedPort(ENCYCLOPEDIA.port),
                "/encyclopedia/Forg", WikiArticles.class);
        Assertions.assertFalse(resultsForFoRg.getEntries().isEmpty());
    }

    private <T> T get(String host, Integer mappedPort, String path, Class<T> clazz) throws MalformedURLException {
        var req = new Request.Builder().get().url(new URL("http", host, mappedPort, path)).build();
        var call = httpClient.newCall(req);
        try (var rsp = call.execute()) {
            return gson.fromJson(rsp.body().charStream(), clazz);
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }
}
@@ -3,7 +3,7 @@
mkdir -p /var/lib/wmsa/conf/
mkdir -p /var/lib/wmsa/data/

cat > /var/lib/wmsa/db.properties <<EOF
cat > /var/lib/wmsa/conf/db.properties <<EOF
db.user=wmsa
db.pass=wmsa
db.conn=jdbc:mariadb://mariadb:3306/WMSA_prod?rewriteBatchedStatements=true

@@ -1,5 +1,6 @@
#!/bin/bash

mkdir -p /var/lib/wmsa/encyclopedia
mkdir -p /var/lib/wmsa/conf
mkdir -p /var/lib/wmsa/index/write
mkdir -p /var/lib/wmsa/index/read

@@ -21,7 +22,11 @@ many
year
EOF

cat > /var/lib/wmsa/db.properties <<EOF
cat > /var/lib/wmsa/conf/disks.properties <<EOF
encyclopedia=/var/lib/wmsa/encyclopedia
EOF

cat > /var/lib/wmsa/conf/db.properties <<EOF
db.user=wmsa
db.pass=wmsa
db.conn=jdbc:mariadb://mariadb:3306/WMSA_prod?rewriteBatchedStatements=true

@@ -51,7 +56,7 @@ smhi-scraper smhi-scraper
podcast-scraper podcast-scraper
edge-index edge-index
edge-search edge-search
edge-archive edge-archive
encyclopedia encyclopedia
edge-assistant edge-assistant
memex memex
dating dating
marginalia_nu/src/e2e/resources/load-encyclopedia.sh (new file, 32 lines)

@@ -0,0 +1,32 @@
#!/bin/bash

mkdir -p /var/lib/wmsa/conf/
mkdir -p /var/lib/wmsa/data/
mkdir -p /data

cat > /var/lib/wmsa/conf/db.properties <<EOF
db.user=wmsa
db.pass=wmsa
db.conn=jdbc:mariadb://mariadb:3306/WMSA_prod?rewriteBatchedStatements=true
EOF

cat > /var/lib/wmsa/conf/hosts <<EOF
# service-name host-name
resource-store resource-store
renderer renderer
auth auth
api api
smhi-scraper smhi-scraper
podcast-scraper podcast-scraper
edge-index edge-index
edge-search edge-search
encyclopedia encyclopedia
edge-assistant edge-assistant
memex memex
dating dating
EOF

java -cp WMSA.jar nu.marginalia.wmsa.edge.tools.EncyclopediaLoaderTool data/wikipedia_en_100_nopic.zim

echo "ALL DONE"
marginalia_nu/src/e2e/resources/nginx/encyclopedia.conf (new file, 40 lines)

@@ -0,0 +1,40 @@
server {
    listen 80;
    listen [::]:80;
    server_name nginx;

    location /wiki/ {
        rewrite ^ $request_uri;
        rewrite ^/(.*) /public/$1 break;
        return 400;
        proxy_pass http://encyclopedia:5040$uri;

        proxy_set_header X-Context $remote_addr-$connection;
        proxy_set_header X-Public "1";
        proxy_set_header X-Extern-Url $scheme://$host$request_uri;
        proxy_set_header X-Extern-Domain $scheme://$host;
        proxy_set_header X-User-Agent $http_user_agent;

        tcp_nodelay on;
    }
    location /wiki-search {
        rewrite ^ $request_uri;
        rewrite ^/(.*) /public/$1 break;
        return 400;
        proxy_pass http://encyclopedia:5040$uri;

        proxy_set_header X-Context $remote_addr-$connection;
        proxy_set_header X-Public "1";
        proxy_set_header X-Extern-Url $scheme://$host$request_uri;
        proxy_set_header X-Extern-Domain $scheme://$host;
        proxy_set_header X-User-Agent $http_user_agent;

        tcp_nodelay on;
    }

    location / {
        proxy_pass http://encyclopedia:5040/;
        tcp_nodelay on;
    }
}
@@ -6,11 +6,11 @@ import nu.marginalia.wmsa.configuration.command.Command;
import nu.marginalia.wmsa.configuration.command.ListCommand;
import nu.marginalia.wmsa.configuration.command.StartCommand;
import nu.marginalia.wmsa.configuration.command.VersionCommand;
import nu.marginalia.wmsa.edge.archive.EdgeArchiveMain;
import nu.marginalia.wmsa.edge.assistant.EdgeAssistantMain;
import nu.marginalia.wmsa.edge.dating.DatingMain;
import nu.marginalia.wmsa.edge.index.EdgeIndexMain;
import nu.marginalia.wmsa.edge.search.EdgeSearchMain;
import nu.marginalia.wmsa.encyclopedia.EncyclopediaMain;
import nu.marginalia.wmsa.memex.MemexMain;
import nu.marginalia.wmsa.podcasts.PodcastScraperMain;
import nu.marginalia.wmsa.renderer.RendererMain;

@@ -33,11 +33,12 @@ public enum ServiceDescriptor {

    EDGE_INDEX("edge-index", 5021, EdgeIndexMain.class),
    EDGE_SEARCH("edge-search", 5023, EdgeSearchMain.class),
    EDGE_ARCHIVE("edge-archive", 5024, EdgeArchiveMain.class),
    EDGE_ASSISTANT("edge-assistant", 5025, EdgeAssistantMain.class),

    EDGE_MEMEX("memex", 5030, MemexMain.class),

    ENCYCLOPEDIA("encyclopedia", 5040, EncyclopediaMain.class),

    DATING("dating", 5070, DatingMain.class),

    TEST_1("test-1", 0, null),
@@ -3,6 +3,7 @@ package nu.marginalia.wmsa.configuration;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Properties;

public class WmsaHome {
    private static final String DEFAULT = "/var/lib/wmsa";

@@ -32,4 +33,27 @@ public class WmsaHome {
    public static Path getIPLocationDatabse() {
        return getHomePath().resolve("data").resolve("IP2LOCATION-LITE-DB1.CSV");
    }

    public static Path getDisk(String name) throws IOException {
        Path p = Path.of(getDiskProperties().getProperty(name));
        if (!Files.isDirectory(p)) {
            throw new IOException(name + " does not exist!");
        }
        return p;
    }

    public static Properties getDiskProperties() throws IOException {
        Path settingsFile = getHomePath().resolve("conf/disks.properties");

        if (Files.isRegularFile(settingsFile)) {
            try (var is = Files.newInputStream(settingsFile)) {
                var props = new Properties();
                props.load(is);
                return props;
            }
        }
        else {
            throw new IOException("Could not find disk settings " + settingsFile);
        }
    }
}
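The getDisk/getDiskProperties additions above read the conf/disks.properties file that the init scripts in this commit now write. A minimal sketch, not part of the diff, of how a service could resolve its storage root through it; the example class and variable names are illustrative, while WmsaHome.getDisk and the "encyclopedia" key come from this change:

import java.io.IOException;
import java.nio.file.Path;

import nu.marginalia.wmsa.configuration.WmsaHome;

class DiskLookupExample {
    public static void main(String[] args) throws IOException {
        // "encyclopedia" is defined as encyclopedia=/var/lib/wmsa/encyclopedia
        // in conf/disks.properties by the init scripts above
        Path encyclopediaDir = WmsaHome.getDisk("encyclopedia");
        System.out.println("encyclopedia data root: " + encyclopediaDir);
    }
}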
@@ -35,7 +35,7 @@ public class DatabaseModule extends AbstractModule {
    }

    private Properties loadDbProperties() {
        Path propDir = WmsaHome.getHomePath().resolve("db.properties");
        Path propDir = WmsaHome.getHomePath().resolve("conf/db.properties");
        if (!Files.isRegularFile(propDir)) {
            throw new IllegalStateException("Database properties file " + propDir + " does not exist");
        }
@@ -1,33 +0,0 @@
package nu.marginalia.wmsa.edge.archive;

import com.google.inject.Guice;
import com.google.inject.Inject;
import com.google.inject.Injector;
import nu.marginalia.wmsa.configuration.MainClass;
import nu.marginalia.wmsa.configuration.ServiceDescriptor;
import nu.marginalia.wmsa.configuration.module.ConfigurationModule;
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
import nu.marginalia.wmsa.configuration.server.Initialization;

public class EdgeArchiveMain extends MainClass {
    private final EdgeArchiveService service;

    @Inject
    public EdgeArchiveMain(EdgeArchiveService service) {
        this.service = service;
    }

    public static void main(String... args) {
        init(ServiceDescriptor.EDGE_ARCHIVE, args);

        Injector injector = Guice.createInjector(
                new EdgeArchiveModule(),
                new ConfigurationModule(),
                new DatabaseModule()
        );

        injector.getInstance(EdgeArchiveMain.class);
        injector.getInstance(Initialization.class).setReady();

    }
}
@@ -1,15 +0,0 @@
package nu.marginalia.wmsa.edge.archive;

import com.google.inject.AbstractModule;
import com.google.inject.name.Names;

import java.nio.file.Path;

public class EdgeArchiveModule extends AbstractModule {
    public void configure() {
        bind(Path.class).annotatedWith(Names.named("archive-path")).toInstance(Path.of("/var/lib/wmsa/archive/webpage/"));
        bind(Path.class).annotatedWith(Names.named("wiki-path")).toInstance(Path.of("/var/lib/wmsa/archive.fast/wiki/"));
        bind(Integer.class).annotatedWith(Names.named("archive-size")).toInstance(10_000);
    }

}
@@ -1,180 +0,0 @@
|
|||
package nu.marginalia.wmsa.edge.archive;
|
||||
|
||||
import com.google.gson.Gson;
|
||||
import com.google.gson.GsonBuilder;
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.name.Named;
|
||||
import io.prometheus.client.Histogram;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.wmsa.configuration.server.Initialization;
|
||||
import nu.marginalia.wmsa.configuration.server.MetricsServer;
|
||||
import nu.marginalia.wmsa.configuration.server.Service;
|
||||
import nu.marginalia.wmsa.edge.archive.archiver.ArchivedFile;
|
||||
import nu.marginalia.wmsa.edge.archive.archiver.Archiver;
|
||||
import nu.marginalia.wmsa.edge.archive.request.EdgeArchiveSubmissionReq;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import spark.Request;
|
||||
import spark.Response;
|
||||
import spark.Spark;
|
||||
|
||||
import java.io.FileInputStream;
|
||||
import java.io.FileOutputStream;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.zip.GZIPInputStream;
|
||||
import java.util.zip.GZIPOutputStream;
|
||||
|
||||
public class EdgeArchiveService extends Service {
|
||||
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
private final Gson gson = new GsonBuilder().create();
|
||||
|
||||
private static final Histogram wmsa_archive_store_time = Histogram.build().name("wmsa_archive_store_time").help("-").register();
|
||||
private static final Histogram wmsa_archive_fetch_time = Histogram.build().name("wmsa_archive_fetch_time").help("-").register();
|
||||
|
||||
private final Path wikiPath;
|
||||
private final Archiver archiver;
|
||||
|
||||
@SneakyThrows
|
||||
@Inject
|
||||
public EdgeArchiveService(@Named("service-host") String ip,
|
||||
@Named("service-port") Integer port,
|
||||
@Named("wiki-path") Path wikiPath,
|
||||
Archiver archiver,
|
||||
Initialization initialization,
|
||||
MetricsServer metricsServer)
|
||||
{
|
||||
super(ip, port, initialization, metricsServer);
|
||||
this.wikiPath = wikiPath;
|
||||
this.archiver = archiver;
|
||||
|
||||
Spark.staticFiles.expireTime(600);
|
||||
|
||||
Spark.post("/page/submit", this::pathPageSubmit);
|
||||
|
||||
Spark.post("/wiki/submit", this::pathWikiSubmit);
|
||||
Spark.get("/wiki/has", this::pathWikiHas);
|
||||
Spark.get("/wiki/get", this::pathWikiGet);
|
||||
|
||||
Spark.awaitInitialization();
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
private Object pathPageSubmit(Request request, Response response) {
|
||||
var timer = wmsa_archive_store_time.startTimer();
|
||||
try {
|
||||
var body = request.body();
|
||||
var data = gson.fromJson(body, EdgeArchiveSubmissionReq.class);
|
||||
|
||||
String domainNamePart = data.getUrl().domain.domain.length() > 32 ? data.getUrl().domain.domain.substring(0, 32) : data.getUrl().domain.domain;
|
||||
String fileName = String.format("%s-%10d", domainNamePart, data.getUrl().hashCode());
|
||||
|
||||
archiver.writeData(new ArchivedFile(fileName, body.getBytes()));
|
||||
|
||||
return "ok";
|
||||
} finally {
|
||||
timer.observeDuration();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private Object pathWikiSubmit(Request request, Response response) {
|
||||
var timer = wmsa_archive_store_time.startTimer();
|
||||
|
||||
try {
|
||||
byte[] data = request.bodyAsBytes();
|
||||
|
||||
String wikiUrl = request.queryParams("url");
|
||||
Path filename = getWikiFilename(wikiPath, wikiUrl);
|
||||
|
||||
Files.createDirectories(filename.getParent());
|
||||
|
||||
System.out.println(new String(data));
|
||||
logger.debug("Writing {} to {}", wikiUrl, filename);
|
||||
|
||||
try (var gos = new GZIPOutputStream(new FileOutputStream(filename.toFile()))) {
|
||||
gos.write(data);
|
||||
gos.flush();
|
||||
}
|
||||
|
||||
return "ok";
|
||||
} finally {
|
||||
timer.observeDuration();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
private Path getWikiFilename(Path base, String url) {
|
||||
Path p = base;
|
||||
|
||||
int urlHash = url.hashCode();
|
||||
|
||||
p = p.resolve(Integer.toString(urlHash & 0xFF));
|
||||
p = p.resolve(Integer.toString((urlHash>>>8) & 0xFF));
|
||||
p = p.resolve(Integer.toString((urlHash>>>16) & 0xFF));
|
||||
p = p.resolve(Integer.toString((urlHash>>>24) & 0xFF));
|
||||
|
||||
String fileName = url.chars()
|
||||
.mapToObj(this::encodeUrlChar)
|
||||
.collect(Collectors.joining());
|
||||
|
||||
if (fileName.length() > 128) {
|
||||
fileName = fileName.substring(0, 128) + (((long)urlHash)&0xFFFFFFFFL);
|
||||
}
|
||||
|
||||
return p.resolve(fileName + ".gz");
|
||||
}
|
||||
|
||||
|
||||
private String encodeUrlChar(int i) {
|
||||
if (i >= 'a' && i <= 'z') {
|
||||
return Character.toString(i);
|
||||
}
|
||||
if (i >= 'A' && i <= 'Z') {
|
||||
return Character.toString(i);
|
||||
}
|
||||
if (i >= '0' && i <= '9') {
|
||||
return Character.toString(i);
|
||||
}
|
||||
if (i == '.') {
|
||||
return Character.toString(i);
|
||||
}
|
||||
else {
|
||||
return String.format("%%%2X", i);
|
||||
}
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
private Object pathWikiHas(Request request, Response response) {
|
||||
return Files.exists(getWikiFilename(wikiPath, request.queryParams("url")));
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private String pathWikiGet(Request request, Response response) {
|
||||
var timer = wmsa_archive_fetch_time.startTimer();
|
||||
|
||||
try {
|
||||
String url = request.queryParams("url");
|
||||
|
||||
var filename = getWikiFilename(wikiPath, url);
|
||||
|
||||
if (Files.exists(filename)) {
|
||||
try (var stream = new GZIPInputStream(new FileInputStream(filename.toFile()))) {
|
||||
return new String(stream.readAllBytes());
|
||||
}
|
||||
} else {
|
||||
Spark.halt(404);
|
||||
return null;
|
||||
}
|
||||
}
|
||||
finally {
|
||||
timer.observeDuration();
|
||||
}
|
||||
}
|
||||
}
|
|
@@ -1,65 +0,0 @@
|
|||
package nu.marginalia.wmsa.edge.archive.archiver;
|
||||
|
||||
import com.google.gson.Gson;
|
||||
import com.google.gson.GsonBuilder;
|
||||
import nu.marginalia.wmsa.edge.archive.request.EdgeArchiveSubmissionReq;
|
||||
import nu.marginalia.wmsa.edge.model.crawl.EdgeRawPageContents;
|
||||
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
|
||||
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
|
||||
import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.BufferedInputStream;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.function.Consumer;
|
||||
|
||||
public class ArchiveExtractor {
|
||||
private final Path archivePath;
|
||||
private final String arhivePattern = "archive-%04d.tar.gz";
|
||||
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
private final Gson gson = new GsonBuilder().create();
|
||||
|
||||
public ArchiveExtractor(Path archivePath) {
|
||||
this.archivePath = archivePath;
|
||||
|
||||
}
|
||||
|
||||
public void forEach(Consumer<EdgeRawPageContents> contents) {
|
||||
for (int i = 0; ; ++i) {
|
||||
var fn = getArchiveFile(i);
|
||||
logger.info("{}", fn);
|
||||
if (!Files.exists(fn)) {
|
||||
break;
|
||||
}
|
||||
try (var stream = new TarArchiveInputStream(new GzipCompressorInputStream(new BufferedInputStream(new FileInputStream(fn.toFile()))))) {
|
||||
TarArchiveEntry entry;
|
||||
while ((entry = stream.getNextTarEntry()) != null) {
|
||||
if (entry.isFile()) {
|
||||
try {
|
||||
var obj = gson.fromJson(new InputStreamReader(stream), EdgeArchiveSubmissionReq.class);
|
||||
if (obj != null) {
|
||||
contents.accept(obj.getData());
|
||||
}
|
||||
}
|
||||
catch (Exception ex) {
|
||||
logger.error("Could not unpack {} - {} {}", entry.getName(), ex.getClass().getSimpleName(), ex.getMessage());
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (Exception e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private Path getArchiveFile(int number) {
|
||||
final String fileName = String.format(arhivePattern, number);
|
||||
return archivePath.resolve(fileName);
|
||||
}
|
||||
}
|
||||
|
|
@@ -1,5 +0,0 @@
package nu.marginalia.wmsa.edge.archive.archiver;


public record ArchivedFile(String filename, byte[] data) {
}
@@ -1,113 +0,0 @@
|
|||
package nu.marginalia.wmsa.edge.archive.archiver;
|
||||
|
||||
import com.google.inject.name.Named;
|
||||
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
|
||||
import org.apache.commons.compress.archivers.tar.TarArchiveOutputStream;
|
||||
import org.apache.commons.compress.compressors.gzip.GzipCompressorOutputStream;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import javax.inject.Inject;
|
||||
import javax.inject.Singleton;
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.FileOutputStream;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.concurrent.LinkedBlockingDeque;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
@Singleton
|
||||
public class Archiver implements AutoCloseable {
|
||||
private final Path archivePath;
|
||||
private final int filesPerArchive;
|
||||
private final String arhivePattern = "archive-%04d.tar.gz";
|
||||
|
||||
private final LinkedBlockingDeque<ArchivedFile> writeQueue = new LinkedBlockingDeque<>(10);
|
||||
private final Thread writeThread;
|
||||
|
||||
private volatile int archiveNumber;
|
||||
private volatile boolean running;
|
||||
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
|
||||
@Inject
|
||||
public Archiver(@Named("archive-path") Path archivePath, @Named("archive-size") Integer filesPerArchive) {
|
||||
this.archivePath = archivePath;
|
||||
this.filesPerArchive = filesPerArchive;
|
||||
|
||||
if (!Files.exists(archivePath)) {
|
||||
throw new IllegalArgumentException("Archive path does not exist");
|
||||
}
|
||||
for (int i = 0;; ++i) {
|
||||
if (!Files.exists(getArchiveFile(i))) {
|
||||
archiveNumber = i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
running = true;
|
||||
writeThread = new Thread(this::writeThreadMain, "ArchiveWriteThread");
|
||||
writeThread.start();
|
||||
}
|
||||
|
||||
private Path getArchiveFile(int number) {
|
||||
final String fileName = String.format(arhivePattern, number);
|
||||
return archivePath.resolve(fileName);
|
||||
}
|
||||
|
||||
public void writeData(ArchivedFile file) throws InterruptedException {
|
||||
if (!running) throw new IllegalStateException("Archiver is closing or closed");
|
||||
writeQueue.put(file);
|
||||
}
|
||||
|
||||
private void writeThreadMain() {
|
||||
try {
|
||||
while (running || !writeQueue.isEmpty()) {
|
||||
writeToFile(archiveNumber);
|
||||
archiveNumber++;
|
||||
}
|
||||
running = false;
|
||||
}
|
||||
catch (Exception ex) {
|
||||
logger.error("Uncaught exception in writer thread!!");
|
||||
}
|
||||
}
|
||||
|
||||
private void writeToFile(int archiveNumber) {
|
||||
var archiveFile = getArchiveFile(archiveNumber);
|
||||
|
||||
logger.info("Switching to file {}", archiveFile);
|
||||
|
||||
try (TarArchiveOutputStream taos = new TarArchiveOutputStream(new GzipCompressorOutputStream(new FileOutputStream(archiveFile.toFile())))) {
|
||||
for (int i = 0; i < filesPerArchive; i++) {
|
||||
|
||||
ArchivedFile writeJob = null;
|
||||
while (writeJob == null) {
|
||||
writeJob = writeQueue.poll(1, TimeUnit.SECONDS);
|
||||
if (!running) return;
|
||||
}
|
||||
|
||||
var entry = new TarArchiveEntry(String.format("%06d-%s", i, writeJob.filename()));
|
||||
entry.setSize(writeJob.data().length);
|
||||
taos.putArchiveEntry(entry);
|
||||
logger.debug("Writing {} to {}", writeJob.filename(), archiveFile);
|
||||
try (var bais = new ByteArrayInputStream(writeJob.data())) {
|
||||
IOUtils.copy(bais, taos);
|
||||
}
|
||||
taos.closeArchiveEntry();
|
||||
}
|
||||
taos.finish();
|
||||
logger.debug("Finishing {}", archiveFile);
|
||||
} catch (Exception e) {
|
||||
logger.error("Error", e);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() throws Exception {
|
||||
running = false;
|
||||
writeThread.join();
|
||||
}
|
||||
}
|
|
@@ -1,56 +0,0 @@
|
|||
package nu.marginalia.wmsa.edge.archive.client;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import io.reactivex.rxjava3.core.Observable;
|
||||
import nu.marginalia.wmsa.client.AbstractDynamicClient;
|
||||
import nu.marginalia.wmsa.client.HttpStatusCode;
|
||||
import nu.marginalia.wmsa.configuration.ServiceDescriptor;
|
||||
import nu.marginalia.wmsa.configuration.server.Context;
|
||||
import nu.marginalia.wmsa.edge.archive.request.EdgeArchiveSubmissionReq;
|
||||
import nu.marginalia.wmsa.edge.model.crawl.EdgeRawPageContents;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeUrl;
|
||||
import okhttp3.MediaType;
|
||||
import org.eclipse.jetty.util.UrlEncoded;
|
||||
|
||||
import javax.annotation.CheckReturnValue;
|
||||
import java.util.concurrent.Semaphore;
|
||||
|
||||
@Singleton
|
||||
public class ArchiveClient extends AbstractDynamicClient {
|
||||
|
||||
private final Semaphore submitPageSem = new Semaphore(3, true);
|
||||
|
||||
@Inject
|
||||
public ArchiveClient() {
|
||||
super(ServiceDescriptor.EDGE_ARCHIVE);
|
||||
}
|
||||
|
||||
@CheckReturnValue
|
||||
public void submitPage(Context ctx, EdgeUrl url, EdgeRawPageContents data) throws InterruptedException {
|
||||
try {
|
||||
submitPageSem.acquire();
|
||||
super.post(ctx, "/page/submit", new EdgeArchiveSubmissionReq(url, data)).blockingSubscribe();
|
||||
}
|
||||
finally {
|
||||
submitPageSem.release();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@CheckReturnValue
|
||||
public Observable<HttpStatusCode> submitWiki(Context ctx, String url, String data) {
|
||||
return super.post(ctx, "/wiki/submit?url="+UrlEncoded.encodeString(url), data, MediaType.parse("text/plain; charset=UTF-8"));
|
||||
}
|
||||
|
||||
@CheckReturnValue
|
||||
public Observable<Boolean> hasWiki(Context ctx, String url) {
|
||||
return super.get(ctx, "/wiki/has?url="+UrlEncoded.encodeString(url), Boolean.class);
|
||||
}
|
||||
|
||||
@CheckReturnValue
|
||||
public Observable<String> getWiki(Context ctx, String url) {
|
||||
return super.get(ctx, "/wiki/get?url="+UrlEncoded.encodeString(url));
|
||||
}
|
||||
|
||||
}
|
|
@@ -1,13 +0,0 @@
package nu.marginalia.wmsa.edge.archive.request;

import lombok.AllArgsConstructor;
import lombok.Getter;
import lombok.ToString;
import nu.marginalia.wmsa.edge.model.crawl.EdgeRawPageContents;
import nu.marginalia.wmsa.edge.model.EdgeUrl;

@AllArgsConstructor @Getter @ToString
public class EdgeArchiveSubmissionReq {
    EdgeUrl url;
    EdgeRawPageContents data;
}
@@ -4,36 +4,27 @@ import com.google.gson.Gson;
|
|||
import com.google.gson.GsonBuilder;
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.name.Named;
|
||||
import io.reactivex.rxjava3.core.Observable;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.wmsa.configuration.server.*;
|
||||
import nu.marginalia.wmsa.edge.archive.client.ArchiveClient;
|
||||
import nu.marginalia.wmsa.configuration.server.Initialization;
|
||||
import nu.marginalia.wmsa.configuration.server.MetricsServer;
|
||||
import nu.marginalia.wmsa.configuration.server.Service;
|
||||
import nu.marginalia.wmsa.edge.assistant.dict.DictionaryService;
|
||||
import nu.marginalia.wmsa.edge.assistant.eval.MathParser;
|
||||
import nu.marginalia.wmsa.edge.assistant.eval.Units;
|
||||
import nu.marginalia.wmsa.edge.assistant.screenshot.ScreenshotService;
|
||||
import nu.marginalia.wmsa.edge.assistant.suggest.Suggestions;
|
||||
import nu.marginalia.wmsa.renderer.mustache.MustacheRenderer;
|
||||
import nu.marginalia.wmsa.renderer.mustache.RendererFactory;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import spark.Request;
|
||||
import spark.Response;
|
||||
import spark.Spark;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
public class EdgeAssistantService extends Service {
|
||||
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
private final Gson gson = new GsonBuilder().create();
|
||||
private final Units units;
|
||||
private final DictionaryService dictionaryService;
|
||||
private final MathParser mathParser;
|
||||
private final ArchiveClient archiveClient;
|
||||
private final ScreenshotService screenshotService;
|
||||
private final MustacheRenderer<String> wikiErrorPageRenderer;
|
||||
private final MustacheRenderer<Object> wikiSearchResultRenderer;
|
||||
private final Suggestions suggestions;
|
||||
|
||||
@SneakyThrows
|
||||
|
@@ -45,40 +36,22 @@ public class EdgeAssistantService extends Service {
|
|||
DictionaryService dictionaryService,
|
||||
MathParser mathParser,
|
||||
Units units,
|
||||
ArchiveClient archiveClient,
|
||||
RendererFactory rendererFactory,
|
||||
ScreenshotService screenshotService,
|
||||
Suggestions suggestions
|
||||
)
|
||||
{
|
||||
super(ip, port, initialization, metricsServer);
|
||||
this.dictionaryService = dictionaryService;
|
||||
this.mathParser = mathParser;
|
||||
this.units = units;
|
||||
this.archiveClient = archiveClient;
|
||||
this.screenshotService = screenshotService;
|
||||
this.suggestions = suggestions;
|
||||
|
||||
Spark.staticFiles.expireTime(600);
|
||||
|
||||
if (rendererFactory != null) {
|
||||
wikiErrorPageRenderer = rendererFactory.renderer("encyclopedia/wiki-error");
|
||||
wikiSearchResultRenderer = rendererFactory.renderer("encyclopedia/wiki-search");
|
||||
}
|
||||
else {
|
||||
wikiErrorPageRenderer = null;
|
||||
wikiSearchResultRenderer = null;
|
||||
}
|
||||
|
||||
Spark.get("/public/wiki/*", this::getWikiPage);
|
||||
Spark.get("/public/wiki-search", this::searchWikiPage);
|
||||
|
||||
Spark.get("/public/screenshot/:id", screenshotService::serveScreenshotRequest);
|
||||
Spark.get("/screenshot/:id", screenshotService::serveScreenshotRequest);
|
||||
|
||||
Spark.get("/dictionary/:word", (req, rsp) -> dictionaryService.define(req.params("word")), this::convertToJson);
|
||||
Spark.get("/spell-check/:term", (req, rsp) -> dictionaryService.spellCheck(req.params("term").toLowerCase()), this::convertToJson);
|
||||
Spark.get("/encyclopedia/:term", (req, rsp) -> dictionaryService.encyclopedia(req.params("term")), this::convertToJson);
|
||||
Spark.get("/unit-conversion", (req, rsp) -> unitConversion(
|
||||
rsp,
|
||||
req.queryParams("value"),
|
||||
|
@@ -106,57 +79,6 @@ public class EdgeAssistantService extends Service {
|
|||
return suggestions.getSuggestions(10, param);
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
private Object getWikiPage(Request req, Response rsp) {
|
||||
final var ctx = Context.fromRequest(req);
|
||||
|
||||
final String[] splats = req.splat();
|
||||
if (splats.length == 0)
|
||||
rsp.redirect("https://encyclopedia.marginalia.nu/wiki-start.html");
|
||||
|
||||
|
||||
final String s = splats[0];
|
||||
|
||||
String pageName = dictionaryService.resolveEncylopediaRedirect(s).orElse(s);
|
||||
logger.info("Resolved {} -> {}", s, pageName);
|
||||
return archiveClient.getWiki(ctx, pageName)
|
||||
.onErrorResumeWith(resolveWikiPageNameWrongCase(ctx, s))
|
||||
.blockingFirst();
|
||||
}
|
||||
|
||||
private Observable<String> resolveWikiPageNameWrongCase(Context ctx, String s) {
|
||||
var rsp = dictionaryService.findEncyclopediaPageDirect(s);
|
||||
if (rsp.isEmpty()) {
|
||||
return renderSearchPage(s);
|
||||
}
|
||||
return archiveClient.getWiki(ctx, rsp.get().getInternalName())
|
||||
.onErrorResumeWith(renderSearchPage(s));
|
||||
}
|
||||
|
||||
private Observable<String> renderSearchPage(String s) {
|
||||
return Observable.fromCallable(() -> wikiSearchResultRenderer.render(
|
||||
Map.of("query", s,
|
||||
"error", "true",
|
||||
"results", dictionaryService.findEncyclopediaPages(s))));
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
private Object searchWikiPage(Request req, Response rsp) {
|
||||
final var ctx = Context.fromRequest(req);
|
||||
|
||||
String term = req.queryParams("query");
|
||||
if (null == term) {
|
||||
rsp.redirect("https://encyclopedia.marginalia.nu/wiki-start.html");
|
||||
return "";
|
||||
}
|
||||
|
||||
return wikiSearchResultRenderer.render(
|
||||
Map.of("query", term,
|
||||
"results",
|
||||
dictionaryService.findEncyclopediaPages(term))
|
||||
);
|
||||
}
|
||||
|
||||
private Object evalExpression(Response rsp, String value) {
|
||||
try {
|
||||
var val = mathParser.evalFormatted(value);
|
||||
|
|
|
@@ -24,10 +24,6 @@ public class AssistantClient extends AbstractDynamicClient {
        return super.get(ctx,"/dictionary/" + UrlEncoded.encodeString(word), DictionaryResponse.class);
    }

    public Observable<WikiArticles> encyclopediaLookup(Context ctx, String word) {
        return super.get(ctx,"/encyclopedia/" + UrlEncoded.encodeString(word), WikiArticles.class);
    }

    @SuppressWarnings("unchecked")
    public Observable<List<String>> spellCheck(Context ctx, String word) {
        return (Observable<List<String>>) (Object) super.get(ctx,"/spell-check/" + UrlEncoded.encodeString(word), List.class);
@@ -43,142 +43,6 @@ public class DictionaryService {
|
|||
return response;
|
||||
}
|
||||
|
||||
public WikiArticles encyclopedia(String term) {
|
||||
WikiArticles response = new WikiArticles();
|
||||
response.entries = new ArrayList<>();
|
||||
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
var stmt = connection.prepareStatement("SELECT DISTINCT(NAME_LOWER) FROM REF_WIKI_TITLE WHERE NAME_LOWER=?");
|
||||
stmt.setString(1, term);
|
||||
|
||||
var rsp = stmt.executeQuery();
|
||||
while (rsp.next()) {
|
||||
response.entries.add(capitalizeWikiString(rsp.getString(1)));
|
||||
}
|
||||
}
|
||||
catch (Exception ex) {
|
||||
logger.error("Failed to fetch articles", ex);
|
||||
return new WikiArticles();
|
||||
}
|
||||
|
||||
return response;
|
||||
}
|
||||
|
||||
public Optional<String> resolveEncylopediaRedirect(String term) {
|
||||
final List<String> matches = new ArrayList<>();
|
||||
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
try (var stmt = connection.prepareStatement("SELECT NAME, REF_NAME FROM REF_WIKI_TITLE WHERE NAME_LOWER=LOWER(?)")) {
|
||||
stmt.setString(1, term);
|
||||
|
||||
var rsp = stmt.executeQuery();
|
||||
while (rsp.next()) {
|
||||
if (term.equals(rsp.getString(1))
|
||||
|| rsp.getString(2) == null) {
|
||||
return Optional.ofNullable(rsp.getString(2));
|
||||
} else {
|
||||
matches.add(rsp.getString(2));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
catch (Exception ex) {
|
||||
throw new RuntimeException(ex);
|
||||
}
|
||||
|
||||
if (!matches.isEmpty()) {
|
||||
return Optional.of(matches.get(0));
|
||||
}
|
||||
return Optional.empty();
|
||||
}
|
||||
|
||||
|
||||
public Optional<WikiSearchResult> findEncyclopediaPageDirect(String term) {
|
||||
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
|
||||
try (var stmt = connection.prepareStatement("SELECT NAME, REF_NAME FROM REF_WIKI_TITLE WHERE NAME_LOWER=LOWER(?)")) {
|
||||
stmt.setString(1, term.replace(' ', '_'));
|
||||
|
||||
var rsp = stmt.executeQuery();
|
||||
while (rsp.next()) {
|
||||
String name = rsp.getString(1);
|
||||
String refName = rsp.getString(2);
|
||||
|
||||
if (refName == null) {
|
||||
return Optional.of(new WikiSearchResult(name, null));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
catch (Exception ex) {
|
||||
throw new RuntimeException(ex);
|
||||
}
|
||||
|
||||
return Optional.empty();
|
||||
}
|
||||
|
||||
public List<WikiSearchResult> findEncyclopediaPages(String term) {
|
||||
final List<WikiSearchResult> directMatches = new ArrayList<>();
|
||||
final Set<WikiSearchResult> directSearchMatches = new HashSet<>();
|
||||
final Set<WikiSearchResult> indirectMatches = new HashSet<>();
|
||||
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
|
||||
try (var stmt = connection.prepareStatement("SELECT NAME, REF_NAME FROM REF_WIKI_TITLE WHERE NAME_LOWER=LOWER(?)")) {
|
||||
stmt.setString(1, term.replace(' ', '_'));
|
||||
|
||||
var rsp = stmt.executeQuery();
|
||||
while (rsp.next()) {
|
||||
String name = rsp.getString(1);
|
||||
String refName = rsp.getString(2);
|
||||
|
||||
if (refName == null) {
|
||||
directMatches.add(new WikiSearchResult(name, null));
|
||||
} else {
|
||||
indirectMatches.add(new WikiSearchResult(name, refName));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
try (var stmt = connection.prepareStatement("SELECT NAME, REF_NAME FROM REF_WIKI_TITLE WHERE NAME_LOWER LIKE ? LIMIT 10")) {
|
||||
stmt.setString(1, term.replace(' ', '_').replaceAll("%", "\\%").toLowerCase() + "%");
|
||||
|
||||
var rsp = stmt.executeQuery();
|
||||
while (rsp.next()) {
|
||||
String name = rsp.getString(1);
|
||||
String refName = rsp.getString(2);
|
||||
|
||||
if (refName == null) {
|
||||
directSearchMatches.add(new WikiSearchResult(name, null));
|
||||
} else {
|
||||
indirectMatches.add(new WikiSearchResult(name, refName));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
catch (Exception ex) {
|
||||
throw new RuntimeException(ex);
|
||||
}
|
||||
|
||||
directMatches.forEach(indirectMatches::remove);
|
||||
indirectMatches.removeAll(directSearchMatches);
|
||||
directMatches.forEach(directSearchMatches::remove);
|
||||
directMatches.addAll(indirectMatches);
|
||||
directMatches.addAll(directSearchMatches);
|
||||
return directMatches;
|
||||
}
|
||||
|
||||
private String capitalizeWikiString(String string) {
|
||||
if (string.contains("_")) {
|
||||
return Arrays.stream(string.split("_")).map(this::capitalizeWikiString).collect(Collectors.joining("_"));
|
||||
}
|
||||
if (string.length() < 2) {
|
||||
return string.toUpperCase();
|
||||
}
|
||||
return Character.toUpperCase(string.charAt(0)) + string.substring(1).toLowerCase();
|
||||
}
|
||||
|
||||
public List<String> spellCheck(String word) {
|
||||
return spellChecker.correct(word);
|
||||
}
|
||||
|
|
|
@@ -20,6 +20,7 @@ import nu.marginalia.wmsa.edge.search.results.SearchResultValuator;
import nu.marginalia.wmsa.edge.search.results.model.AccumulatedQueryResults;
import nu.marginalia.wmsa.edge.search.results.SearchResultDecorator;
import nu.marginalia.wmsa.edge.search.results.UrlDeduplicator;
import nu.marginalia.wmsa.encyclopedia.EncyclopediaClient;
import org.apache.logging.log4j.util.Strings;
import org.jetbrains.annotations.NotNull;
import org.slf4j.Logger;

@@ -33,6 +34,7 @@ public class EdgeSearchOperator {

    private static final Logger logger = LoggerFactory.getLogger(EdgeSearchOperator.class);
    private final AssistantClient assistantClient;
    private final EncyclopediaClient encyclopediaClient;
    private final EdgeDataStoreDao edgeDataStoreDao;
    private final EdgeIndexClient indexClient;
    private final QueryFactory queryFactory;

@@ -42,6 +44,7 @@ public class EdgeSearchOperator {

    @Inject
    public EdgeSearchOperator(AssistantClient assistantClient,
                              EncyclopediaClient encyclopediaClient,
                              EdgeDataStoreDao edgeDataStoreDao,
                              EdgeIndexClient indexClient,
                              QueryFactory queryFactory,

@@ -50,6 +53,7 @@ public class EdgeSearchOperator {
    ) {

        this.assistantClient = assistantClient;
        this.encyclopediaClient = encyclopediaClient;
        this.edgeDataStoreDao = edgeDataStoreDao;
        this.indexClient = indexClient;
        this.queryFactory = queryFactory;

@@ -220,7 +224,7 @@ public class EdgeSearchOperator {

    @NotNull
    private Observable<WikiArticles> getWikiArticle(Context ctx, String humanQuery) {
        return assistantClient
        return encyclopediaClient
                .encyclopediaLookup(ctx,
                        humanQuery.replaceAll("\\s+", "_")
                                .replaceAll("\"", "")
@@ -1,384 +0,0 @@
|
|||
package nu.marginalia.wmsa.edge.tools;
|
||||
|
||||
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import gnu.trove.map.hash.TIntObjectHashMap;
|
||||
import gnu.trove.map.hash.TObjectIntHashMap;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
|
||||
import nu.marginalia.wmsa.edge.archive.archiver.ArchiveExtractor;
|
||||
import nu.marginalia.wmsa.edge.assistant.dict.NGramDict;
|
||||
import nu.marginalia.wmsa.edge.converting.processor.logic.LinkParser;
|
||||
import nu.marginalia.util.language.LanguageFilter;
|
||||
import nu.marginalia.util.language.WordPatterns;
|
||||
import nu.marginalia.util.language.conf.LanguageModels;
|
||||
import nu.marginalia.util.language.processing.DocumentKeywordExtractor;
|
||||
import nu.marginalia.util.language.processing.SentenceExtractor;
|
||||
import nu.marginalia.wmsa.edge.converting.processor.logic.HtmlFeature;
|
||||
import nu.marginalia.wmsa.edge.converting.processor.logic.HtmlStandardExtractor;
|
||||
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
|
||||
import nu.marginalia.wmsa.edge.index.service.dictionary.DictionaryWriter;
|
||||
import nu.marginalia.wmsa.edge.index.service.index.SearchIndexWriterImpl;
|
||||
import nu.marginalia.wmsa.edge.model.*;
|
||||
import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard;
|
||||
import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet;
|
||||
import nu.marginalia.wmsa.edge.model.crawl.EdgeRawPageContents;
|
||||
import org.apache.commons.lang3.tuple.Pair;
|
||||
import org.jsoup.Jsoup;
|
||||
import org.jsoup.nodes.Document;
|
||||
import org.mariadb.jdbc.Driver;
|
||||
|
||||
import java.io.File;
|
||||
import java.nio.file.Path;
|
||||
import java.sql.SQLException;
|
||||
import java.util.*;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
import java.util.concurrent.LinkedBlockingQueue;
|
||||
|
||||
import static nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard.UNKNOWN;
|
||||
|
||||
public class ConverterMain {
|
||||
static final LinkedBlockingQueue<EdgeRawPageContents> processQueue = new LinkedBlockingQueue<>(20);
|
||||
static final LinkedBlockingQueue<UploadJob> uploadQueue = new LinkedBlockingQueue<>(2);
|
||||
|
||||
static final TObjectIntHashMap<String> urlToIdMap = new TObjectIntHashMap<>(50_000_000, 0.5f, -1);
|
||||
static final TObjectIntHashMap<String> domainToIdMap = new TObjectIntHashMap<>(5_000_000, 0.5f, -1);
|
||||
static final TIntObjectHashMap<String> idToDomainMap = new TIntObjectHashMap<>(5_000_000, 0.5f, -1);
|
||||
static HikariDataSource conn;
|
||||
|
||||
private static SearchIndexWriterImpl indexWriter;
|
||||
private static DictionaryWriter dictionaryWriter;
|
||||
|
||||
@AllArgsConstructor
|
||||
static class UploadJob {
|
||||
EdgeId<EdgeDomain> domainId;
|
||||
EdgeId<EdgeUrl> urlId;
|
||||
EdgePageWordSet words;
|
||||
int wordCount;
|
||||
}
|
||||
|
||||
static volatile boolean running = true;
|
||||
|
||||
public static void main(String... args) {
|
||||
org.mariadb.jdbc.Driver driver = new Driver();
|
||||
|
||||
dictionaryWriter = new DictionaryWriter(new File(args[0]), 1L << 30, true);
|
||||
indexWriter = new SearchIndexWriterImpl(dictionaryWriter, new File(args[1]));
|
||||
|
||||
new Thread(ConverterMain::uploadThread, "Uploader").start();
|
||||
|
||||
for (int i = 0; i < 24; i++) {
|
||||
new Thread(ConverterMain::processorThread, "Processor-"+i).start();
|
||||
}
|
||||
|
||||
conn = new DatabaseModule().provideConnection();
|
||||
|
||||
System.out.println("Loading URLs and domains");
|
||||
try (var c = conn.getConnection();
|
||||
var getUrlsStmt = c.prepareStatement("SELECT EC_URL.ID, DOMAIN_ID, PROTO, URL FROM EC_URL WHERE VISITED");
|
||||
var getDomainsStmt = c.prepareStatement("SELECT ID, URL_PART FROM EC_DOMAIN WHERE INDEXED>0")
|
||||
) {
|
||||
getUrlsStmt.setFetchSize(10_000);
|
||||
getDomainsStmt.setFetchSize(10_000);
|
||||
|
||||
System.out.println("Fetch domains");
|
||||
var domainRsp = getDomainsStmt.executeQuery();
|
||||
while (domainRsp.next()) {
|
||||
domainToIdMap.put(domainRsp.getString(2), domainRsp.getInt(1));
|
||||
idToDomainMap.put(domainRsp.getInt(1), domainRsp.getString(2));
|
||||
}
|
||||
|
||||
System.out.println("Fetch URLs");
|
||||
var urlRsp = getUrlsStmt.executeQuery();
|
||||
while (urlRsp.next()) {
|
||||
String urlStr = urlRsp.getString(3) + "://" + idToDomainMap.get(urlRsp.getInt(2)) + urlRsp.getString(4);
|
||||
urlToIdMap.put(urlStr, urlRsp.getInt(1));
|
||||
}
|
||||
} catch (SQLException throwables) {
|
||||
throwables.printStackTrace();
|
||||
}
|
||||
|
||||
// new Thread(ConverterMain::uploadThread, "Uploader").start();
|
||||
//
|
||||
// for (int i = 0; i < 24; i++) {
|
||||
// new Thread(ConverterMain::processorThread, "Processor-"+i).start();
|
||||
// }
|
||||
|
||||
System.out.println("Loaded URLs and domains");
|
||||
|
||||
new ArchiveExtractor(Path.of(args[2])).forEach(
|
||||
page -> {
|
||||
if (page.contentType.contentType.startsWith("application/xhtml")
|
||||
|| page.contentType.contentType.startsWith("text/html")) {
|
||||
try {
|
||||
int domainId = domainToIdMap.get(page.url.domain.toString());
|
||||
if (domainId >= 0 && page.redirectUrl == null) {
|
||||
int urlId = urlToIdMap.get(page.url.toString());
|
||||
int dataHash = page.data.hashCode();
|
||||
try (var c = conn.getConnection();
|
||||
var updateHash = c.prepareStatement("UPDATE EC_URL SET DATA_HASH=? WHERE ID=?"))
|
||||
{
|
||||
updateHash.setInt(1, dataHash);
|
||||
updateHash.setInt(2, urlId);
|
||||
updateHash.executeUpdate();
|
||||
}
|
||||
catch (Exception ex) {
|
||||
ex.printStackTrace();
|
||||
}
|
||||
}
|
||||
} catch (Exception e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
running = false;
|
||||
}
|
||||
|
||||
static final LanguageModels lm = new LanguageModels(
|
||||
Path.of("/var/lib/wmsa/model/ngrams-generous-emstr.bin"),
|
||||
Path.of("/var/lib/wmsa/model/tfreq-new-algo3.bin"),
|
||||
Path.of("/var/lib/wmsa/model/opennlp-sentence.bin"),
|
||||
Path.of("/var/lib/wmsa/model/English.RDR"),
|
||||
Path.of("/var/lib/wmsa/model/English.DICT"),
|
||||
Path.of("/var/lib/wmsa/model/opennlp-tok.bin")
|
||||
);
|
||||
static final NGramDict dict = new NGramDict(lm);
|
||||
|
||||
private static final LanguageFilter languageFilter = new LanguageFilter();
|
||||
private static final LinkParser linkParser = new LinkParser();
|
||||
public static void processorThread() {
|
||||
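// Worker loop: take raw pages off the process queue, extract sentences and keywords, add format/site/feature tags and outbound-link words, then hand the result to the upload queue.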
SentenceExtractor newSe = new SentenceExtractor(lm);
|
||||
DocumentKeywordExtractor documentKeywordExtractor = new DocumentKeywordExtractor(dict);
|
||||
|
||||
try {
|
||||
while (running || !processQueue.isEmpty()) {
|
||||
var job = processQueue.take();
|
||||
if (job.data.length() > 512*1024) {
|
||||
System.out.println(job.url + " too big, skipping");
continue;
|
||||
}
|
||||
|
||||
var parsed = Jsoup.parse(job.data);
|
||||
var text = parsed.text();
|
||||
|
||||
if (languageFilter.isBlockedUnicodeRange(text)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
var dld = newSe.extractSentences(parsed.clone());
|
||||
var keywords = documentKeywordExtractor.extractKeywords(dld);
|
||||
int wc = dld.totalNumWords();
|
||||
|
||||
if (wc > 100) {
|
||||
double languageAgreement = languageFilter.dictionaryAgreement(dld);
|
||||
if (languageAgreement < 0.05) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
EdgeHtmlStandard htmlStandard = HtmlStandardExtractor.parseDocType(parsed.documentType());
|
||||
if (UNKNOWN.equals(htmlStandard)) {
|
||||
htmlStandard = HtmlStandardExtractor.sniffHtmlStandard(parsed);
|
||||
}
|
||||
|
||||
int scriptTags = getScriptPenalty(parsed);
|
||||
var featureSet = getFeatureSet(parsed, scriptTags, job.hasCookies);
|
||||
addTags(keywords, htmlStandard, job.url, featureSet);
|
||||
|
||||
extractLinkWords(keywords, job.getUrl(), parsed);
|
||||
|
||||
uploadQueue.put(new UploadJob(
|
||||
new EdgeId<>(domainToIdMap.get(job.url.domain.toString())),
|
||||
new EdgeId<>(urlToIdMap.get(job.url.toString())),
|
||||
keywords,
|
||||
0
|
||||
));
|
||||
|
||||
}
|
||||
}
|
||||
catch (InterruptedException ex) {
|
||||
ex.printStackTrace();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private static Map<EdgeUrl, Set<String>> extractLinkWords(EdgePageWordSet keywords, EdgeUrl pageUrl, Document parsed) {
|
||||
|
||||
List<Pair<EdgeUrl, String>> urls = new ArrayList<>();
|
||||
Set<String> linkKeywords = new HashSet<>();
|
||||
Map<EdgeUrl, Set<String>> linkTextWords = new ConcurrentHashMap<>();
|
||||
|
||||
for (var tag : parsed.getElementsByTag("a")) {
|
||||
if (!tag.hasAttr("href")) {
|
||||
continue;
|
||||
}
|
||||
if (urls.size() > 100) {
|
||||
break;
|
||||
}
|
||||
|
||||
var linkOpt = linkParser.parseLink(pageUrl, tag);
|
||||
if (linkOpt.isEmpty())
|
||||
continue;
|
||||
|
||||
var link = linkOpt.get();
|
||||
|
||||
urls.add(Pair.of(link, tag.text()));
|
||||
|
||||
if (!Objects.equals(link.domain.domain, pageUrl.domain.domain)
|
||||
&& linkKeywords.size() <= 25)
|
||||
{
|
||||
linkKeywords.add("links:" + link.domain.domain);
|
||||
}
|
||||
//
|
||||
// Set<String> words = new HashSet<>();
|
||||
//
|
||||
// for (var sent : sentenceExtractor.extractSentencesFromString(tag.text())) {
|
||||
// for (var keyword : keywordExtractor.getWordsFromSentence(sent)) {
|
||||
// words.add(sent.constructWordFromSpan(keyword));
|
||||
// }
|
||||
// }
|
||||
//
|
||||
// linkTextWords.compute(link, (k, set) -> {
|
||||
// if (set == null) return words;
|
||||
// else { set.addAll(words); return set; }
|
||||
// });
|
||||
|
||||
}
|
||||
|
||||
keywords.get(IndexBlock.Meta).addAll(linkKeywords);
|
||||
|
||||
if (WordPatterns.wordQualitiesPredicate.test(pageUrl.domain.domain.toLowerCase())) {
|
||||
keywords.get(IndexBlock.Link).addJust(pageUrl.domain.domain.toLowerCase());
|
||||
}
|
||||
|
||||
return linkTextWords;
|
||||
}
|
||||
|
||||
private static int getScriptPenalty(Document parsed) {
|
||||
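// Heuristic script penalty: one point per script tag, one extra if the inline script calls createElement, plus one point per 1000 characters of script text.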
var scriptTags = parsed.getElementsByTag("script");
|
||||
String scriptText = scriptTags.html();
|
||||
int badScript = 0;
|
||||
if (scriptText.contains(".createElement(")) {
|
||||
badScript = 1;
|
||||
}
|
||||
return scriptTags.size() + badScript + (scriptText.length())/1000;
|
||||
}
|
||||
|
||||
static final List<String> trackers = List.of("adform.net",
|
||||
"connect.facebook",
|
||||
"googletagmanager.com",
|
||||
"googlesyndication.com",
|
||||
"google.com",
|
||||
"twitter.com",
|
||||
"smartadserver.com",
|
||||
"doubleclick.com",
|
||||
"2mdn.com",
|
||||
"dmtry.com",
|
||||
"bing.com",
|
||||
"msn.com",
|
||||
"amazon-adsystem.com",
|
||||
"alexametrics.com",
|
||||
"rubiconproject.com",
|
||||
"chango.com",
|
||||
"d5nxst8fruw4z.cloudfront.net",
|
||||
"d31qbv1cthcecs.cloudfront.net",
|
||||
"linkedin.com");
|
||||
|
||||
private static Set<HtmlFeature> getFeatureSet(Document parsed, int scriptTags, boolean cookies) {
|
||||
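// Derive page features: javascript use, embedded media, known tracker scripts (including Google Analytics), Amazon affiliate links, and cookie use.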
Set<HtmlFeature> features = new HashSet<>();
|
||||
|
||||
if (scriptTags > 0) {
|
||||
features.add(HtmlFeature.JS);
|
||||
}
|
||||
if (!parsed.getElementsByTag("object").isEmpty()
|
||||
|| !parsed.getElementsByTag("audio").isEmpty()
|
||||
|| !parsed.getElementsByTag("video").isEmpty()) {
|
||||
features.add(HtmlFeature.MEDIA);
|
||||
}
|
||||
if (parsed.getElementsByTag("script").stream()
|
||||
.filter(tag -> tag.attr("src") != null)
|
||||
.anyMatch(tag -> trackers.stream().anyMatch(tracker -> tag.attr("src").contains(tracker)))) {
|
||||
features.add(HtmlFeature.TRACKING);
|
||||
}
|
||||
if (parsed.getElementsByTag("script").html().contains("google-analytics.com")) {
|
||||
features.add(HtmlFeature.TRACKING);
|
||||
}
|
||||
if (parsed.getElementsByTag("a").stream().map(e -> e.attr("href"))
|
||||
.filter(Objects::nonNull)
|
||||
.map(String::toLowerCase)
|
||||
.anyMatch(href ->
|
||||
href.contains("amzn.to/") || href.contains("amazon.com/"))) {
|
||||
features.add(HtmlFeature.AFFILIATE_LINK);
|
||||
}
|
||||
if (cookies) {
|
||||
features.add(HtmlFeature.COOKIES);
|
||||
}
|
||||
|
||||
return features;
|
||||
}
|
||||
|
||||
private static void addTags(EdgePageWordSet wordSet, EdgeHtmlStandard htmlStandard, EdgeUrl url, Set<HtmlFeature> features) {
|
||||
List<String> tagWords = new ArrayList<>();
|
||||
tagWords.add("format:"+htmlStandard.toString().toLowerCase());
|
||||
tagWords.add("site:"+url.domain.toString().toLowerCase());
|
||||
tagWords.add("proto:"+url.proto.toLowerCase());
|
||||
tagWords.add("js:" + Boolean.toString(features.contains(HtmlFeature.JS)).toLowerCase());
|
||||
if (features.contains(HtmlFeature.MEDIA)) {
|
||||
tagWords.add("special:media");
|
||||
}
|
||||
if (features.contains(HtmlFeature.TRACKING)) {
|
||||
tagWords.add("special:tracking");
|
||||
}
|
||||
if (features.contains(HtmlFeature.AFFILIATE_LINK)) {
|
||||
tagWords.add("special:affiliate");
|
||||
}
|
||||
if (features.contains(HtmlFeature.COOKIES)) {
|
||||
tagWords.add("special:cookies");
|
||||
}
|
||||
wordSet.append(IndexBlock.Meta, tagWords);
|
||||
wordSet.append(IndexBlock.Words, tagWords);
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
public static void uploadThread() {
|
||||
|
||||
while (running || !processQueue.isEmpty() || !uploadQueue.isEmpty()) {
|
||||
var data = uploadQueue.take();
|
||||
|
||||
if (!data.words.isEmpty()) {
|
||||
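// Write each non-empty word block to the index, splitting blocks of 1000 words or more into chunks to bound the size of a single write.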
for (var words : data.words.values()) {
|
||||
if (!words.getWords().isEmpty()) {
|
||||
if (words.size() < 1000) {
|
||||
indexWriter.put(data.domainId, data.urlId, words.block, words.words);
|
||||
} else {
|
||||
chunks(words.words, 1000).forEach(chunk -> {
|
||||
indexWriter.put(data.domainId, data.urlId, words.block, chunk);
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
System.out.println("Closing");
|
||||
dictionaryWriter.commitToDisk();
|
||||
indexWriter.forceWrite();
|
||||
dictionaryWriter.close();
|
||||
indexWriter.close();
|
||||
System.out.println("Done");
|
||||
}
|
||||
|
||||
private static <T> List<List<T>> chunks(Collection<T> coll, int size) {
|
||||
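// Split a collection into consecutive sublists of at most 'size' elements; the final chunk may be shorter.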
List<List<T>> ret = new ArrayList<>();
|
||||
List<T> data = List.copyOf(coll);
|
||||
|
||||
for (int i = 0; i < data.size(); i+=size) {
|
||||
ret.add(data.subList(i, Math.min(data.size(), i+size)));
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
}
|
|
@@ -0,0 +1,59 @@
|
|||
package nu.marginalia.wmsa.edge.tools;
|
||||
|
||||
import nu.marginalia.wmsa.configuration.server.Context;
|
||||
import nu.marginalia.wmsa.edge.assistant.dict.WikiCleaner;
|
||||
import nu.marginalia.wmsa.encyclopedia.EncyclopediaClient;
|
||||
import org.openzim.ZIMTypes.ZIMFile;
|
||||
import org.openzim.ZIMTypes.ZIMReader;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.concurrent.*;
|
||||
|
||||
public class EncyclopediaLoaderTool {
|
||||
|
||||
static final EncyclopediaClient encyclopediaClient = new EncyclopediaClient();
|
||||
|
||||
public static void main(String[] args) throws IOException, InterruptedException {
|
||||
convertAll(args);
|
||||
encyclopediaClient.close();
|
||||
System.exit(0);
|
||||
}
|
||||
|
||||
private static void convertAll(String[] args) throws IOException, InterruptedException {
|
||||
var zr = new ZIMReader(new ZIMFile(args[0]));
|
||||
|
||||
var pool = Executors.newFixedThreadPool(8);
|
||||
var sem = new Semaphore(12);
|
||||
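// Throttle conversion: each article acquires a semaphore permit before being queued on the 8-thread pool and releases it when done; acquiring all 12 permits afterwards waits for in-flight work to finish.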
zr.forEachArticles((url, art) -> {
|
||||
if (art != null) {
|
||||
try {
|
||||
sem.acquire();
|
||||
|
||||
pool.execute(() -> {
|
||||
try {
|
||||
convert(url, art);
|
||||
} finally {
|
||||
sem.release();
|
||||
}
|
||||
});
|
||||
} catch (InterruptedException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
}, p -> true);
|
||||
|
||||
sem.acquire(12);
|
||||
|
||||
encyclopediaClient.close();
|
||||
}
|
||||
|
||||
private static void convert(String url, String art) {
|
||||
String newData = new WikiCleaner().cleanWikiJunk("https://en.wikipedia.org/wiki/" + url, art);
|
||||
|
||||
if (null != newData) {
|
||||
encyclopediaClient.submitWiki(Context.internal(), url, newData)
|
||||
.retry(5)
|
||||
.blockingSubscribe();
|
||||
}
|
||||
}
|
||||
}
|
|
@@ -1,142 +0,0 @@
|
|||
package nu.marginalia.wmsa.edge.tools;
|
||||
|
||||
|
||||
import gnu.trove.set.hash.TLongHashSet;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.wmsa.edge.archive.archiver.ArchiveExtractor;
|
||||
import nu.marginalia.wmsa.edge.assistant.dict.NGramDict;
|
||||
import nu.marginalia.util.language.conf.LanguageModels;
|
||||
import nu.marginalia.util.language.processing.KeywordExtractor;
|
||||
import nu.marginalia.util.language.processing.SentenceExtractor;
|
||||
import nu.marginalia.wmsa.edge.model.crawl.EdgeRawPageContents;
|
||||
import opennlp.tools.stemmer.PorterStemmer;
|
||||
import org.jsoup.Jsoup;
|
||||
|
||||
import java.io.BufferedOutputStream;
|
||||
import java.io.DataOutputStream;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Path;
|
||||
import java.util.*;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
import java.util.concurrent.LinkedBlockingQueue;
|
||||
import java.util.concurrent.atomic.AtomicLong;
|
||||
|
||||
public class TermFrequencyCounterMain {
|
||||
|
||||
static final LinkedBlockingQueue<EdgeRawPageContents> processQueue = new LinkedBlockingQueue<>(20);
|
||||
|
||||
public static final String OUTPUT_FILE = "/var/lib/wmsa/archive/tfreq-2022-04-04.bin";
|
||||
public static final String ARCHIVE_PATH = "/var/lib/wmsa/archive/webpage"; // "/mnt/storage/wmsa/archive/webpage/"
|
||||
|
||||
@SneakyThrows
|
||||
public static void main(String... args) {
|
||||
|
||||
List<Thread> pt = new ArrayList<>();
|
||||
for (int i = 0; i < 20; i++) {
|
||||
pt.add(new Thread(TermFrequencyCounterMain::processorThread));
|
||||
}
|
||||
pt.forEach(Thread::start);
|
||||
|
||||
AtomicLong docsTotal = new AtomicLong();
|
||||
new ArchiveExtractor(Path.of(ARCHIVE_PATH)).forEach(
|
||||
page -> {
|
||||
if (page.contentType.contentType.contains("html")
|
||||
&& page.isAfter("2022-03-15T")) {
|
||||
try {
|
||||
long dt = docsTotal.incrementAndGet();
|
||||
if (dt == 0) {
|
||||
System.out.println(docsTotal.get() + " - " + termFreq.size());
|
||||
}
|
||||
if ((dt % 5) != 0) {
|
||||
processQueue.put(page);
|
||||
}
|
||||
} catch (InterruptedException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
});
|
||||
running = false;
|
||||
|
||||
|
||||
System.out.println("Waiting for wrap-up");
|
||||
|
||||
Thread.sleep(36000);
|
||||
|
||||
for (Thread thread : pt) {
|
||||
thread.interrupt();
|
||||
}
|
||||
for (Thread thread : pt) {
|
||||
thread.join();
|
||||
}
|
||||
System.out.println("Total documents = " + docsTotal.get());
|
||||
|
||||
System.out.println("Writing Frequencies");
|
||||
|
||||
try (var dos = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(OUTPUT_FILE)))
|
||||
) {
|
||||
synchronized (termFreq) {
|
||||
for (var entry : termFreq.entrySet()) {
|
||||
|
||||
if (entry.getValue() > 5) {
|
||||
dos.writeLong(entry.getKey());
|
||||
dos.writeLong(entry.getValue());
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
|
||||
|
||||
System.out.println("All done!");
|
||||
}
|
||||
|
||||
public static final ConcurrentHashMap<Long, Integer> termFreq = new ConcurrentHashMap<>();
|
||||
|
||||
public static final LanguageModels lm = new LanguageModels(
|
||||
Path.of("/var/lib/wmsa/model/ngrams-generous-emstr.bin"),
|
||||
Path.of("/var/lib/wmsa/model/tfreq-generous-emstr.bin"),
|
||||
Path.of("/var/lib/wmsa/model/opennlp-sentence.bin"),
|
||||
Path.of("/var/lib/wmsa/model/English.RDR"),
|
||||
Path.of("/var/lib/wmsa/model/English.DICT"),
|
||||
Path.of("/var/lib/wmsa/model/opennlp-tok.bin")
|
||||
);
|
||||
public static volatile boolean running = true;
|
||||
|
||||
public static void processorThread() {
|
||||
var ke = new KeywordExtractor();
|
||||
var se = new SentenceExtractor(lm);
|
||||
var ps = new PorterStemmer();
|
||||
try {
|
||||
TLongHashSet words = new TLongHashSet(10000);
|
||||
while (running || !processQueue.isEmpty()) {
|
||||
var job = processQueue.take();
|
||||
var sentence = se.extractSentences(Jsoup.parse(job.data));
|
||||
|
||||
for (var sent : sentence.sentences) {
|
||||
var keywords = ke.getKeywordsFromSentence(sent);
|
||||
for (int i = 0; i < keywords.length; i++) {
|
||||
if (keywords[i].size() > 1) {
|
||||
words.add(NGramDict.longHash(sent.constructStemmedWordFromSpan(keywords[i]).getBytes()));
|
||||
}
|
||||
}
|
||||
|
||||
for (String word : sent.wordsLowerCase) {
|
||||
words.add(NGramDict.longHash(ps.stem(word).getBytes()));
|
||||
}
|
||||
|
||||
words.forEach(l -> {
|
||||
termFreq.merge(l, 1, Integer::sum);
|
||||
return true;
|
||||
});
|
||||
words.clear();
|
||||
}
|
||||
}
|
||||
}
|
||||
catch (InterruptedException ex) {
|
||||
ex.printStackTrace();
|
||||
}
|
||||
}
|
||||
|
||||
}
|
|
@@ -1,211 +0,0 @@
|
|||
package nu.marginalia.wmsa.edge.tools;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.wmsa.edge.archive.client.ArchiveClient;
|
||||
import nu.marginalia.wmsa.edge.assistant.dict.NGramDict;
|
||||
import nu.marginalia.wmsa.edge.assistant.dict.WikiCleaner;
|
||||
import nu.marginalia.util.language.conf.LanguageModels;
|
||||
import org.jsoup.Jsoup;
|
||||
import org.openzim.ZIMTypes.ZIMFile;
|
||||
import org.openzim.ZIMTypes.ZIMReader;
|
||||
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.PrintWriter;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.Map;
|
||||
import java.util.Objects;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
import java.util.concurrent.LinkedBlockingQueue;
|
||||
|
||||
public class ZimConverterMain {
|
||||
|
||||
static final LinkedBlockingQueue<ConversionJob> jobQueue = new LinkedBlockingQueue<>(100);
|
||||
static final LinkedBlockingQueue<String> analysisQueue = new LinkedBlockingQueue<>(100);
|
||||
static boolean hasData = true;
|
||||
static final ArchiveClient archiveClient = new ArchiveClient();
|
||||
static NGramDict dict = new NGramDict(new LanguageModels(
|
||||
Path.of("/var/lib/wmsa/model/ngrams-generous-emstr.bin"),
|
||||
Path.of("/var/lib/wmsa/model/tfreq-generous-emstr.bin"),
|
||||
Path.of("/var/lib/wmsa/model/opennlp-sentence.bin"),
|
||||
Path.of("/var/lib/wmsa/model/English.RDR"),
|
||||
Path.of("/var/lib/wmsa/model/English.DICT"),
|
||||
Path.of("/var/lib/wmsa/model/opennlp-tok.bin")
|
||||
)
|
||||
);
|
||||
public void extractUrlList() throws IOException {
|
||||
var zr = new ZIMReader(new ZIMFile("/home/vlofgren/Work/wikipedia_en_all_nopic_2021-01.zim"));
|
||||
|
||||
var urlList = zr.getURLListByURL();
|
||||
|
||||
try (PrintWriter pw = new PrintWriter(new FileOutputStream("/home/vlofgren/Work/wikiTitlesAndRedirects.sql"))) {
|
||||
zr.forEachTitles(
|
||||
ae -> {
|
||||
pw.printf("INSERT INTO REF_WIKI_TITLE(NAME) VALUES (\"%s\");\n", ae.getUrl().replace("\\", "\\\\").replace("\"", "\\\""));
|
||||
},
|
||||
re -> {
|
||||
pw.printf("INSERT INTO REF_WIKI_TITLE(NAME, REF_NAME) VALUES (\"%s\",\"%s\");\n", re.getUrl().replace("\\", "\\\\").replace("\"", "\\\""), urlList.get(re.getRedirectIndex()).replace("\\", "\\\\").replace("\"", "\\\""));
|
||||
}
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
public static void main(String[] args) throws IOException {
|
||||
// convertJust("Aleph_number");
|
||||
// convertJust("Floyd–Steinberg_dithering");
|
||||
// convertJust("Laplace's_equation");
|
||||
// convertJust("John_Fahey");
|
||||
// convertJust("Plotinus");
|
||||
// convertJust("C++");
|
||||
convertAll(args);
|
||||
archiveClient.close();
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
private static void convertJust(String url) {
|
||||
String newData = new WikiCleaner().cleanWikiJunk("https://en.wikipedia.org/wiki/" + url,
|
||||
Files.readString(Path.of("/home/vlofgren/Work/wiki-convert/", "in-" + url + ".html")));
|
||||
Files.writeString(Path.of("/home/vlofgren/Work/wiki-convert/", "out-" + url + ".html"), newData);
|
||||
}
|
||||
|
||||
private static void extractOne(String which, int clusterId) throws IOException {
|
||||
// var zr = new ZIMReader(new ZIMFile(args[1]));
|
||||
var zr = new ZIMReader(new ZIMFile("/home/vlofgren/Work/wikipedia_en_all_nopic_2021-01.zim"));
|
||||
|
||||
int[] cluster = new int[] { clusterId };
|
||||
if (clusterId == -1) {
|
||||
zr.forEachTitles(ae -> {
|
||||
if (ae.getUrl().equals(which)) {
|
||||
System.err.print(ae.getUrl() + " " + ae.getClusterNumber());
|
||||
cluster[0] = ae.getClusterNumber();
|
||||
}
|
||||
}, re -> {
|
||||
});
|
||||
}
|
||||
|
||||
System.err.println("Extracting cluster " + cluster[0] );
|
||||
if (cluster[0] == -1) {
|
||||
return;
|
||||
}
|
||||
zr.forEachArticles((url, art) -> {
|
||||
if (art != null) {
|
||||
if (which.equals(url)) {
|
||||
try {
|
||||
Files.writeString(Path.of("/home/vlofgren/Work/wiki-convert/","in-" + url + ".html"), art);
|
||||
String newData = new WikiCleaner().cleanWikiJunk("https://en.wikipedia.org/wiki/" + url, art);
|
||||
Files.writeString(Path.of("/home/vlofgren/Work/wiki-convert/", "out-" + url + ".html"), newData);
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
|
||||
}
|
||||
scheduleJob(url, art);
|
||||
}
|
||||
}, p -> p == cluster[0]);
|
||||
|
||||
}
|
||||
|
||||
private static void convertAll(String[] args) throws IOException {
|
||||
archiveClient.setServiceRoute("127.0.0.1", Integer.parseInt(args[0]));
|
||||
var zr = new ZIMReader(new ZIMFile(args[1]));
|
||||
// var zr = new ZIMReader(new ZIMFile("/home/vlofgren/Work/wikipedia_en_all_nopic_2021-01.zim"));
|
||||
|
||||
for (int i = 0; i < 8; i++) {
|
||||
Thread t = new Thread(ZimConverterMain::jobExecutor);
|
||||
t.setName("Converter");
|
||||
t.start();
|
||||
|
||||
Thread t2 = new Thread(() -> {
|
||||
for (; ; ) {
|
||||
String pt;
|
||||
try {
|
||||
pt = analysisQueue.take();
|
||||
} catch (InterruptedException e) {
|
||||
e.printStackTrace();
|
||||
return;
|
||||
}
|
||||
// var topic = new TopicWordExtractor().extractWords(pt);
|
||||
// var words = new NGramTextRankExtractor(dict, topic).extractWords(Collections.emptyList(), pt);
|
||||
// System.out.println(Strings.join(words, ','));
|
||||
}
|
||||
});
|
||||
t2.setName("Analysis");
|
||||
t2.start();
|
||||
}
|
||||
|
||||
zr.forEachArticles((url, art) -> {
|
||||
if (art != null) {
|
||||
scheduleJob(url, art);
|
||||
}
|
||||
}, p -> true);
|
||||
|
||||
hasData = false;
|
||||
archiveClient.close();
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
private static void jobExecutor() {
|
||||
while (hasData || !jobQueue.isEmpty()) {
|
||||
var job = jobQueue.take();
|
||||
try {
|
||||
job.convert();
|
||||
}
|
||||
catch (Exception ex) {
|
||||
System.err.println("Error in " + job.url);
|
||||
ex.printStackTrace();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
private static void scheduleJob(String url, String art) {
|
||||
jobQueue.put(new ConversionJob(art, url));
|
||||
}
|
||||
|
||||
static final Map<Long, Integer> wordCount = new ConcurrentHashMap<>();
|
||||
static boolean isKeyword(String word) {
|
||||
|
||||
int limit = 100_000;
|
||||
long n = word.chars().filter(c -> c=='_').count();
|
||||
if (n == 0) limit = 2;
|
||||
if (n == 1) limit = 1;
|
||||
if (n == 2) limit = 1;
|
||||
if (n >= 3) limit = 1;
|
||||
|
||||
long c = word.chars().filter(ch -> ch >= 'a' && ch <= 'z').count();
|
||||
if (c-2 <= n) {
|
||||
return false;
|
||||
}
|
||||
int hashA = word.hashCode();
|
||||
int hashB = Objects.hash(n, c, word.length(), word.charAt(0));
|
||||
long hash = (long) hashA + ((long) hashB << 32);
|
||||
|
||||
return wordCount.compute(hash, (k, v) -> v == null ? 1 : v+1) == limit;
|
||||
}
|
||||
@AllArgsConstructor
|
||||
private static class ConversionJob {
|
||||
private final String data;
|
||||
private final String url;
|
||||
|
||||
|
||||
public void convert() throws InterruptedException {
|
||||
var page = new WikiCleaner().cleanWikiJunk("https://en.wikipedia.org/wiki/" + url, data);
|
||||
String pt = Jsoup.parse(page).text();
|
||||
analysisQueue.put(pt);
|
||||
|
||||
/*
|
||||
|
||||
String newData = new WikiCleaner().cleanWikiJunk("https://en.wikipedia.org/wiki/" + url, data);
|
||||
|
||||
|
||||
if (null != newData) {
|
||||
archiveClient.submitWiki(Context.internal(), url, newData)
|
||||
.retry(5)
|
||||
.blockingSubscribe();
|
||||
|
||||
}*/
|
||||
}
|
||||
}
|
||||
}
|
|
@@ -0,0 +1,34 @@
|
|||
package nu.marginalia.wmsa.encyclopedia;
|
||||
|
||||
import io.reactivex.rxjava3.core.Observable;
|
||||
import nu.marginalia.wmsa.client.AbstractDynamicClient;
|
||||
import nu.marginalia.wmsa.client.HttpStatusCode;
|
||||
import nu.marginalia.wmsa.configuration.ServiceDescriptor;
|
||||
import nu.marginalia.wmsa.configuration.server.Context;
|
||||
import nu.marginalia.wmsa.edge.assistant.dict.WikiArticles;
|
||||
import okhttp3.MediaType;
|
||||
import org.eclipse.jetty.util.UrlEncoded;
|
||||
|
||||
import javax.annotation.CheckReturnValue;
|
||||
|
||||
public class EncyclopediaClient extends AbstractDynamicClient {
|
||||
public EncyclopediaClient() {
|
||||
super(ServiceDescriptor.ENCYCLOPEDIA);
|
||||
}
|
||||
|
||||
@CheckReturnValue
|
||||
public Observable<HttpStatusCode> submitWiki(Context ctx, String url, String data) {
|
||||
return super.post(ctx, "/wiki/submit?url="+UrlEncoded.encodeString(url), data, MediaType.parse("text/plain; charset=UTF-8"));
|
||||
}
|
||||
|
||||
@CheckReturnValue
|
||||
public Observable<Boolean> hasWiki(Context ctx, String url) {
|
||||
return super.get(ctx, "/wiki/has?url="+ UrlEncoded.encodeString(url), Boolean.class);
|
||||
}
|
||||
|
||||
@CheckReturnValue
|
||||
public Observable<WikiArticles> encyclopediaLookup(Context ctx, String word) {
|
||||
return super.get(ctx,"/encyclopedia/" + UrlEncoded.encodeString(word), WikiArticles.class);
|
||||
}
|
||||
|
||||
}
|
|
@@ -0,0 +1,160 @@
|
|||
package nu.marginalia.wmsa.encyclopedia;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import nu.marginalia.wmsa.edge.assistant.dict.WikiArticles;
|
||||
import nu.marginalia.wmsa.edge.assistant.dict.WikiSearchResult;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
public class EncyclopediaDao {
|
||||
|
||||
private HikariDataSource dataSource;
|
||||
private static final Logger logger = LoggerFactory.getLogger(EncyclopediaDao.class);
|
||||
|
||||
@Inject
|
||||
public EncyclopediaDao(HikariDataSource dataSource) {
|
||||
this.dataSource = dataSource;
|
||||
}
|
||||
|
||||
public WikiArticles encyclopedia(String term) {
|
||||
WikiArticles response = new WikiArticles();
|
||||
response.entries = new ArrayList<>();
|
||||
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
var stmt = connection.prepareStatement("SELECT DISTINCT(NAME_LOWER) FROM REF_WIKI_TITLE WHERE NAME_LOWER=?");
|
||||
stmt.setString(1, term);
|
||||
|
||||
var rsp = stmt.executeQuery();
|
||||
while (rsp.next()) {
|
||||
response.entries.add(capitalizeWikiString(rsp.getString(1)));
|
||||
}
|
||||
}
|
||||
catch (Exception ex) {
|
||||
logger.error("Failed to fetch articles", ex);
|
||||
return new WikiArticles();
|
||||
}
|
||||
|
||||
return response;
|
||||
}
|
||||
|
||||
public Optional<String> resolveEncylopediaRedirect(String term) {
|
||||
final List<String> matches = new ArrayList<>();
|
||||
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
try (var stmt = connection.prepareStatement("SELECT NAME, REF_NAME FROM REF_WIKI_TITLE WHERE NAME_LOWER=LOWER(?)")) {
|
||||
stmt.setString(1, term);
|
||||
|
||||
var rsp = stmt.executeQuery();
|
||||
while (rsp.next()) {
|
||||
if (term.equals(rsp.getString(1))
|
||||
|| rsp.getString(2) == null) {
|
||||
return Optional.ofNullable(rsp.getString(2));
|
||||
} else {
|
||||
matches.add(rsp.getString(2));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
catch (Exception ex) {
|
||||
throw new RuntimeException(ex);
|
||||
}
|
||||
|
||||
if (!matches.isEmpty()) {
|
||||
return Optional.of(matches.get(0));
|
||||
}
|
||||
return Optional.empty();
|
||||
}
|
||||
|
||||
|
||||
public Optional<WikiSearchResult> findEncyclopediaPageDirect(String term) {
|
||||
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
|
||||
try (var stmt = connection.prepareStatement("SELECT NAME, REF_NAME FROM REF_WIKI_TITLE WHERE NAME_LOWER=LOWER(?)")) {
|
||||
stmt.setString(1, term.replace(' ', '_'));
|
||||
|
||||
var rsp = stmt.executeQuery();
|
||||
while (rsp.next()) {
|
||||
String name = rsp.getString(1);
|
||||
String refName = rsp.getString(2);
|
||||
|
||||
if (refName == null) {
|
||||
return Optional.of(new WikiSearchResult(name, null));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
catch (Exception ex) {
|
||||
throw new RuntimeException(ex);
|
||||
}
|
||||
|
||||
return Optional.empty();
|
||||
}
|
||||
|
||||
public List<WikiSearchResult> findEncyclopediaPages(String term) {
|
||||
final List<WikiSearchResult> directMatches = new ArrayList<>();
|
||||
final Set<WikiSearchResult> directSearchMatches = new HashSet<>();
|
||||
final Set<WikiSearchResult> indirectMatches = new HashSet<>();
|
||||
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
|
||||
try (var stmt = connection.prepareStatement("SELECT NAME, REF_NAME FROM REF_WIKI_TITLE WHERE NAME_LOWER=LOWER(?)")) {
|
||||
stmt.setString(1, term.replace(' ', '_'));
|
||||
|
||||
var rsp = stmt.executeQuery();
|
||||
while (rsp.next()) {
|
||||
String name = rsp.getString(1);
|
||||
String refName = rsp.getString(2);
|
||||
|
||||
if (refName == null) {
|
||||
directMatches.add(new WikiSearchResult(name, null));
|
||||
} else {
|
||||
indirectMatches.add(new WikiSearchResult(name, refName));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
try (var stmt = connection.prepareStatement("SELECT NAME, REF_NAME FROM REF_WIKI_TITLE WHERE NAME_LOWER LIKE ? LIMIT 10")) {
|
||||
stmt.setString(1, term.replace(' ', '_').replaceAll("%", "\\%").toLowerCase() + "%");
|
||||
|
||||
var rsp = stmt.executeQuery();
|
||||
while (rsp.next()) {
|
||||
String name = rsp.getString(1);
|
||||
String refName = rsp.getString(2);
|
||||
|
||||
if (refName == null) {
|
||||
directSearchMatches.add(new WikiSearchResult(name, null));
|
||||
} else {
|
||||
indirectMatches.add(new WikiSearchResult(name, refName));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
catch (Exception ex) {
|
||||
throw new RuntimeException(ex);
|
||||
}
|
||||
|
||||
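// Merge result sets: exact-title matches first, then redirect matches, then prefix-search matches, with duplicates removed.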
directMatches.forEach(indirectMatches::remove);
|
||||
indirectMatches.removeAll(directSearchMatches);
|
||||
directMatches.forEach(directSearchMatches::remove);
|
||||
directMatches.addAll(indirectMatches);
|
||||
directMatches.addAll(directSearchMatches);
|
||||
return directMatches;
|
||||
}
|
||||
|
||||
private String capitalizeWikiString(String string) {
|
||||
if (string.contains("_")) {
|
||||
return Arrays.stream(string.split("_")).map(this::capitalizeWikiString).collect(Collectors.joining("_"));
|
||||
}
|
||||
if (string.length() < 2) {
|
||||
return string.toUpperCase();
|
||||
}
|
||||
return Character.toUpperCase(string.charAt(0)) + string.substring(1).toLowerCase();
|
||||
}
|
||||
|
||||
|
||||
}
|
|
@@ -0,0 +1,29 @@
|
|||
package nu.marginalia.wmsa.encyclopedia;
|
||||
|
||||
import com.google.inject.Guice;
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Injector;
|
||||
import nu.marginalia.wmsa.configuration.MainClass;
|
||||
import nu.marginalia.wmsa.configuration.ServiceDescriptor;
|
||||
import nu.marginalia.wmsa.configuration.module.ConfigurationModule;
|
||||
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
|
||||
|
||||
public class EncyclopediaMain extends MainClass {
|
||||
private final EncyclopediaService service;
|
||||
|
||||
public static void main(String... args) {
|
||||
init(ServiceDescriptor.ENCYCLOPEDIA, args);
|
||||
|
||||
Injector injector = Guice.createInjector(
|
||||
new EncyclopediaModule(),
|
||||
new DatabaseModule(),
|
||||
new ConfigurationModule());
|
||||
injector.getInstance(EncyclopediaMain.class);
|
||||
}
|
||||
|
||||
@Inject
|
||||
public EncyclopediaMain(EncyclopediaService service) {
|
||||
this.service = service;
|
||||
}
|
||||
|
||||
}
|
|
@@ -0,0 +1,18 @@
|
|||
package nu.marginalia.wmsa.encyclopedia;
|
||||
|
||||
import com.google.inject.AbstractModule;
|
||||
import com.google.inject.name.Names;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.wmsa.configuration.WmsaHome;
|
||||
|
||||
import java.nio.file.Path;
|
||||
|
||||
public class EncyclopediaModule extends AbstractModule {
|
||||
@SneakyThrows
|
||||
@Override
|
||||
public void configure() {
|
||||
bind(Path.class)
|
||||
.annotatedWith(Names.named("wiki-path"))
|
||||
.toInstance(WmsaHome.getDisk("encyclopedia"));
|
||||
}
|
||||
}
|
|
@@ -0,0 +1,209 @@
|
|||
package nu.marginalia.wmsa.encyclopedia;
|
||||
|
||||
import com.google.gson.Gson;
|
||||
import com.google.gson.GsonBuilder;
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.name.Named;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.wmsa.configuration.server.Context;
|
||||
import nu.marginalia.wmsa.configuration.server.Initialization;
|
||||
import nu.marginalia.wmsa.configuration.server.MetricsServer;
|
||||
import nu.marginalia.wmsa.configuration.server.Service;
|
||||
import nu.marginalia.wmsa.renderer.mustache.MustacheRenderer;
|
||||
import nu.marginalia.wmsa.renderer.mustache.RendererFactory;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import spark.Request;
|
||||
import spark.Response;
|
||||
import spark.Spark;
|
||||
|
||||
import java.io.FileInputStream;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.Map;
|
||||
import java.util.Optional;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.zip.GZIPInputStream;
|
||||
import java.util.zip.GZIPOutputStream;
|
||||
|
||||
public class EncyclopediaService extends Service {
|
||||
|
||||
private static final Logger logger = LoggerFactory.getLogger(EncyclopediaService.class);
|
||||
private final MustacheRenderer<String> wikiErrorPageRenderer;
|
||||
private final MustacheRenderer<Object> wikiSearchResultRenderer;
|
||||
private final Gson gson = new GsonBuilder().create();
|
||||
|
||||
private Path wikiPath;
|
||||
private EncyclopediaDao encyclopediaDao;
|
||||
|
||||
@Inject
|
||||
public EncyclopediaService(@Named("service-host") String ip,
|
||||
@Named("service-port") Integer port,
|
||||
@Named("wiki-path") Path wikiPath,
|
||||
EncyclopediaDao encyclopediaDao,
|
||||
RendererFactory rendererFactory,
|
||||
Initialization initialization,
|
||||
MetricsServer metricsServer)
|
||||
throws IOException {
|
||||
super(ip, port, initialization, metricsServer);
|
||||
this.wikiPath = wikiPath;
|
||||
this.encyclopediaDao = encyclopediaDao;
|
||||
|
||||
if (rendererFactory != null) {
|
||||
wikiErrorPageRenderer = rendererFactory.renderer("encyclopedia/wiki-error");
|
||||
wikiSearchResultRenderer = rendererFactory.renderer("encyclopedia/wiki-search");
|
||||
}
|
||||
else {
|
||||
wikiErrorPageRenderer = null;
|
||||
wikiSearchResultRenderer = null;
|
||||
}
|
||||
|
||||
|
||||
Spark.get("/public/wiki/*", this::getWikiPage);
|
||||
Spark.get("/public/wiki-search", this::searchWikiPage);
|
||||
|
||||
Spark.get("/wiki/has", this::pathWikiHas);
|
||||
Spark.post("/wiki/submit", this::pathWikiSubmit);
|
||||
Spark.get("/encyclopedia/:term", (rq, rsp) -> encyclopediaDao.encyclopedia(rq.params("term")), gson::toJson);
|
||||
|
||||
Spark.awaitInitialization();
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
private Object getWikiPage(Request req, Response rsp) {
|
||||
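// Resolve any redirect for the requested title, serve the stored page if present, fall back to a case-insensitive lookup, and finally render the search page.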
final String[] splats = req.splat();
|
||||
if (splats.length == 0) {
rsp.redirect("https://encyclopedia.marginalia.nu/wiki-start.html");
return "";
}
|
||||
|
||||
|
||||
final String name = splats[0];
|
||||
|
||||
String pageName = encyclopediaDao.resolveEncylopediaRedirect(name).orElse(name);
|
||||
|
||||
logger.info("Resolved {} -> {}", name, pageName);
|
||||
|
||||
return wikiGet(pageName)
|
||||
.or(() -> resolveWikiPageNameWrongCase(name))
|
||||
.orElseGet(() -> renderSearchPage(name));
|
||||
}
|
||||
|
||||
private Optional<String> resolveWikiPageNameWrongCase(String name) {
|
||||
var rsp = encyclopediaDao.findEncyclopediaPageDirect(name);
|
||||
|
||||
if (rsp.isEmpty()) {
|
||||
return Optional.of(renderSearchPage(name));
|
||||
}
|
||||
|
||||
name = rsp.get().getInternalName();
|
||||
return wikiGet(name);
|
||||
}
|
||||
|
||||
private String renderSearchPage(String s) {
|
||||
return wikiSearchResultRenderer.render(
|
||||
Map.of("query", s,
|
||||
"error", "true",
|
||||
"results", encyclopediaDao.findEncyclopediaPages(s)));
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
private Object searchWikiPage(Request req, Response rsp) {
|
||||
final var ctx = Context.fromRequest(req);
|
||||
|
||||
String term = req.queryParams("query");
|
||||
if (null == term) {
|
||||
rsp.redirect("https://encyclopedia.marginalia.nu/wiki-start.html");
|
||||
return "";
|
||||
}
|
||||
|
||||
return wikiSearchResultRenderer.render(
|
||||
Map.of("query", term,
|
||||
"results",
|
||||
encyclopediaDao.findEncyclopediaPages(term))
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
|
||||
private Path getWikiFilename(Path base, String url) {
|
||||
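// Shard stored pages into a four-level directory tree derived from the bytes of the URL's hashCode, percent-encode unsafe characters in the file name, and append ".gz".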
Path p = base;
|
||||
|
||||
int urlHash = url.hashCode();
|
||||
|
||||
p = p.resolve(Integer.toString(urlHash & 0xFF));
|
||||
p = p.resolve(Integer.toString((urlHash>>>8) & 0xFF));
|
||||
p = p.resolve(Integer.toString((urlHash>>>16) & 0xFF));
|
||||
p = p.resolve(Integer.toString((urlHash>>>24) & 0xFF));
|
||||
|
||||
String fileName = url.chars()
|
||||
.mapToObj(this::encodeUrlChar)
|
||||
.collect(Collectors.joining());
|
||||
|
||||
if (fileName.length() > 128) {
|
||||
fileName = fileName.substring(0, 128) + (((long)urlHash)&0xFFFFFFFFL);
|
||||
}
|
||||
|
||||
return p.resolve(fileName + ".gz");
|
||||
}
|
||||
|
||||
|
||||
private String encodeUrlChar(int i) {
|
||||
if (i >= 'a' && i <= 'z') {
|
||||
return Character.toString(i);
|
||||
}
|
||||
if (i >= 'A' && i <= 'Z') {
|
||||
return Character.toString(i);
|
||||
}
|
||||
if (i >= '0' && i <= '9') {
|
||||
return Character.toString(i);
|
||||
}
|
||||
if (i == '.') {
|
||||
return Character.toString(i);
|
||||
}
|
||||
else {
|
||||
return String.format("%%%2X", i);
|
||||
}
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
private Object pathWikiHas(Request request, Response response) {
|
||||
return Files.exists(getWikiFilename(wikiPath, request.queryParams("url")));
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private Optional<String> wikiGet(String name) {
|
||||
|
||||
var filename = getWikiFilename(wikiPath, name);
|
||||
|
||||
if (Files.exists(filename)) {
|
||||
try (var stream = new GZIPInputStream(new FileInputStream(filename.toFile()))) {
|
||||
return Optional.of(new String(stream.readAllBytes()));
|
||||
}
|
||||
} else {
|
||||
return Optional.empty();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private Object pathWikiSubmit(Request request, Response response) {
|
||||
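// Accept a cleaned page body, create its shard directory if needed, and write it gzip-compressed to disk.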
byte[] data = request.bodyAsBytes();
|
||||
|
||||
String wikiUrl = request.queryParams("url");
|
||||
Path filename = getWikiFilename(wikiPath, wikiUrl);
|
||||
|
||||
Files.createDirectories(filename.getParent());
|
||||
|
||||
logger.debug("Writing {} to {}", wikiUrl, filename);
|
||||
|
||||
try (var gos = new GZIPOutputStream(new FileOutputStream(filename.toFile()))) {
|
||||
gos.write(data);
|
||||
gos.flush();
|
||||
}
|
||||
|
||||
return "ok";
|
||||
|
||||
}
|
||||
}
|
|
@@ -254,4 +254,29 @@ CREATE INDEX IF NOT EXISTS EC_DOMAIN_TRIO ON EC_DOMAIN (STATE, DOMAIN_ALIAS, IND
|
|||
|
||||
CREATE INDEX IF NOT EXISTS EC_URL_VISITED ON EC_URL (VISITED);
|
||||
CREATE INDEX IF NOT EXISTS EC_URL_VISITED_STATE ON EC_URL (VISITED, STATE);
|
||||
CREATE INDEX IF NOT EXISTS EC_URL_IP ON EC_URL (IP);
|
||||
|
||||
---;
|
||||
|
||||
DROP TABLE IF EXISTS REF_DICTIONARY;
|
||||
|
||||
CREATE TABLE IF NOT EXISTS REF_DICTIONARY(
|
||||
TYPE VARCHAR(16),
|
||||
WORD VARCHAR(255),
|
||||
DEFINITION VARCHAR(255)
|
||||
)
|
||||
CHARACTER SET utf8mb4
|
||||
COLLATE utf8mb4_unicode_ci;
|
||||
|
||||
CREATE INDEX IF NOT EXISTS REF_DICTIONARY_WORD ON REF_DICTIONARY (WORD);
|
||||
|
||||
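-- Wiki title lookup table: NAME_LOWER is generated from NAME for case-insensitive lookups; REF_NAME holds the redirect target, if any.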
CREATE TABLE IF NOT EXISTS REF_WIKI_TITLE (
|
||||
NAME VARCHAR(255),
|
||||
NAME_LOWER VARCHAR(255) GENERATED ALWAYS AS (LOWER(NAME)),
|
||||
REF_NAME VARCHAR(255)
|
||||
)
|
||||
CHARACTER SET utf8mb4
|
||||
COLLATE utf8mb4_unicode_ci;
|
||||
|
||||
CREATE INDEX IF NOT EXISTS REF_WIKI_LOWER ON REF_WIKI_TITLE (NAME_LOWER);
|
||||
CREATE INDEX IF NOT EXISTS REF_WIKI_NAME ON REF_WIKI_TITLE (NAME);
|
|
@@ -18,6 +18,5 @@ CREATE TABLE IF NOT EXISTS REF_WIKI_TITLE(
|
|||
CHARACTER SET utf8mb4
|
||||
COLLATE utf8mb4_unicode_ci;
|
||||
|
||||
|
||||
CREATE INDEX IF NOT EXISTS REF_WIKI_LOWER ON REF_WIKI_TITLE (NAME_LOWER);
|
||||
CREATE INDEX IF NOT EXISTS REF_WIKI_NAME ON REF_WIKI_TITLE (NAME);
|
|
@@ -50,8 +50,6 @@ class ServiceTest {
|
|||
new DictionaryService(dataSource, new SpellChecker()),
|
||||
new MathParser(),
|
||||
new Units(new MathParser()),
|
||||
null,
|
||||
null,
|
||||
new ScreenshotService(null), null);
|
||||
|
||||
Spark.awaitInitialization();
|
||||
|
|
|
@@ -1,72 +0,0 @@
|
|||
package nu.marginalia.wmsa.edge.archive;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.wmsa.configuration.server.Context;
|
||||
import nu.marginalia.wmsa.configuration.server.Initialization;
|
||||
import nu.marginalia.wmsa.edge.archive.archiver.Archiver;
|
||||
import nu.marginalia.wmsa.edge.archive.client.ArchiveClient;
|
||||
import org.junit.jupiter.api.AfterAll;
|
||||
import org.junit.jupiter.api.Assertions;
|
||||
import org.junit.jupiter.api.BeforeAll;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.junit.jupiter.api.parallel.Execution;
|
||||
import org.junit.jupiter.api.parallel.ExecutionMode;
|
||||
import spark.Spark;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
|
||||
import static nu.marginalia.util.TestUtil.getPort;
|
||||
import static nu.marginalia.util.test.TestUtil.clearTempDir;
|
||||
|
||||
@Execution(ExecutionMode.SAME_THREAD)
|
||||
public class ArchiveTest {
|
||||
static EdgeArchiveService service;
|
||||
|
||||
static final int testPort = getPort();
|
||||
private static Path tempPath;
|
||||
private static Path tempPath2;
|
||||
private static ArchiveClient archiveClient;
|
||||
private static Archiver archiver;
|
||||
|
||||
@BeforeAll
|
||||
public static void setUpClass() throws IOException {
|
||||
Spark.port(testPort);
|
||||
System.setProperty("service-name", "edge-archive");
|
||||
archiveClient = new ArchiveClient();
|
||||
archiveClient.setServiceRoute("127.0.0.1", testPort);
|
||||
|
||||
tempPath = Files.createTempDirectory("archiveTest");
|
||||
tempPath2 = Files.createTempDirectory("wikiTest");
|
||||
|
||||
archiver = new Archiver(tempPath, 10);
|
||||
service = new EdgeArchiveService("127.0.0.1", testPort,
|
||||
tempPath,
|
||||
archiver,
|
||||
new Initialization(), null);
|
||||
|
||||
Spark.awaitInitialization();
|
||||
}
|
||||
|
||||
@AfterAll
|
||||
public static void tearDown() throws Exception {
|
||||
archiver.close();
|
||||
archiveClient.close();
|
||||
clearTempDir(tempPath);
|
||||
clearTempDir(tempPath2);
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
@Test
|
||||
public void testWiki() {
|
||||
var url = "Plato_(Disambiguation)";
|
||||
|
||||
Assertions.assertFalse(archiveClient.hasWiki(Context.internal(), url).blockingFirst());
|
||||
|
||||
archiveClient.submitWiki(Context.internal(), url, "<h1>Hello</h1>").blockingFirst();
|
||||
Assertions.assertTrue(archiveClient.hasWiki(Context.internal(), url).blockingFirst());
|
||||
Assertions.assertEquals("<h1>Hello</h1>", archiveClient.getWiki(Context.internal(), url).blockingFirst());
|
||||
}
|
||||
|
||||
}
|
|
@@ -1,17 +0,0 @@
|
|||
package nu.marginalia.wmsa.edge.archive.archiver;
|
||||
import org.junit.jupiter.api.*;
|
||||
|
||||
import java.nio.file.Path;
|
||||
|
||||
public class ArchiverTest {
|
||||
|
||||
@Test
|
||||
public void testArchiver() throws Exception {
|
||||
Archiver archiver = new Archiver(Path.of("/tmp/"), 3);
|
||||
archiver.writeData(new ArchivedFile("file1", "Hey".getBytes()));
|
||||
archiver.writeData(new ArchivedFile("file2", "Hey".getBytes()));
|
||||
archiver.writeData(new ArchivedFile("file3", "Hey".getBytes()));
|
||||
archiver.writeData(new ArchivedFile("file4", "Hey".getBytes()));
|
||||
archiver.close();
|
||||
}
|
||||
}
|
|
@@ -60,7 +60,6 @@ class AssistantTest {
|
|||
new DictionaryService(dataSource, new SpellChecker()),
|
||||
new MathParser(),
|
||||
new Units(new MathParser()),
|
||||
null, null,
|
||||
new ScreenshotService(null), null);
|
||||
|
||||
Spark.awaitInitialization();
|
||||
|
@@ -77,12 +76,6 @@ class AssistantTest {
|
|||
Spark.awaitStop();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testEncyclopedia() {
|
||||
var result = client.encyclopediaLookup(Context.internal(), "plato").blockingFirst();
|
||||
System.out.println(result);
|
||||
assertTrue(result.entries.size() >= 1);
|
||||
}
|
||||
@Test
|
||||
public void testSpellCheck() {
|
||||
var result = client.spellCheck(Context.internal(), "plato").blockingFirst();
|
||||
|
|