Merge branch 'encyclopedia-service' into master

Viktor Lofgren 2022-05-30 12:41:30 +02:00
commit 75c4986532
37 changed files with 878 additions and 1601 deletions

View file

@@ -0,0 +1,70 @@
package nu.marginalia.wmsa.edge;
import nu.marginalia.wmsa.configuration.ServiceDescriptor;
import org.slf4j.LoggerFactory;
import org.testcontainers.containers.BindMode;
import org.testcontainers.containers.GenericContainer;
import org.testcontainers.containers.MariaDBContainer;
import org.testcontainers.containers.Network;
import org.testcontainers.containers.output.Slf4jLogConsumer;
import org.testcontainers.containers.wait.strategy.Wait;
import org.testcontainers.utility.MountableFile;
import java.nio.file.Files;
import java.nio.file.Path;
import java.time.Duration;
public abstract class E2ETestBase {
public Network network = Network.newNetwork();
public MariaDBContainer<?> getMariaDBContainer() {
return new MariaDBContainer<>("mariadb")
.withDatabaseName("WMSA_prod")
.withUsername("wmsa")
.withPassword("wmsa")
.withInitScript("sql/edge-crawler-cache.sql")
.withNetwork(network)
.withNetworkAliases("mariadb");
}
public GenericContainer<?> forService(ServiceDescriptor service, GenericContainer<?> mariaDB) {
return new GenericContainer<>("openjdk:17-alpine")
.dependsOn(mariaDB)
.withCopyFileToContainer(jarFile(), "/WMSA.jar")
.withCopyFileToContainer(MountableFile.forClasspathResource("init.sh"), "/init.sh")
.withExposedPorts(service.port)
.withFileSystemBind(modelsPath(), "/var/lib/wmsa/model", BindMode.READ_ONLY)
.withNetwork(network)
.withNetworkAliases(service.name)
.withLogConsumer(new Slf4jLogConsumer(LoggerFactory.getLogger(service.name)))
.withCommand("sh", "init.sh", service.name)
.waitingFor(Wait.forHttp("/internal/ping")
.forPort(service.port)
.withReadTimeout(Duration.ofSeconds(15)))
;
}
public static MountableFile jarFile() {
Path cwd = Path.of(System.getProperty("user.dir"));
cwd = cwd.resolve("..");
var jarFile = cwd.resolve("build/libs/wmsa-SNAPSHOT-all.jar");
if (!Files.exists(jarFile)) {
System.err.println("Could not find jarFile " + jarFile);
throw new RuntimeException();
}
else {
System.out.println("jar file = " + jarFile);
}
return MountableFile.forHostPath(jarFile);
}
public static String modelsPath() {
Path modelsPath = Path.of(System.getProperty("user.dir")).resolve("data/models");
if (!Files.isDirectory(modelsPath)) {
System.err.println("Could not find models, looked in " + modelsPath.toAbsolutePath());
throw new RuntimeException();
}
return modelsPath.toString();
}
}
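
For orientation, a minimal sketch of a test built on this base class, mirroring what EdgeSearchE2ETest does below; the container wiring is taken from this diff, while the class name and test body are hypothetical:

import static nu.marginalia.wmsa.configuration.ServiceDescriptor.EDGE_SEARCH;
import java.net.URL;
import org.junit.jupiter.api.*;
import org.testcontainers.containers.GenericContainer;
import org.testcontainers.junit.jupiter.*;

@Tag("e2e")
@Testcontainers
public class ExampleE2ETest extends E2ETestBase {
    @Container
    public GenericContainer<?> mariaDB = getMariaDBContainer();

    @Container // forService(...) declares dependsOn(mariaDB), so start order is handled
    public GenericContainer<?> searchContainer = forService(EDGE_SEARCH, mariaDB);

    @Test
    public void pingAnswers() throws Exception {
        // the wait strategy in forService(...) has already verified /internal/ping,
        // so the mapped port is immediately usable
        var url = new URL("http", searchContainer.getHost(),
                searchContainer.getMappedPort(EDGE_SEARCH.port), "/internal/ping");
        try (var in = url.openStream()) {
            Assertions.assertNotNull(in.readAllBytes());
        }
    }
}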

View file

@@ -2,7 +2,6 @@ package nu.marginalia.wmsa.edge;
import nu.marginalia.util.test.TestUtil;
import nu.marginalia.wmsa.configuration.ServiceDescriptor;
import nu.marginalia.wmsa.edge.crawling.CrawlJobExtractorMain;
import org.jsoup.Jsoup;
import org.junit.jupiter.api.Tag;
@@ -19,7 +18,6 @@ import org.testcontainers.junit.jupiter.Container;
import org.testcontainers.junit.jupiter.Testcontainers;
import org.testcontainers.utility.MountableFile;
import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
@@ -28,28 +26,19 @@ import java.util.ArrayList;
import java.util.List;
import static nu.marginalia.wmsa.configuration.ServiceDescriptor.*;
import static org.testcontainers.containers.BrowserWebDriverContainer.VncRecordingMode.RECORD_ALL;
@Tag("e2e")
@Testcontainers
public class EdgeSearchE2ETest {
Network network = Network.newNetwork();
public class EdgeSearchE2ETest extends E2ETestBase {
@Container
public GenericContainer<?> mariaDB = getMariaDBContainer();
@Container
public GenericContainer<?> mariaDB = new MariaDBContainer<>("mariadb")
.withDatabaseName("WMSA_prod")
.withUsername("wmsa")
.withPassword("wmsa")
.withInitScript("sql/edge-crawler-cache.sql")
.withNetwork(network)
.withNetworkAliases("mariadb");
public GenericContainer<?> searchContainer = forService(EDGE_SEARCH, mariaDB);
@Container
public GenericContainer<?> searchContainer = forService(EDGE_SEARCH);
public GenericContainer<?> assistantContainer = forService(EDGE_ASSISTANT, mariaDB);
@Container
public GenericContainer<?> assistantContainer = forService(EDGE_ASSISTANT);
@Container
public GenericContainer<?> indexContainer = forService(EDGE_INDEX);
public GenericContainer<?> indexContainer = forService(EDGE_INDEX, mariaDB);
@Container
public NginxContainer<?> mockWikipedia = new NginxContainer<>("nginx:stable")
@@ -88,46 +77,7 @@ public class EdgeSearchE2ETest {
.withNetwork(network)
.withNetworkAliases("proxyNginx");
;
public GenericContainer<?> forService(ServiceDescriptor service) {
return new GenericContainer<>("openjdk:17-alpine")
.dependsOn(mariaDB)
.withCopyFileToContainer(jarFile(), "/WMSA.jar")
.withCopyFileToContainer(MountableFile.forClasspathResource("init.sh"), "/init.sh")
.withExposedPorts(service.port)
.withFileSystemBind(modelsPath(), "/var/lib/wmsa/model", BindMode.READ_ONLY)
.withNetwork(network)
.withNetworkAliases(service.name)
.withLogConsumer(new Slf4jLogConsumer(LoggerFactory.getLogger(service.name)))
.withCommand("sh", "init.sh", service.name)
.waitingFor(Wait.forHttp("/internal/ping")
.forPort(service.port)
.withReadTimeout(Duration.ofSeconds(15)))
;
}
public static MountableFile jarFile() {
Path cwd = Path.of(System.getProperty("user.dir"));
cwd = cwd.resolve("..");
var jarFile = cwd.resolve("build/libs/wmsa-SNAPSHOT-all.jar");
if (!Files.exists(jarFile)) {
System.err.println("Could not find jarFile " + jarFile);
throw new RuntimeException();
}
else {
System.out.println("jar file = " + jarFile);
}
return MountableFile.forHostPath(jarFile);
}
public static String modelsPath() {
Path modelsPath = Path.of(System.getProperty("user.dir")).resolve("data/models");
if (!Files.isDirectory(modelsPath)) {
System.err.println("Could not find models, looked in " + modelsPath.toAbsolutePath());
throw new RuntimeException();
}
return modelsPath.toString();
}
public static MountableFile ipDatabasePath() {
Path modelsPath = Path.of(System.getProperty("user.dir")).resolve("data/models/IP2LOC/IP2LOCATION-LITE-DB1.CSV");
if (!Files.isRegularFile(modelsPath)) {

View file

@@ -0,0 +1,151 @@
package nu.marginalia.wmsa.edge;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import nu.marginalia.wmsa.edge.assistant.dict.WikiArticles;
import okhttp3.OkHttpClient;
import okhttp3.Request;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Tag;
import org.junit.jupiter.api.Test;
import org.mariadb.jdbc.Driver;
import org.openqa.selenium.By;
import org.openqa.selenium.chrome.ChromeOptions;
import org.slf4j.LoggerFactory;
import org.testcontainers.containers.*;
import org.testcontainers.containers.output.Slf4jLogConsumer;
import org.testcontainers.containers.wait.strategy.Wait;
import org.testcontainers.junit.jupiter.Container;
import org.testcontainers.junit.jupiter.Testcontainers;
import org.testcontainers.utility.MountableFile;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.file.Path;
import java.sql.DriverManager;
import java.sql.SQLException;
import java.sql.Types;
import java.time.Duration;
import java.util.concurrent.TimeUnit;
import static nu.marginalia.wmsa.configuration.ServiceDescriptor.ENCYCLOPEDIA;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
@Tag("e2e")
@Testcontainers
public class EncyclopediaE2ETest extends E2ETestBase {
@Container
public MariaDBContainer<?> mariaDB = getMariaDBContainer();
@Container
public GenericContainer<?> encyclopediaContainer = forService(ENCYCLOPEDIA, mariaDB);
@Container
public GenericContainer<?> encyclopediaLoader = new GenericContainer<>("openjdk:17")
.dependsOn(encyclopediaContainer)
.dependsOn(mariaDB)
.withNetwork(network)
.withLogConsumer(new Slf4jLogConsumer(LoggerFactory.getLogger("encyclopedia-loader")))
.withCopyFileToContainer(jarFile(), "/WMSA.jar")
.withCopyFileToContainer(MountableFile.forClasspathResource("load-encyclopedia.sh"), "/load-encyclopedia.sh")
.withFileSystemBind(getModelData().toString(), "/data", BindMode.READ_ONLY)
.withCommand("sh", "load-encyclopedia.sh")
.waitingFor(Wait.forLogMessage(".*ALL DONE.*", 1).withStartupTimeout(Duration.ofMinutes(10)));
@Container
public NginxContainer<?> proxyNginx = new NginxContainer<>("nginx:stable")
.dependsOn(encyclopediaLoader)
.dependsOn(encyclopediaContainer)
.withLogConsumer(new Slf4jLogConsumer(LoggerFactory.getLogger("nginx")))
.withCopyFileToContainer(MountableFile.forClasspathResource("nginx/encyclopedia.conf"), "/etc/nginx/conf.d/default.conf")
.withNetwork(network)
.withNetworkAliases("proxyNginx");
@Container
public BrowserWebDriverContainer<?> chrome = new BrowserWebDriverContainer<>()
.withNetwork(network)
.withCapabilities(new ChromeOptions());
private Gson gson = new GsonBuilder().create();
private OkHttpClient httpClient = new OkHttpClient.Builder()
.connectTimeout(100, TimeUnit.MILLISECONDS)
.readTimeout(6000, TimeUnit.SECONDS)
.retryOnConnectionFailure(true)
.followRedirects(true)
.build();
private Path getModelData() {
return Path.of(System.getProperty("user.dir")).resolve("data/test");
}
@Test
public void run() throws MalformedURLException {
new Driver();
try (var conn = DriverManager.getConnection(mariaDB.getJdbcUrl(), "wmsa", "wmsa");
var stmt = conn.prepareStatement("INSERT IGNORE INTO REF_WIKI_TITLE(NAME,REF_NAME) VALUES (?,?)")) {
stmt.setString(1, "Forg");
stmt.setString(2, "Frog");
stmt.executeUpdate();
stmt.setString(1, "Frog");
stmt.setNull(2, Types.VARCHAR);
stmt.executeUpdate();
} catch (SQLException e) {
throw new RuntimeException(e);
}
var driver = chrome.getWebDriver();
driver.get("http://proxyNginx/wiki/Frog");
System.out.println(driver.getTitle());
driver.get("http://proxyNginx/wiki-search?query=Forg");
System.out.println(driver.getTitle());
assertTrue(get(encyclopediaContainer.getHost(),
encyclopediaContainer.getMappedPort(ENCYCLOPEDIA.port),
"/wiki/has?url=Frog", Boolean.class));
assertFalse(get(encyclopediaContainer.getHost(),
encyclopediaContainer.getMappedPort(ENCYCLOPEDIA.port),
"/wiki/has?url=Marginalia", Boolean.class));
assertFalse(get(encyclopediaContainer.getHost(),
encyclopediaContainer.getMappedPort(ENCYCLOPEDIA.port),
"/wiki/has?url=Marginalia", Boolean.class));
var resultsForMarginalia = get(encyclopediaContainer.getHost(),
encyclopediaContainer.getMappedPort(ENCYCLOPEDIA.port),
"/encyclopedia/Marginalia", WikiArticles.class);
Assertions.assertTrue(resultsForMarginalia.getEntries().isEmpty());
var resultsForFrog = get(encyclopediaContainer.getHost(),
encyclopediaContainer.getMappedPort(ENCYCLOPEDIA.port),
"/encyclopedia/Frog", WikiArticles.class);
Assertions.assertFalse(resultsForFrog.getEntries().isEmpty());
var resultsForFoRg = get(encyclopediaContainer.getHost(),
encyclopediaContainer.getMappedPort(ENCYCLOPEDIA.port),
"/encyclopedia/Forg", WikiArticles.class);
Assertions.assertFalse(resultsForFoRg.getEntries().isEmpty());
}
private <T> T get(String host, Integer mappedPort, String path, Class<T> clazz) throws MalformedURLException {
var req = new Request.Builder().get().url(new URL("http", host, mappedPort, path)).build();
var call = httpClient.newCall(req);
try (var rsp = call.execute()) {
return gson.fromJson(rsp.body().charStream(), clazz);
} catch (IOException e) {
throw new RuntimeException(e);
}
}
}
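
The two rows seeded at the top of the test encode the redirect convention in REF_WIKI_TITLE: a non-NULL REF_NAME marks a redirect ("Forg" → "Frog"), while a NULL REF_NAME marks a canonical title. Resolution then works roughly like the (since removed) DictionaryService code further down; a condensed sketch, with the helper itself hypothetical and the table and column names taken from this commit:

import java.sql.Connection;
import java.sql.SQLException;
import java.util.Optional;

// Returns the canonical title for a term, following at most one redirect hop.
static Optional<String> resolveTitle(Connection conn, String term) throws SQLException {
    try (var stmt = conn.prepareStatement(
            "SELECT NAME, REF_NAME FROM REF_WIKI_TITLE WHERE NAME_LOWER=LOWER(?)")) {
        stmt.setString(1, term);
        var rsp = stmt.executeQuery();
        while (rsp.next()) {
            String name = rsp.getString(1);
            String ref = rsp.getString(2);
            return Optional.of(ref == null ? name : ref); // NULL ref = already canonical
        }
    }
    return Optional.empty();
}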

View file

@@ -3,7 +3,7 @@
mkdir -p /var/lib/wmsa/conf/
mkdir -p /var/lib/wmsa/data/
cat > /var/lib/wmsa/db.properties <<EOF
cat > /var/lib/wmsa/conf/db.properties <<EOF
db.user=wmsa
db.pass=wmsa
db.conn=jdbc:mariadb://mariadb:3306/WMSA_prod?rewriteBatchedStatements=true

View file

@@ -1,5 +1,6 @@
#!/bin/bash
mkdir -p /var/lib/wmsa/encyclopedia
mkdir -p /var/lib/wmsa/conf
mkdir -p /var/lib/wmsa/index/write
mkdir -p /var/lib/wmsa/index/read
@@ -21,7 +22,11 @@ many
year
EOF
cat > /var/lib/wmsa/db.properties <<EOF
cat > /var/lib/wmsa/conf/disks.properties <<EOF
encyclopedia=/var/lib/wmsa/encyclopedia
EOF
cat > /var/lib/wmsa/conf/db.properties <<EOF
db.user=wmsa
db.pass=wmsa
db.conn=jdbc:mariadb://mariadb:3306/WMSA_prod?rewriteBatchedStatements=true
@@ -51,7 +56,7 @@ smhi-scraper smhi-scraper
podcast-scraper podcast-scraper
edge-index edge-index
edge-search edge-search
edge-archive edge-archive
encyclopedia encyclopedia
edge-assistant edge-assistant
memex memex
dating dating

View file

@@ -0,0 +1,32 @@
#!/bin/bash
mkdir -p /var/lib/wmsa/conf/
mkdir -p /var/lib/wmsa/data/
mkdir -p /data
cat > /var/lib/wmsa/conf/db.properties <<EOF
db.user=wmsa
db.pass=wmsa
db.conn=jdbc:mariadb://mariadb:3306/WMSA_prod?rewriteBatchedStatements=true
EOF
cat > /var/lib/wmsa/conf/hosts <<EOF
# service-name host-name
resource-store resource-store
renderer renderer
auth auth
api api
smhi-scraper smhi-scraper
podcast-scraper podcast-scraper
edge-index edge-index
edge-search edge-search
encyclopedia encyclopedia
edge-assistant edge-assistant
memex memex
dating dating
EOF
java -cp WMSA.jar nu.marginalia.wmsa.edge.tools.EncyclopediaLoaderTool data/wikipedia_en_100_nopic.zim
echo "ALL DONE"

View file

@@ -0,0 +1,40 @@
server {
listen 80;
listen [::]:80;
server_name nginx;
location /wiki/ {
rewrite ^ $request_uri;
rewrite ^/(.*) /public/$1 break;
return 400;
proxy_pass http://encyclopedia:5040$uri;
proxy_set_header X-Context $remote_addr-$connection;
proxy_set_header X-Public "1";
proxy_set_header X-Extern-Url $scheme://$host$request_uri;
proxy_set_header X-Extern-Domain $scheme://$host;
proxy_set_header X-User-Agent $http_user_agent;
tcp_nodelay on;
}
location /wiki-search {
rewrite ^ $request_uri;
rewrite ^/(.*) /public/$1 break;
return 400;
proxy_pass http://encyclopedia:5040$uri;
proxy_set_header X-Context $remote_addr-$connection;
proxy_set_header X-Public "1";
proxy_set_header X-Extern-Url $scheme://$host$request_uri;
proxy_set_header X-Extern-Domain $scheme://$host;
proxy_set_header X-User-Agent $http_user_agent;
tcp_nodelay on;
}
location / {
proxy_pass http://encyclopedia:5040/;
tcp_nodelay on;
}
}
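
The paired rewrites in each proxied location are the idiom that forwards the original, un-normalized request: rewrite ^ $request_uri; replaces the matched URI with the raw request line (undoing nginx's URL decoding), rewrite ^/(.*) /public/$1 break; prefixes /public/ and stops rewrite processing, so return 400; is only reachable if the second rewrite somehow fails to match. The /public prefix mirrors how the Spark services mount their externally visible routes, as in the EdgeAssistantService code removed further down; a hypothetical sketch of the service side:

// External traffic proxied by nginx arrives under /public/...;
// internal callers and health checks use the unprefixed routes.
Spark.get("/public/wiki/*", this::getWikiPage);
Spark.get("/public/wiki-search", this::searchWikiPage);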

View file

@@ -6,11 +6,11 @@ import nu.marginalia.wmsa.configuration.command.Command;
import nu.marginalia.wmsa.configuration.command.ListCommand;
import nu.marginalia.wmsa.configuration.command.StartCommand;
import nu.marginalia.wmsa.configuration.command.VersionCommand;
import nu.marginalia.wmsa.edge.archive.EdgeArchiveMain;
import nu.marginalia.wmsa.edge.assistant.EdgeAssistantMain;
import nu.marginalia.wmsa.edge.dating.DatingMain;
import nu.marginalia.wmsa.edge.index.EdgeIndexMain;
import nu.marginalia.wmsa.edge.search.EdgeSearchMain;
import nu.marginalia.wmsa.encyclopedia.EncyclopediaMain;
import nu.marginalia.wmsa.memex.MemexMain;
import nu.marginalia.wmsa.podcasts.PodcastScraperMain;
import nu.marginalia.wmsa.renderer.RendererMain;
@@ -33,11 +33,12 @@ public enum ServiceDescriptor {
EDGE_INDEX("edge-index", 5021, EdgeIndexMain.class),
EDGE_SEARCH("edge-search", 5023, EdgeSearchMain.class),
EDGE_ARCHIVE("edge-archive", 5024, EdgeArchiveMain.class),
EDGE_ASSISTANT("edge-assistant", 5025, EdgeAssistantMain.class),
EDGE_MEMEX("memex", 5030, MemexMain.class),
ENCYCLOPEDIA("encyclopedia", 5040, EncyclopediaMain.class),
DATING("dating", 5070, DatingMain.class),
TEST_1("test-1", 0, null),

View file

@@ -3,6 +3,7 @@ package nu.marginalia.wmsa.configuration;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Properties;
public class WmsaHome {
private static final String DEFAULT = "/var/lib/wmsa";
@@ -32,4 +33,27 @@ public class WmsaHome {
public static Path getIPLocationDatabse() {
return getHomePath().resolve("data").resolve("IP2LOCATION-LITE-DB1.CSV");
}
public static Path getDisk(String name) throws IOException {
Path p = Path.of(getDiskProperties().getProperty(name));
if (!Files.isDirectory(p)) {
throw new IOException(name + " does not exist!");
}
return p;
}
public static Properties getDiskProperties() throws IOException {
Path settingsFile = getHomePath().resolve("conf/disks.properties");
if (Files.isRegularFile(settingsFile)) {
try (var is = Files.newInputStream(settingsFile)) {
var props = new Properties();
props.load(is);
return props;
}
}
else {
throw new IOException("Could not find disk settings " + settingsFile);
}
}
}
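
Call sites can then resolve named storage locations through the new properties file; a one-line usage sketch, assuming the encyclopedia mapping written by the E2E init script above (the call site itself is hypothetical):

// Reads conf/disks.properties, e.g. encyclopedia=/var/lib/wmsa/encyclopedia
Path encyclopediaDir = WmsaHome.getDisk("encyclopedia");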

View file

@@ -35,7 +35,7 @@ public class DatabaseModule extends AbstractModule {
}
private Properties loadDbProperties() {
Path propDir = WmsaHome.getHomePath().resolve("db.properties");
Path propDir = WmsaHome.getHomePath().resolve("conf/db.properties");
if (!Files.isRegularFile(propDir)) {
throw new IllegalStateException("Database properties file " + propDir + " does not exist");
}

View file

@@ -1,33 +0,0 @@
package nu.marginalia.wmsa.edge.archive;
import com.google.inject.Guice;
import com.google.inject.Inject;
import com.google.inject.Injector;
import nu.marginalia.wmsa.configuration.MainClass;
import nu.marginalia.wmsa.configuration.ServiceDescriptor;
import nu.marginalia.wmsa.configuration.module.ConfigurationModule;
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
import nu.marginalia.wmsa.configuration.server.Initialization;
public class EdgeArchiveMain extends MainClass {
private final EdgeArchiveService service;
@Inject
public EdgeArchiveMain(EdgeArchiveService service) {
this.service = service;
}
public static void main(String... args) {
init(ServiceDescriptor.EDGE_ARCHIVE, args);
Injector injector = Guice.createInjector(
new EdgeArchiveModule(),
new ConfigurationModule(),
new DatabaseModule()
);
injector.getInstance(EdgeArchiveMain.class);
injector.getInstance(Initialization.class).setReady();
}
}

View file

@@ -1,15 +0,0 @@
package nu.marginalia.wmsa.edge.archive;
import com.google.inject.AbstractModule;
import com.google.inject.name.Names;
import java.nio.file.Path;
public class EdgeArchiveModule extends AbstractModule {
public void configure() {
bind(Path.class).annotatedWith(Names.named("archive-path")).toInstance(Path.of("/var/lib/wmsa/archive/webpage/"));
bind(Path.class).annotatedWith(Names.named("wiki-path")).toInstance(Path.of("/var/lib/wmsa/archive.fast/wiki/"));
bind(Integer.class).annotatedWith(Names.named("archive-size")).toInstance(10_000);
}
}

View file

@@ -1,180 +0,0 @@
package nu.marginalia.wmsa.edge.archive;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import com.google.inject.Inject;
import com.google.inject.name.Named;
import io.prometheus.client.Histogram;
import lombok.SneakyThrows;
import nu.marginalia.wmsa.configuration.server.Initialization;
import nu.marginalia.wmsa.configuration.server.MetricsServer;
import nu.marginalia.wmsa.configuration.server.Service;
import nu.marginalia.wmsa.edge.archive.archiver.ArchivedFile;
import nu.marginalia.wmsa.edge.archive.archiver.Archiver;
import nu.marginalia.wmsa.edge.archive.request.EdgeArchiveSubmissionReq;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import spark.Request;
import spark.Response;
import spark.Spark;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.stream.Collectors;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;
public class EdgeArchiveService extends Service {
private final Logger logger = LoggerFactory.getLogger(getClass());
private final Gson gson = new GsonBuilder().create();
private static final Histogram wmsa_archive_store_time = Histogram.build().name("wmsa_archive_store_time").help("-").register();
private static final Histogram wmsa_archive_fetch_time = Histogram.build().name("wmsa_archive_fetch_time").help("-").register();
private final Path wikiPath;
private final Archiver archiver;
@SneakyThrows
@Inject
public EdgeArchiveService(@Named("service-host") String ip,
@Named("service-port") Integer port,
@Named("wiki-path") Path wikiPath,
Archiver archiver,
Initialization initialization,
MetricsServer metricsServer)
{
super(ip, port, initialization, metricsServer);
this.wikiPath = wikiPath;
this.archiver = archiver;
Spark.staticFiles.expireTime(600);
Spark.post("/page/submit", this::pathPageSubmit);
Spark.post("/wiki/submit", this::pathWikiSubmit);
Spark.get("/wiki/has", this::pathWikiHas);
Spark.get("/wiki/get", this::pathWikiGet);
Spark.awaitInitialization();
}
@SneakyThrows
private Object pathPageSubmit(Request request, Response response) {
var timer = wmsa_archive_store_time.startTimer();
try {
var body = request.body();
var data = gson.fromJson(body, EdgeArchiveSubmissionReq.class);
String domainNamePart = data.getUrl().domain.domain.length() > 32 ? data.getUrl().domain.domain.substring(0, 32) : data.getUrl().domain.domain;
String fileName = String.format("%s-%10d", domainNamePart, data.getUrl().hashCode());
archiver.writeData(new ArchivedFile(fileName, body.getBytes()));
return "ok";
} finally {
timer.observeDuration();
}
}
@SneakyThrows
private Object pathWikiSubmit(Request request, Response response) {
var timer = wmsa_archive_store_time.startTimer();
try {
byte[] data = request.bodyAsBytes();
String wikiUrl = request.queryParams("url");
Path filename = getWikiFilename(wikiPath, wikiUrl);
Files.createDirectories(filename.getParent());
System.out.println(new String(data));
logger.debug("Writing {} to {}", wikiUrl, filename);
try (var gos = new GZIPOutputStream(new FileOutputStream(filename.toFile()))) {
gos.write(data);
gos.flush();
}
return "ok";
} finally {
timer.observeDuration();
}
}
private Path getWikiFilename(Path base, String url) {
Path p = base;
int urlHash = url.hashCode();
p = p.resolve(Integer.toString(urlHash & 0xFF));
p = p.resolve(Integer.toString((urlHash>>>8) & 0xFF));
p = p.resolve(Integer.toString((urlHash>>>16) & 0xFF));
p = p.resolve(Integer.toString((urlHash>>>24) & 0xFF));
String fileName = url.chars()
.mapToObj(this::encodeUrlChar)
.collect(Collectors.joining());
if (fileName.length() > 128) {
fileName = fileName.substring(0, 128) + (((long)urlHash)&0xFFFFFFFFL);
}
return p.resolve(fileName + ".gz");
}
private String encodeUrlChar(int i) {
if (i >= 'a' && i <= 'z') {
return Character.toString(i);
}
if (i >= 'A' && i <= 'Z') {
return Character.toString(i);
}
if (i >= '0' && i <= '9') {
return Character.toString(i);
}
if (i == '.') {
return Character.toString(i);
}
else {
return String.format("%%%2X", i);
}
}
@SneakyThrows
private Object pathWikiHas(Request request, Response response) {
return Files.exists(getWikiFilename(wikiPath, request.queryParams("url")));
}
@SneakyThrows
private String pathWikiGet(Request request, Response response) {
var timer = wmsa_archive_fetch_time.startTimer();
try {
String url = request.queryParams("url");
var filename = getWikiFilename(wikiPath, url);
if (Files.exists(filename)) {
try (var stream = new GZIPInputStream(new FileInputStream(filename.toFile()))) {
return new String(stream.readAllBytes());
}
} else {
Spark.halt(404);
return null;
}
}
finally {
timer.observeDuration();
}
}
}
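
The removed service sharded the wiki store by URL hash: four directory levels of one byte each, then a percent-encoded file name truncated at 128 characters. The bucketing step in isolation, as a standalone (hypothetical) helper:

// Derives the 4-level bucket directory from the 32-bit hash of the URL,
// exactly as getWikiFilename above did before appending the encoded name.
static Path wikiBucket(Path base, String url) {
    int h = url.hashCode();
    return base.resolve(Integer.toString(h & 0xFF))
               .resolve(Integer.toString((h >>> 8) & 0xFF))
               .resolve(Integer.toString((h >>> 16) & 0xFF))
               .resolve(Integer.toString((h >>> 24) & 0xFF));
}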

View file

@@ -1,65 +0,0 @@
package nu.marginalia.wmsa.edge.archive.archiver;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import nu.marginalia.wmsa.edge.archive.request.EdgeArchiveSubmissionReq;
import nu.marginalia.wmsa.edge.model.crawl.EdgeRawPageContents;
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.BufferedInputStream;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.function.Consumer;
public class ArchiveExtractor {
private final Path archivePath;
private final String arhivePattern = "archive-%04d.tar.gz";
private final Logger logger = LoggerFactory.getLogger(getClass());
private final Gson gson = new GsonBuilder().create();
public ArchiveExtractor(Path archivePath) {
this.archivePath = archivePath;
}
public void forEach(Consumer<EdgeRawPageContents> contents) {
for (int i = 0; ; ++i) {
var fn = getArchiveFile(i);
logger.info("{}", fn);
if (!Files.exists(fn)) {
break;
}
try (var stream = new TarArchiveInputStream(new GzipCompressorInputStream(new BufferedInputStream(new FileInputStream(fn.toFile()))))) {
TarArchiveEntry entry;
while ((entry = stream.getNextTarEntry()) != null) {
if (entry.isFile()) {
try {
var obj = gson.fromJson(new InputStreamReader(stream), EdgeArchiveSubmissionReq.class);
if (obj != null) {
contents.accept(obj.getData());
}
}
catch (Exception ex) {
logger.error("Could not unpack {} - {} {}", entry.getName(), ex.getClass().getSimpleName(), ex.getMessage());
}
}
}
} catch (Exception e) {
e.printStackTrace();
}
}
}
private Path getArchiveFile(int number) {
final String fileName = String.format(arhivePattern, number);
return archivePath.resolve(fileName);
}
}

View file

@@ -1,5 +0,0 @@
package nu.marginalia.wmsa.edge.archive.archiver;
public record ArchivedFile(String filename,byte[] data ) {
}

View file

@@ -1,113 +0,0 @@
package nu.marginalia.wmsa.edge.archive.archiver;
import com.google.inject.name.Named;
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
import org.apache.commons.compress.archivers.tar.TarArchiveOutputStream;
import org.apache.commons.compress.compressors.gzip.GzipCompressorOutputStream;
import org.apache.commons.io.IOUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.inject.Inject;
import javax.inject.Singleton;
import java.io.ByteArrayInputStream;
import java.io.FileOutputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.concurrent.LinkedBlockingDeque;
import java.util.concurrent.TimeUnit;
@Singleton
public class Archiver implements AutoCloseable {
private final Path archivePath;
private final int filesPerArchive;
private final String arhivePattern = "archive-%04d.tar.gz";
private final LinkedBlockingDeque<ArchivedFile> writeQueue = new LinkedBlockingDeque<>(10);
private final Thread writeThread;
private volatile int archiveNumber;
private volatile boolean running;
private final Logger logger = LoggerFactory.getLogger(getClass());
@Inject
public Archiver(@Named("archive-path") Path archivePath, @Named("archive-size") Integer filesPerArchive) {
this.archivePath = archivePath;
this.filesPerArchive = filesPerArchive;
if (!Files.exists(archivePath)) {
throw new IllegalArgumentException("Archive path does not exist");
}
for (int i = 0;; ++i) {
if (!Files.exists(getArchiveFile(i))) {
archiveNumber = i;
break;
}
}
running = true;
writeThread = new Thread(this::writeThreadMain, "ArchiveWriteThread");
writeThread.start();
}
private Path getArchiveFile(int number) {
final String fileName = String.format(arhivePattern, number);
return archivePath.resolve(fileName);
}
public void writeData(ArchivedFile file) throws InterruptedException {
if (!running) throw new IllegalStateException("Archiver is closing or closed");
writeQueue.put(file);
}
private void writeThreadMain() {
try {
while (running || !writeQueue.isEmpty()) {
writeToFile(archiveNumber);
archiveNumber++;
}
running = false;
}
catch (Exception ex) {
logger.error("Uncaught exception in writer thread!!");
}
}
private void writeToFile(int archiveNumber) {
var archiveFile = getArchiveFile(archiveNumber);
logger.info("Switching to file {}", archiveFile);
try (TarArchiveOutputStream taos = new TarArchiveOutputStream(new GzipCompressorOutputStream(new FileOutputStream(archiveFile.toFile())))) {
for (int i = 0; i < filesPerArchive; i++) {
ArchivedFile writeJob = null;
while (writeJob == null) {
writeJob = writeQueue.poll(1, TimeUnit.SECONDS);
if (!running) return;
}
var entry = new TarArchiveEntry(String.format("%06d-%s", i, writeJob.filename()));
entry.setSize(writeJob.data().length);
taos.putArchiveEntry(entry);
logger.debug("Writing {} to {}", writeJob.filename(), archiveFile);
try (var bais = new ByteArrayInputStream(writeJob.data())) {
IOUtils.copy(bais, taos);
}
taos.closeArchiveEntry();
}
taos.finish();
logger.debug("Finishing {}", archiveFile);
} catch (Exception e) {
logger.error("Error", e);
}
}
@Override
public void close() throws Exception {
running = false;
writeThread.join();
}
}
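
The removed Archiver was a single-writer producer/consumer: writeData(...) blocks once the 10-slot queue is full, and the write thread rolls over to a new tar.gz after filesPerArchive entries. The old call pattern, with hypothetical arguments:

// Queue-backed archival; writeData() applies backpressure via the bounded deque
try (var archiver = new Archiver(Path.of("/var/lib/wmsa/archive/webpage"), 10_000)) {
    archiver.writeData(new ArchivedFile("page-000001", pageBytes));
} // close() stops the write thread and joins it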

View file

@@ -1,56 +0,0 @@
package nu.marginalia.wmsa.edge.archive.client;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import io.reactivex.rxjava3.core.Observable;
import nu.marginalia.wmsa.client.AbstractDynamicClient;
import nu.marginalia.wmsa.client.HttpStatusCode;
import nu.marginalia.wmsa.configuration.ServiceDescriptor;
import nu.marginalia.wmsa.configuration.server.Context;
import nu.marginalia.wmsa.edge.archive.request.EdgeArchiveSubmissionReq;
import nu.marginalia.wmsa.edge.model.crawl.EdgeRawPageContents;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import okhttp3.MediaType;
import org.eclipse.jetty.util.UrlEncoded;
import javax.annotation.CheckReturnValue;
import java.util.concurrent.Semaphore;
@Singleton
public class ArchiveClient extends AbstractDynamicClient {
private final Semaphore submitPageSem = new Semaphore(3, true);
@Inject
public ArchiveClient() {
super(ServiceDescriptor.EDGE_ARCHIVE);
}
@CheckReturnValue
public void submitPage(Context ctx, EdgeUrl url, EdgeRawPageContents data) throws InterruptedException {
try {
submitPageSem.acquire();
super.post(ctx, "/page/submit", new EdgeArchiveSubmissionReq(url, data)).blockingSubscribe();
}
finally {
submitPageSem.release();
}
}
@CheckReturnValue
public Observable<HttpStatusCode> submitWiki(Context ctx, String url, String data) {
return super.post(ctx, "/wiki/submit?url="+UrlEncoded.encodeString(url), data, MediaType.parse("text/plain; charset=UTF-8"));
}
@CheckReturnValue
public Observable<Boolean> hasWiki(Context ctx, String url) {
return super.get(ctx, "/wiki/has?url="+UrlEncoded.encodeString(url), Boolean.class);
}
@CheckReturnValue
public Observable<String> getWiki(Context ctx, String url) {
return super.get(ctx, "/wiki/get?url="+UrlEncoded.encodeString(url));
}
}

View file

@@ -1,13 +0,0 @@
package nu.marginalia.wmsa.edge.archive.request;
import lombok.AllArgsConstructor;
import lombok.Getter;
import lombok.ToString;
import nu.marginalia.wmsa.edge.model.crawl.EdgeRawPageContents;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
@AllArgsConstructor @Getter @ToString
public class EdgeArchiveSubmissionReq {
EdgeUrl url;
EdgeRawPageContents data;
}

View file

@@ -4,36 +4,27 @@ import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import com.google.inject.Inject;
import com.google.inject.name.Named;
import io.reactivex.rxjava3.core.Observable;
import lombok.SneakyThrows;
import nu.marginalia.wmsa.configuration.server.*;
import nu.marginalia.wmsa.edge.archive.client.ArchiveClient;
import nu.marginalia.wmsa.configuration.server.Initialization;
import nu.marginalia.wmsa.configuration.server.MetricsServer;
import nu.marginalia.wmsa.configuration.server.Service;
import nu.marginalia.wmsa.edge.assistant.dict.DictionaryService;
import nu.marginalia.wmsa.edge.assistant.eval.MathParser;
import nu.marginalia.wmsa.edge.assistant.eval.Units;
import nu.marginalia.wmsa.edge.assistant.screenshot.ScreenshotService;
import nu.marginalia.wmsa.edge.assistant.suggest.Suggestions;
import nu.marginalia.wmsa.renderer.mustache.MustacheRenderer;
import nu.marginalia.wmsa.renderer.mustache.RendererFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import spark.Request;
import spark.Response;
import spark.Spark;
import java.util.Map;
public class EdgeAssistantService extends Service {
private final Logger logger = LoggerFactory.getLogger(getClass());
private final Gson gson = new GsonBuilder().create();
private final Units units;
private final DictionaryService dictionaryService;
private final MathParser mathParser;
private final ArchiveClient archiveClient;
private final ScreenshotService screenshotService;
private final MustacheRenderer<String> wikiErrorPageRenderer;
private final MustacheRenderer<Object> wikiSearchResultRenderer;
private final Suggestions suggestions;
@SneakyThrows
@@ -45,40 +36,22 @@ public class EdgeAssistantService extends Service {
DictionaryService dictionaryService,
MathParser mathParser,
Units units,
ArchiveClient archiveClient,
RendererFactory rendererFactory,
ScreenshotService screenshotService,
Suggestions suggestions
)
{
super(ip, port, initialization, metricsServer);
this.dictionaryService = dictionaryService;
this.mathParser = mathParser;
this.units = units;
this.archiveClient = archiveClient;
this.screenshotService = screenshotService;
this.suggestions = suggestions;
Spark.staticFiles.expireTime(600);
if (rendererFactory != null) {
wikiErrorPageRenderer = rendererFactory.renderer("encyclopedia/wiki-error");
wikiSearchResultRenderer = rendererFactory.renderer("encyclopedia/wiki-search");
}
else {
wikiErrorPageRenderer = null;
wikiSearchResultRenderer = null;
}
Spark.get("/public/wiki/*", this::getWikiPage);
Spark.get("/public/wiki-search", this::searchWikiPage);
Spark.get("/public/screenshot/:id", screenshotService::serveScreenshotRequest);
Spark.get("/screenshot/:id", screenshotService::serveScreenshotRequest);
Spark.get("/dictionary/:word", (req, rsp) -> dictionaryService.define(req.params("word")), this::convertToJson);
Spark.get("/spell-check/:term", (req, rsp) -> dictionaryService.spellCheck(req.params("term").toLowerCase()), this::convertToJson);
Spark.get("/encyclopedia/:term", (req, rsp) -> dictionaryService.encyclopedia(req.params("term")), this::convertToJson);
Spark.get("/unit-conversion", (req, rsp) -> unitConversion(
rsp,
req.queryParams("value"),
@@ -106,57 +79,6 @@ public class EdgeAssistantService extends Service {
return suggestions.getSuggestions(10, param);
}
@SneakyThrows
private Object getWikiPage(Request req, Response rsp) {
final var ctx = Context.fromRequest(req);
final String[] splats = req.splat();
if (splats.length == 0)
rsp.redirect("https://encyclopedia.marginalia.nu/wiki-start.html");
final String s = splats[0];
String pageName = dictionaryService.resolveEncylopediaRedirect(s).orElse(s);
logger.info("Resolved {} -> {}", s, pageName);
return archiveClient.getWiki(ctx, pageName)
.onErrorResumeWith(resolveWikiPageNameWrongCase(ctx, s))
.blockingFirst();
}
private Observable<String> resolveWikiPageNameWrongCase(Context ctx, String s) {
var rsp = dictionaryService.findEncyclopediaPageDirect(s);
if (rsp.isEmpty()) {
return renderSearchPage(s);
}
return archiveClient.getWiki(ctx, rsp.get().getInternalName())
.onErrorResumeWith(renderSearchPage(s));
}
private Observable<String> renderSearchPage(String s) {
return Observable.fromCallable(() -> wikiSearchResultRenderer.render(
Map.of("query", s,
"error", "true",
"results", dictionaryService.findEncyclopediaPages(s))));
}
@SneakyThrows
private Object searchWikiPage(Request req, Response rsp) {
final var ctx = Context.fromRequest(req);
String term = req.queryParams("query");
if (null == term) {
rsp.redirect("https://encyclopedia.marginalia.nu/wiki-start.html");
return "";
}
return wikiSearchResultRenderer.render(
Map.of("query", term,
"results",
dictionaryService.findEncyclopediaPages(term))
);
}
private Object evalExpression(Response rsp, String value) {
try {
var val = mathParser.evalFormatted(value);

View file

@@ -24,10 +24,6 @@ public class AssistantClient extends AbstractDynamicClient {
return super.get(ctx,"/dictionary/" + UrlEncoded.encodeString(word), DictionaryResponse.class);
}
public Observable<WikiArticles> encyclopediaLookup(Context ctx, String word) {
return super.get(ctx,"/encyclopedia/" + UrlEncoded.encodeString(word), WikiArticles.class);
}
@SuppressWarnings("unchecked")
public Observable<List<String>> spellCheck(Context ctx, String word) {
return (Observable<List<String>>) (Object) super.get(ctx,"/spell-check/" + UrlEncoded.encodeString(word), List.class);

View file

@@ -43,142 +43,6 @@ public class DictionaryService {
return response;
}
public WikiArticles encyclopedia(String term) {
WikiArticles response = new WikiArticles();
response.entries = new ArrayList<>();
try (var connection = dataSource.getConnection()) {
var stmt = connection.prepareStatement("SELECT DISTINCT(NAME_LOWER) FROM REF_WIKI_TITLE WHERE NAME_LOWER=?");
stmt.setString(1, term);
var rsp = stmt.executeQuery();
while (rsp.next()) {
response.entries.add(capitalizeWikiString(rsp.getString(1)));
}
}
catch (Exception ex) {
logger.error("Failed to fetch articles", ex);
return new WikiArticles();
}
return response;
}
public Optional<String> resolveEncylopediaRedirect(String term) {
final List<String> matches = new ArrayList<>();
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement("SELECT NAME, REF_NAME FROM REF_WIKI_TITLE WHERE NAME_LOWER=LOWER(?)")) {
stmt.setString(1, term);
var rsp = stmt.executeQuery();
while (rsp.next()) {
if (term.equals(rsp.getString(1))
|| rsp.getString(2) == null) {
return Optional.ofNullable(rsp.getString(2));
} else {
matches.add(rsp.getString(2));
}
}
}
}
catch (Exception ex) {
throw new RuntimeException(ex);
}
if (!matches.isEmpty()) {
return Optional.of(matches.get(0));
}
return Optional.empty();
}
public Optional<WikiSearchResult> findEncyclopediaPageDirect(String term) {
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement("SELECT NAME, REF_NAME FROM REF_WIKI_TITLE WHERE NAME_LOWER=LOWER(?)")) {
stmt.setString(1, term.replace(' ', '_'));
var rsp = stmt.executeQuery();
while (rsp.next()) {
String name = rsp.getString(1);
String refName = rsp.getString(2);
if (refName == null) {
return Optional.of(new WikiSearchResult(name, null));
}
}
}
}
catch (Exception ex) {
throw new RuntimeException(ex);
}
return Optional.empty();
}
public List<WikiSearchResult> findEncyclopediaPages(String term) {
final List<WikiSearchResult> directMatches = new ArrayList<>();
final Set<WikiSearchResult> directSearchMatches = new HashSet<>();
final Set<WikiSearchResult> indirectMatches = new HashSet<>();
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement("SELECT NAME, REF_NAME FROM REF_WIKI_TITLE WHERE NAME_LOWER=LOWER(?)")) {
stmt.setString(1, term.replace(' ', '_'));
var rsp = stmt.executeQuery();
while (rsp.next()) {
String name = rsp.getString(1);
String refName = rsp.getString(2);
if (refName == null) {
directMatches.add(new WikiSearchResult(name, null));
} else {
indirectMatches.add(new WikiSearchResult(name, refName));
}
}
}
try (var stmt = connection.prepareStatement("SELECT NAME, REF_NAME FROM REF_WIKI_TITLE WHERE NAME_LOWER LIKE ? LIMIT 10")) {
stmt.setString(1, term.replace(' ', '_').replaceAll("%", "\\%").toLowerCase() + "%");
var rsp = stmt.executeQuery();
while (rsp.next()) {
String name = rsp.getString(1);
String refName = rsp.getString(2);
if (refName == null) {
directSearchMatches.add(new WikiSearchResult(name, null));
} else {
indirectMatches.add(new WikiSearchResult(name, refName));
}
}
}
}
catch (Exception ex) {
throw new RuntimeException(ex);
}
directMatches.forEach(indirectMatches::remove);
indirectMatches.removeAll(directSearchMatches);
directMatches.forEach(directSearchMatches::remove);
directMatches.addAll(indirectMatches);
directMatches.addAll(directSearchMatches);
return directMatches;
}
private String capitalizeWikiString(String string) {
if (string.contains("_")) {
return Arrays.stream(string.split("_")).map(this::capitalizeWikiString).collect(Collectors.joining("_"));
}
if (string.length() < 2) {
return string.toUpperCase();
}
return Character.toUpperCase(string.charAt(0)) + string.substring(1).toLowerCase();
}
public List<String> spellCheck(String word) {
return spellChecker.correct(word);
}

View file

@@ -20,6 +20,7 @@ import nu.marginalia.wmsa.edge.search.results.SearchResultValuator;
import nu.marginalia.wmsa.edge.search.results.model.AccumulatedQueryResults;
import nu.marginalia.wmsa.edge.search.results.SearchResultDecorator;
import nu.marginalia.wmsa.edge.search.results.UrlDeduplicator;
import nu.marginalia.wmsa.encyclopedia.EncyclopediaClient;
import org.apache.logging.log4j.util.Strings;
import org.jetbrains.annotations.NotNull;
import org.slf4j.Logger;
@@ -33,6 +34,7 @@ public class EdgeSearchOperator {
private static final Logger logger = LoggerFactory.getLogger(EdgeSearchOperator.class);
private final AssistantClient assistantClient;
private final EncyclopediaClient encyclopediaClient;
private final EdgeDataStoreDao edgeDataStoreDao;
private final EdgeIndexClient indexClient;
private final QueryFactory queryFactory;
@@ -42,6 +44,7 @@ public class EdgeSearchOperator {
@Inject
public EdgeSearchOperator(AssistantClient assistantClient,
EncyclopediaClient encyclopediaClient,
EdgeDataStoreDao edgeDataStoreDao,
EdgeIndexClient indexClient,
QueryFactory queryFactory,
@@ -50,6 +53,7 @@ public class EdgeSearchOperator {
) {
this.assistantClient = assistantClient;
this.encyclopediaClient = encyclopediaClient;
this.edgeDataStoreDao = edgeDataStoreDao;
this.indexClient = indexClient;
this.queryFactory = queryFactory;
@@ -220,7 +224,7 @@ public class EdgeSearchOperator {
@NotNull
private Observable<WikiArticles> getWikiArticle(Context ctx, String humanQuery) {
return assistantClient
return encyclopediaClient
.encyclopediaLookup(ctx,
humanQuery.replaceAll("\\s+", "_")
.replaceAll("\"", "")

View file

@@ -1,384 +0,0 @@
package nu.marginalia.wmsa.edge.tools;
import com.zaxxer.hikari.HikariDataSource;
import gnu.trove.map.hash.TIntObjectHashMap;
import gnu.trove.map.hash.TObjectIntHashMap;
import lombok.AllArgsConstructor;
import lombok.SneakyThrows;
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
import nu.marginalia.wmsa.edge.archive.archiver.ArchiveExtractor;
import nu.marginalia.wmsa.edge.assistant.dict.NGramDict;
import nu.marginalia.wmsa.edge.converting.processor.logic.LinkParser;
import nu.marginalia.util.language.LanguageFilter;
import nu.marginalia.util.language.WordPatterns;
import nu.marginalia.util.language.conf.LanguageModels;
import nu.marginalia.util.language.processing.DocumentKeywordExtractor;
import nu.marginalia.util.language.processing.SentenceExtractor;
import nu.marginalia.wmsa.edge.converting.processor.logic.HtmlFeature;
import nu.marginalia.wmsa.edge.converting.processor.logic.HtmlStandardExtractor;
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
import nu.marginalia.wmsa.edge.index.service.dictionary.DictionaryWriter;
import nu.marginalia.wmsa.edge.index.service.index.SearchIndexWriterImpl;
import nu.marginalia.wmsa.edge.model.*;
import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard;
import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet;
import nu.marginalia.wmsa.edge.model.crawl.EdgeRawPageContents;
import org.apache.commons.lang3.tuple.Pair;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.mariadb.jdbc.Driver;
import java.io.File;
import java.nio.file.Path;
import java.sql.SQLException;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.LinkedBlockingQueue;
import static nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard.UNKNOWN;
public class ConverterMain {
static final LinkedBlockingQueue<EdgeRawPageContents> processQueue = new LinkedBlockingQueue<>(20);
static final LinkedBlockingQueue<UploadJob> uploadQueue = new LinkedBlockingQueue<>(2);
static final TObjectIntHashMap<String> urlToIdMap = new TObjectIntHashMap<>(50_000_000, 0.5f, -1);
static final TObjectIntHashMap<String> domainToIdMap = new TObjectIntHashMap<>(5_000_000, 0.5f, -1);
static final TIntObjectHashMap<String> idToDomainMap = new TIntObjectHashMap<>(5_000_000, 0.5f, -1);
static HikariDataSource conn;
private static SearchIndexWriterImpl indexWriter;
private static DictionaryWriter dictionaryWriter;
@AllArgsConstructor
static class UploadJob {
EdgeId<EdgeDomain> domainId;
EdgeId<EdgeUrl> urlId;
EdgePageWordSet words;
int wordCount;
}
static volatile boolean running = true;
public static void main(String... args) {
org.mariadb.jdbc.Driver driver = new Driver();
dictionaryWriter = new DictionaryWriter(new File(args[0]), 1L << 30, true);
indexWriter = new SearchIndexWriterImpl(dictionaryWriter, new File(args[1]));
new Thread(ConverterMain::uploadThread, "Uploader").start();
for (int i = 0; i < 24; i++) {
new Thread(ConverterMain::processorThread, "Processor-"+i).start();
}
conn = new DatabaseModule().provideConnection();
System.out.println("Loading URLs and domains");
try (var c = conn.getConnection();
var getUrlsStmt = c.prepareStatement("SELECT EC_URL.ID, DOMAIN_ID, PROTO, URL FROM EC_URL WHERE VISITED");
var getDomainsStmt = c.prepareStatement("SELECT ID, URL_PART FROM EC_DOMAIN WHERE INDEXED>0")
) {
getUrlsStmt.setFetchSize(10_000);
getDomainsStmt.setFetchSize(10_000);
System.out.println("Fetch domains");
var domainRsp = getDomainsStmt.executeQuery();
while (domainRsp.next()) {
domainToIdMap.put(domainRsp.getString(2), domainRsp.getInt(1));
idToDomainMap.put(domainRsp.getInt(1), domainRsp.getString(2));
}
System.out.println("Fetch URLs");
var urlRsp = getUrlsStmt.executeQuery();
while (urlRsp.next()) {
String urlStr = urlRsp.getString(3) + "://" + idToDomainMap.get(urlRsp.getInt(2)) + urlRsp.getString(4);
urlToIdMap.put(urlStr, urlRsp.getInt(1));
}
} catch (SQLException throwables) {
throwables.printStackTrace();
}
// new Thread(ConverterMain::uploadThread, "Uploader").start();
//
// for (int i = 0; i < 24; i++) {
// new Thread(ConverterMain::processorThread, "Processor-"+i).start();
// }
System.out.println("Loaded URLs and domains");
new ArchiveExtractor(Path.of(args[2])).forEach(
page -> {
if (page.contentType.contentType.startsWith("application/xhtml")
|| page.contentType.contentType.startsWith("text/html")) {
try {
int domainId = domainToIdMap.get(page.url.domain.toString());
if (domainId >= 0 && page.redirectUrl == null) {
int urlId = urlToIdMap.get(page.url.toString());
int dataHash = page.data.hashCode();
try (var c = conn.getConnection();
var updateHash = c.prepareStatement("UPDATE EC_URL SET DATA_HASH=? WHERE ID=?"))
{
updateHash.setInt(1, dataHash);
updateHash.setInt(2, urlId);
updateHash.executeUpdate();
}
catch (Exception ex) {
ex.printStackTrace();
}
}
} catch (Exception e) {
e.printStackTrace();
}
}
});
running = false;
}
static final LanguageModels lm = new LanguageModels(
Path.of("/var/lib/wmsa/model/ngrams-generous-emstr.bin"),
Path.of("/var/lib/wmsa/model/tfreq-new-algo3.bin"),
Path.of("/var/lib/wmsa/model/opennlp-sentence.bin"),
Path.of("/var/lib/wmsa/model/English.RDR"),
Path.of("/var/lib/wmsa/model/English.DICT"),
Path.of("/var/lib/wmsa/model/opennlp-tok.bin")
);
static final NGramDict dict = new NGramDict(lm);
private static final LanguageFilter languageFilter = new LanguageFilter();
private static final LinkParser linkParser = new LinkParser();
public static void processorThread() {
SentenceExtractor newSe = new SentenceExtractor(lm);
DocumentKeywordExtractor documentKeywordExtractor = new DocumentKeywordExtractor(dict);
try {
while (running || !processQueue.isEmpty()) {
var job = processQueue.take();
if (job.data.length() > 512*1024) {
System.out.println(job.url + " too big, skipping");
}
var parsed = Jsoup.parse(job.data);
var text = parsed.text();
if (languageFilter.isBlockedUnicodeRange(text)) {
continue;
}
var dld = newSe.extractSentences(parsed.clone());
var keywords = documentKeywordExtractor.extractKeywords(dld);
int wc = dld.totalNumWords();
if (wc > 100) {
double languageAgreement = languageFilter.dictionaryAgreement(dld);
if (languageAgreement < 0.05) {
continue;
}
}
EdgeHtmlStandard htmlStandard = HtmlStandardExtractor.parseDocType(parsed.documentType());
if (UNKNOWN.equals(htmlStandard)) {
htmlStandard = HtmlStandardExtractor.sniffHtmlStandard(parsed);
}
int scriptTags = getScriptPenalty(parsed);
var featureSet = getFeatureSet(parsed, scriptTags, job.hasCookies);
addTags(keywords, htmlStandard, job.url, featureSet);
extractLinkWords(keywords, job.getUrl(), parsed);
uploadQueue.put(new UploadJob(
new EdgeId<>(domainToIdMap.get(job.url.domain.toString())),
new EdgeId<>(urlToIdMap.get(job.url.toString())),
keywords,
0
));
}
}
catch (InterruptedException ex) {
ex.printStackTrace();
}
}
private static Map<EdgeUrl, Set<String>> extractLinkWords(EdgePageWordSet keywords, EdgeUrl pageUrl, Document parsed) {
List<Pair<EdgeUrl, String>> urls = new ArrayList<>();
Set<String> linkKeywords = new HashSet<>();
Map<EdgeUrl, Set<String>> linkTextWords = new ConcurrentHashMap<>();
for (var tag : parsed.getElementsByTag("a")) {
if (!tag.hasAttr("href")) {
continue;
}
if (urls.size() > 100) {
break;
}
var linkOpt = linkParser.parseLink(pageUrl, tag);
if (linkOpt.isEmpty())
continue;
var link = linkOpt.get();
urls.add(Pair.of(link, tag.text()));
if (!Objects.equals(link.domain.domain, pageUrl.domain.domain)
&& linkKeywords.size() <= 25)
{
linkKeywords.add("links:" + link.domain.domain);
}
//
// Set<String> words = new HashSet<>();
//
// for (var sent : sentenceExtractor.extractSentencesFromString(tag.text())) {
// for (var keyword : keywordExtractor.getWordsFromSentence(sent)) {
// words.add(sent.constructWordFromSpan(keyword));
// }
// }
//
// linkTextWords.compute(link, (k, set) -> {
// if (set == null) return words;
// else { set.addAll(words); return set; }
// });
}
keywords.get(IndexBlock.Meta).addAll(linkKeywords);
if (WordPatterns.wordQualitiesPredicate.test(pageUrl.domain.domain.toLowerCase())) {
keywords.get(IndexBlock.Link).addJust(pageUrl.domain.domain.toLowerCase());
}
return linkTextWords;
}
private static int getScriptPenalty(Document parsed) {
var scriptTags = parsed.getElementsByTag("script");
String scriptText = scriptTags.html();
int badScript = 0;
if (scriptText.contains(".createElement(")) {
badScript = 1;
}
return scriptTags.size() + badScript + (scriptText.length())/1000;
}
static final List<String> trackers = List.of("adform.net",
"connect.facebook",
"googletagmanager.com",
"googlesyndication.com",
"google.com",
"twitter.com",
"smartadserver.com",
"doubleclick.com",
"2mdn.com",
"dmtry.com",
"bing.com",
"msn.com",
"amazon-adsystem.com",
"alexametrics.com",
"rubiconproject.com",
"chango.com",
"d5nxst8fruw4z.cloudfront.net",
"d31qbv1cthcecs.cloudfront.net",
"linkedin.com");
private static Set<HtmlFeature> getFeatureSet(Document parsed, int scriptTags, boolean cookies) {
Set<HtmlFeature> features = new HashSet<>();
if (scriptTags > 0) {
features.add(HtmlFeature.JS);
}
if (!parsed.getElementsByTag("object").isEmpty()
|| !parsed.getElementsByTag("audio").isEmpty()
|| !parsed.getElementsByTag("video").isEmpty()) {
features.add(HtmlFeature.MEDIA);
}
if (parsed.getElementsByTag("script").stream()
.filter(tag -> tag.attr("src") != null)
.anyMatch(tag -> trackers.stream().anyMatch(tracker -> tag.attr("src").contains(tracker)))) {
features.add(HtmlFeature.TRACKING);
}
if (parsed.getElementsByTag("script").html().contains("google-analytics.com")) {
features.add(HtmlFeature.TRACKING);
}
if (parsed.getElementsByTag("a").stream().map(e -> e.attr("href"))
.filter(Objects::nonNull)
.map(String::toLowerCase)
.anyMatch(href ->
href.contains("amzn.to/") || href.contains("amazon.com/"))) {
features.add(HtmlFeature.AFFILIATE_LINK);
}
if (cookies) {
features.add(HtmlFeature.COOKIES);
}
return features;
}
private static void addTags(EdgePageWordSet wordSet, EdgeHtmlStandard htmlStandard, EdgeUrl url, Set<HtmlFeature> features) {
List<String> tagWords = new ArrayList<>();
tagWords.add("format:"+htmlStandard.toString().toLowerCase());
tagWords.add("site:"+url.domain.toString().toLowerCase());
tagWords.add("proto:"+url.proto.toLowerCase());
tagWords.add("js:" + Boolean.toString(features.contains(HtmlFeature.JS)).toLowerCase());
if (features.contains(HtmlFeature.MEDIA)) {
tagWords.add("special:media");
}
if (features.contains(HtmlFeature.TRACKING)) {
tagWords.add("special:tracking");
}
if (features.contains(HtmlFeature.AFFILIATE_LINK)) {
tagWords.add("special:affiliate");
}
if (features.contains(HtmlFeature.COOKIES)) {
tagWords.add("special:cookies");
}
wordSet.append(IndexBlock.Meta, tagWords);
wordSet.append(IndexBlock.Words, tagWords);
}
@SneakyThrows
public static void uploadThread() {
while (running || !processQueue.isEmpty() || !uploadQueue.isEmpty()) {
var data = uploadQueue.take();
if (!data.words.isEmpty()) {
for (var words : data.words.values()) {
if (!words.getWords().isEmpty()) {
if (words.size() < 1000) {
indexWriter.put(data.domainId, data.urlId, words.block, words.words);
} else {
chunks(words.words, 1000).forEach(chunk -> {
indexWriter.put(data.domainId, data.urlId, words.block, chunk);
});
}
}
}
}
}
System.out.println("Closing");
dictionaryWriter.commitToDisk();
indexWriter.forceWrite();
dictionaryWriter.close();
indexWriter.close();
System.out.println("Done");
}
private static <T> List<List<T>> chunks(Collection<T> coll, int size) {
List<List<T>> ret = new ArrayList<>();
List<T> data = List.copyOf(coll);
for (int i = 0; i < data.size(); i+=size) {
ret.add(data.subList(i, Math.min(data.size(), i+size)));
}
return ret;
}
}

View file

@@ -0,0 +1,59 @@
package nu.marginalia.wmsa.edge.tools;
import nu.marginalia.wmsa.configuration.server.Context;
import nu.marginalia.wmsa.edge.assistant.dict.WikiCleaner;
import nu.marginalia.wmsa.encyclopedia.EncyclopediaClient;
import org.openzim.ZIMTypes.ZIMFile;
import org.openzim.ZIMTypes.ZIMReader;
import java.io.IOException;
import java.util.concurrent.*;
public class EncyclopediaLoaderTool {
static final EncyclopediaClient encyclopediaClient = new EncyclopediaClient();
public static void main(String[] args) throws IOException, InterruptedException {
convertAll(args);
encyclopediaClient.close();
System.exit(0);
}
private static void convertAll(String[] args) throws IOException, InterruptedException {
var zr = new ZIMReader(new ZIMFile(args[0]));
var pool = Executors.newFixedThreadPool(8);
var sem = new Semaphore(12);
zr.forEachArticles((url, art) -> {
if (art != null) {
try {
sem.acquire();
pool.execute(() -> {
try {
convert(url, art);
} finally {
sem.release();
}
});
} catch (InterruptedException e) {
throw new RuntimeException(e);
}
}
}, p -> true);
sem.acquire(12);
encyclopediaClient.close();
}
private static void convert(String url, String art) {
String newData = new WikiCleaner().cleanWikiJunk("https://en.wikipedia.org/wiki/" + url, art);
if (null != newData) {
encyclopediaClient.submitWiki(Context.internal(), url, newData)
.retry(5)
.blockingSubscribe();
}
}
}
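
The loader throttles conversion with a 12-permit Semaphore over an 8-thread pool; the trailing sem.acquire(12) doubles as an await-all, since it can only succeed once every in-flight task has released its permit. The same idiom in isolation (thread and permit counts from this diff, task body hypothetical):

import java.util.concurrent.Executors;
import java.util.concurrent.Semaphore;

class ThrottleSketch {
    public static void main(String[] args) throws InterruptedException {
        var pool = Executors.newFixedThreadPool(8);
        var sem = new Semaphore(12);
        for (int i = 0; i < 100; i++) {
            sem.acquire();              // cap in-flight work at 12 tasks
            pool.execute(() -> {
                try { /* convert one article */ }
                finally { sem.release(); }
            });
        }
        sem.acquire(12);                // blocks until all permits are returned
        pool.shutdown();
    }
}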

View file

@@ -1,142 +0,0 @@
package nu.marginalia.wmsa.edge.tools;
import gnu.trove.set.hash.TLongHashSet;
import lombok.SneakyThrows;
import nu.marginalia.wmsa.edge.archive.archiver.ArchiveExtractor;
import nu.marginalia.wmsa.edge.assistant.dict.NGramDict;
import nu.marginalia.util.language.conf.LanguageModels;
import nu.marginalia.util.language.processing.KeywordExtractor;
import nu.marginalia.util.language.processing.SentenceExtractor;
import nu.marginalia.wmsa.edge.model.crawl.EdgeRawPageContents;
import opennlp.tools.stemmer.PorterStemmer;
import org.jsoup.Jsoup;
import java.io.BufferedOutputStream;
import java.io.DataOutputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.nio.file.Path;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.atomic.AtomicLong;
public class TermFrequencyCounterMain {
static final LinkedBlockingQueue<EdgeRawPageContents> processQueue = new LinkedBlockingQueue<>(20);
public static final String OUTPUT_FILE = "/var/lib/wmsa/archive/tfreq-2022-04-04.bin";
public static final String ARCHIVE_PATH = "/var/lib/wmsa/archive/webpage"; // "/mnt/storage/wmsa/archive/webpage/"
@SneakyThrows
public static void main(String... args) {
List<Thread> pt = new ArrayList<>();
for (int i = 0; i < 20; i++) {
pt.add(new Thread(TermFrequencyCounterMain::processorThread));
}
pt.forEach(Thread::start);
AtomicLong docsTotal = new AtomicLong();
new ArchiveExtractor(Path.of(ARCHIVE_PATH)).forEach(
page -> {
if (page.contentType.contentType.contains("html")
&& page.isAfter("2022-03-15T")) {
try {
long dt = docsTotal.incrementAndGet();
if (dt == 0) {
System.out.println(docsTotal.get() + " - " + termFreq.size());
}
if ((dt % 5) != 0) {
processQueue.put(page);
}
} catch (InterruptedException e) {
e.printStackTrace();
}
}
});
running = false;
System.out.println("Waiting for wrap-up");
Thread.sleep(36000);
for (Thread thread : pt) {
thread.interrupt();
}
for (Thread thread : pt) {
thread.join();
}
System.out.println("Total documents = " + docsTotal.get());
System.out.println("Writing Frequencies");
try (var dos = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(OUTPUT_FILE)))
) {
synchronized (termFreq) {
for (var entry : termFreq.entrySet()) {
if (entry.getValue() > 5) {
dos.writeLong(entry.getKey());
dos.writeLong(entry.getValue());
}
}
}
} catch (IOException e) {
e.printStackTrace();
}
System.out.println("All done!");
}
public static final ConcurrentHashMap<Long, Integer> termFreq = new ConcurrentHashMap<>();
public static final LanguageModels lm = new LanguageModels(
Path.of("/var/lib/wmsa/model/ngrams-generous-emstr.bin"),
Path.of("/var/lib/wmsa/model/tfreq-generous-emstr.bin"),
Path.of("/var/lib/wmsa/model/opennlp-sentence.bin"),
Path.of("/var/lib/wmsa/model/English.RDR"),
Path.of("/var/lib/wmsa/model/English.DICT"),
Path.of("/var/lib/wmsa/model/opennlp-tok.bin")
);
public static volatile boolean running = true;
public static void processorThread() {
var ke = new KeywordExtractor();
var se = new SentenceExtractor(lm);
var ps = new PorterStemmer();
try {
TLongHashSet words = new TLongHashSet(10000);
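// The set dedupes term hashes within a single sentence; termFreq is
// bumped once per distinct term per sentence.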
while (running || !processQueue.isEmpty()) {
var job = processQueue.take();
var sentence = se.extractSentences(Jsoup.parse(job.data));
for (var sent : sentence.sentences) {
var keywords = ke.getKeywordsFromSentence(sent);
for (int i = 0; i < keywords.length; i++) {
if (keywords[i].size() > 1) {
words.add(NGramDict.longHash(sent.constructStemmedWordFromSpan(keywords[i]).getBytes()));
}
}
for (String word : sent.wordsLowerCase) {
words.add(NGramDict.longHash(ps.stem(word).getBytes()));
}
words.forEach(l -> {
termFreq.merge(l, 1, Integer::sum);
return true;
});
words.clear();
}
}
}
catch (InterruptedException ex) {
ex.printStackTrace();
}
}
}

View file

@ -1,211 +0,0 @@
package nu.marginalia.wmsa.edge.tools;
import lombok.AllArgsConstructor;
import lombok.SneakyThrows;
import nu.marginalia.wmsa.edge.archive.client.ArchiveClient;
import nu.marginalia.wmsa.edge.assistant.dict.NGramDict;
import nu.marginalia.wmsa.edge.assistant.dict.WikiCleaner;
import nu.marginalia.util.language.conf.LanguageModels;
import org.jsoup.Jsoup;
import org.openzim.ZIMTypes.ZIMFile;
import org.openzim.ZIMTypes.ZIMReader;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.PrintWriter;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Map;
import java.util.Objects;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.LinkedBlockingQueue;
public class ZimConverterMain {
static final LinkedBlockingQueue<ConversionJob> jobQueue = new LinkedBlockingQueue<>(100);
static final LinkedBlockingQueue<String> analysisQueue = new LinkedBlockingQueue<>(100);
static boolean hasData = true;
static final ArchiveClient archiveClient = new ArchiveClient();
static NGramDict dict = new NGramDict(new LanguageModels(
Path.of("/var/lib/wmsa/model/ngrams-generous-emstr.bin"),
Path.of("/var/lib/wmsa/model/tfreq-generous-emstr.bin"),
Path.of("/var/lib/wmsa/model/opennlp-sentence.bin"),
Path.of("/var/lib/wmsa/model/English.RDR"),
Path.of("/var/lib/wmsa/model/English.DICT"),
Path.of("/var/lib/wmsa/model/opennlp-tok.bin")
)
);
public void extractUrlList() throws IOException {
var zr = new ZIMReader(new ZIMFile("/home/vlofgren/Work/wikipedia_en_all_nopic_2021-01.zim"));
var urlList = zr.getURLListByURL();
try (PrintWriter pw = new PrintWriter(new FileOutputStream("/home/vlofgren/Work/wikiTitlesAndRedirects.sql"))) {
zr.forEachTitles(
ae -> {
pw.printf("INSERT INTO REF_WIKI_TITLE(NAME) VALUES (\"%s\");\n", ae.getUrl().replace("\\", "\\\\").replace("\"", "\\\""));
},
re -> {
pw.printf("INSERT INTO REF_WIKI_TITLE(NAME, REF_NAME) VALUES (\"%s\",\"%s\");\n", re.getUrl().replace("\\", "\\\\").replace("\"", "\\\""), urlList.get(re.getRedirectIndex()).replace("\\", "\\\\").replace("\"", "\\\""));
}
);
}
}
public static void main(String[] args) throws IOException {
// convertJust("Aleph_number");
// convertJust("FloydSteinberg_dithering");
// convertJust("Laplace's_equation");
// convertJust("John_Fahey");
// convertJust("Plotinus");
// convertJust("C++");
convertAll(args);
archiveClient.close();
}
@SneakyThrows
private static void convertJust(String url) {
String newData = new WikiCleaner().cleanWikiJunk("https://en.wikipedia.org/wiki/" + url,
Files.readString(Path.of("/home/vlofgren/Work/wiki-convert/", "in-" + url + ".html")));
Files.writeString(Path.of("/home/vlofgren/Work/wiki-convert/", "out-" + url + ".html"), newData);
}
private static void extractOne(String which, int clusterId) throws IOException {
// var zr = new ZIMReader(new ZIMFile(args[1]));
var zr = new ZIMReader(new ZIMFile("/home/vlofgren/Work/wikipedia_en_all_nopic_2021-01.zim"));
int[] cluster = new int[] { clusterId };
if (clusterId == -1) {
zr.forEachTitles(ae -> {
if (ae.getUrl().equals(which)) {
System.err.print(ae.getUrl() + " " + ae.getClusterNumber());
cluster[0] = ae.getClusterNumber();
}
}, re -> {
});
}
System.err.println("Extracting cluster " + cluster[0] );
if (cluster[0] == -1) {
return;
}
zr.forEachArticles((url, art) -> {
if (art != null) {
if (which.equals(url)) {
try {
Files.writeString(Path.of("/home/vlofgren/Work/wiki-convert/","in-" + url + ".html"), art);
String newData = new WikiCleaner().cleanWikiJunk("https://en.wikipedia.org/wiki/" + url, art);
Files.writeString(Path.of("/home/vlofgren/Work/wiki-convert/", "out-" + url + ".html"), newData);
} catch (IOException e) {
e.printStackTrace();
}
}
scheduleJob(url, art);
}
}, p -> p == cluster[0]);
}
private static void convertAll(String[] args) throws IOException {
archiveClient.setServiceRoute("127.0.0.1", Integer.parseInt(args[0]));
var zr = new ZIMReader(new ZIMFile(args[1]));
// var zr = new ZIMReader(new ZIMFile("/home/vlofgren/Work/wikipedia_en_all_nopic_2021-01.zim"));
for (int i = 0; i < 8; i++) {
Thread t = new Thread(ZimConverterMain::jobExecutor);
t.setName("Converter");
t.start();
Thread t2 = new Thread(() -> {
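// Analysis is currently disabled; this thread only drains analysisQueue
// so the converters never block on a full queue.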
for (; ; ) {
String pt;
try {
pt = analysisQueue.take();
} catch (InterruptedException e) {
e.printStackTrace();
return;
}
// var topic = new TopicWordExtractor().extractWords(pt);
// var words = new NGramTextRankExtractor(dict, topic).extractWords(Collections.emptyList(), pt);
// System.out.println(Strings.join(words, ','));
}
});
t2.setName("Analysis");
t2.start();
}
zr.forEachArticles((url, art) -> {
if (art != null) {
scheduleJob(url, art);
}
}, p -> true);
// Signal the executors to wind down once the queue drains; the client
// is closed by main().
hasData = false;
}
@SneakyThrows
private static void jobExecutor() {
while (hasData || !jobQueue.isEmpty()) {
var job = jobQueue.take();
try {
job.convert();
}
catch (Exception ex) {
System.err.println("Error in " + job.url);
ex.printStackTrace();
}
}
}
@SneakyThrows
private static void scheduleJob(String url, String art) {
jobQueue.put(new ConversionJob(art, url));
}
static final Map<Long, Integer> wordCount = new ConcurrentHashMap<>();
static boolean isKeyword(String word) {
// A term is promoted the `limit`:th time it is seen: twice for plain
// words, once for multi-word (underscore-joined) spans. The original
// chain of ifs always overwrote the 100_000 default.
long n = word.chars().filter(c -> c == '_').count();
int limit = (n == 0) ? 2 : 1;
long c = word.chars().filter(ch -> ch >= 'a' && ch <= 'z').count();
if (c-2 <= n) {
return false;
}
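// Combine two independent hashes into one 64-bit key to make accidental
// collisions in the counting map unlikely.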
int hashA = word.hashCode();
int hashB = Objects.hash(n, c, word.length(), word.charAt(0));
long hash = (long) hashA + ((long) hashB << 32);
return wordCount.compute(hash, (k, v) -> v == null ? 1 : v+1) == limit;
}
@AllArgsConstructor
private static class ConversionJob {
private final String data;
private final String url;
public void convert() throws InterruptedException {
var page = new WikiCleaner().cleanWikiJunk("https://en.wikipedia.org/wiki/" + url, data);
String pt = Jsoup.parse(page).text();
analysisQueue.put(pt);
/*
String newData = new WikiCleaner().cleanWikiJunk("https://en.wikipedia.org/wiki/" + url, data);
if (null != newData) {
archiveClient.submitWiki(Context.internal(), url, newData)
.retry(5)
.blockingSubscribe();
}*/
}
}
}

View file

@ -0,0 +1,34 @@
package nu.marginalia.wmsa.encyclopedia;
import io.reactivex.rxjava3.core.Observable;
import nu.marginalia.wmsa.client.AbstractDynamicClient;
import nu.marginalia.wmsa.client.HttpStatusCode;
import nu.marginalia.wmsa.configuration.ServiceDescriptor;
import nu.marginalia.wmsa.configuration.server.Context;
import nu.marginalia.wmsa.edge.assistant.dict.WikiArticles;
import okhttp3.MediaType;
import org.eclipse.jetty.util.UrlEncoded;
import javax.annotation.CheckReturnValue;
public class EncyclopediaClient extends AbstractDynamicClient {
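// HTTP client for the encyclopedia service: wiki submission, existence
// checks, and encyclopedia term lookup.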
public EncyclopediaClient() {
super(ServiceDescriptor.ENCYCLOPEDIA);
}
@CheckReturnValue
public Observable<HttpStatusCode> submitWiki(Context ctx, String url, String data) {
return super.post(ctx, "/wiki/submit?url="+UrlEncoded.encodeString(url), data, MediaType.parse("text/plain; charset=UTF-8"));
}
@CheckReturnValue
public Observable<Boolean> hasWiki(Context ctx, String url) {
return super.get(ctx, "/wiki/has?url="+ UrlEncoded.encodeString(url), Boolean.class);
}
@CheckReturnValue
public Observable<WikiArticles> encyclopediaLookup(Context ctx, String word) {
return super.get(ctx,"/encyclopedia/" + UrlEncoded.encodeString(word), WikiArticles.class);
}
}

View file

@ -0,0 +1,160 @@
package nu.marginalia.wmsa.encyclopedia;
import com.google.inject.Inject;
import com.zaxxer.hikari.HikariDataSource;
import nu.marginalia.wmsa.edge.assistant.dict.WikiArticles;
import nu.marginalia.wmsa.edge.assistant.dict.WikiSearchResult;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.*;
import java.util.stream.Collectors;
public class EncyclopediaDao {
private HikariDataSource dataSource;
private static final Logger logger = LoggerFactory.getLogger(EncyclopediaDao.class);
@Inject
public EncyclopediaDao(HikariDataSource dataSource) {
this.dataSource = dataSource;
}
public WikiArticles encyclopedia(String term) {
WikiArticles response = new WikiArticles();
response.entries = new ArrayList<>();
try (var connection = dataSource.getConnection();
var stmt = connection.prepareStatement("SELECT DISTINCT(NAME_LOWER) FROM REF_WIKI_TITLE WHERE NAME_LOWER=?")) {
stmt.setString(1, term);
var rsp = stmt.executeQuery();
while (rsp.next()) {
response.entries.add(capitalizeWikiString(rsp.getString(1)));
}
}
catch (Exception ex) {
logger.error("Failed to fetch articles", ex);
return new WikiArticles();
}
return response;
}
public Optional<String> resolveEncylopediaRedirect(String term) {
final List<String> matches = new ArrayList<>();
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement("SELECT NAME, REF_NAME FROM REF_WIKI_TITLE WHERE NAME_LOWER=LOWER(?)")) {
stmt.setString(1, term);
var rsp = stmt.executeQuery();
while (rsp.next()) {
if (term.equals(rsp.getString(1))
|| rsp.getString(2) == null) {
return Optional.ofNullable(rsp.getString(2));
} else {
matches.add(rsp.getString(2));
}
}
}
}
catch (Exception ex) {
throw new RuntimeException(ex);
}
if (!matches.isEmpty()) {
return Optional.of(matches.get(0));
}
return Optional.empty();
}
public Optional<WikiSearchResult> findEncyclopediaPageDirect(String term) {
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement("SELECT NAME, REF_NAME FROM REF_WIKI_TITLE WHERE NAME_LOWER=LOWER(?)")) {
stmt.setString(1, term.replace(' ', '_'));
var rsp = stmt.executeQuery();
while (rsp.next()) {
String name = rsp.getString(1);
String refName = rsp.getString(2);
if (refName == null) {
return Optional.of(new WikiSearchResult(name, null));
}
}
}
}
catch (Exception ex) {
throw new RuntimeException(ex);
}
return Optional.empty();
}
public List<WikiSearchResult> findEncyclopediaPages(String term) {
final List<WikiSearchResult> directMatches = new ArrayList<>();
final Set<WikiSearchResult> directSearchMatches = new HashSet<>();
final Set<WikiSearchResult> indirectMatches = new HashSet<>();
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement("SELECT NAME, REF_NAME FROM REF_WIKI_TITLE WHERE NAME_LOWER=LOWER(?)")) {
stmt.setString(1, term.replace(' ', '_'));
var rsp = stmt.executeQuery();
while (rsp.next()) {
String name = rsp.getString(1);
String refName = rsp.getString(2);
if (refName == null) {
directMatches.add(new WikiSearchResult(name, null));
} else {
indirectMatches.add(new WikiSearchResult(name, refName));
}
}
}
try (var stmt = connection.prepareStatement("SELECT NAME, REF_NAME FROM REF_WIKI_TITLE WHERE NAME_LOWER LIKE ? LIMIT 10")) {
// Escape literal '%' for SQL LIKE. String.replace is literal; the previous
// replaceAll("%", "\\%") collapsed to a bare '%' because backslash is an
// escape character in regex replacement strings.
stmt.setString(1, term.replace(' ', '_').replace("%", "\\%").toLowerCase() + "%");
var rsp = stmt.executeQuery();
while (rsp.next()) {
String name = rsp.getString(1);
String refName = rsp.getString(2);
if (refName == null) {
directSearchMatches.add(new WikiSearchResult(name, null));
} else {
indirectMatches.add(new WikiSearchResult(name, refName));
}
}
}
}
catch (Exception ex) {
throw new RuntimeException(ex);
}
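// Merge priority: exact-name matches first, then redirects, then prefix
// matches, with duplicates removed across the three sets.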
directMatches.forEach(indirectMatches::remove);
indirectMatches.removeAll(directSearchMatches);
directMatches.forEach(directSearchMatches::remove);
directMatches.addAll(indirectMatches);
directMatches.addAll(directSearchMatches);
return directMatches;
}
private String capitalizeWikiString(String string) {
if (string.contains("_")) {
return Arrays.stream(string.split("_")).map(this::capitalizeWikiString).collect(Collectors.joining("_"));
}
if (string.length() < 2) {
return string.toUpperCase();
}
return Character.toUpperCase(string.charAt(0)) + string.substring(1).toLowerCase();
}
}

View file

@ -0,0 +1,29 @@
package nu.marginalia.wmsa.encyclopedia;
import com.google.inject.Guice;
import com.google.inject.Inject;
import com.google.inject.Injector;
import nu.marginalia.wmsa.configuration.MainClass;
import nu.marginalia.wmsa.configuration.ServiceDescriptor;
import nu.marginalia.wmsa.configuration.module.ConfigurationModule;
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
public class EncyclopediaMain extends MainClass {
private final EncyclopediaService service;
public static void main(String... args) {
init(ServiceDescriptor.ENCYCLOPEDIA, args);
Injector injector = Guice.createInjector(
new EncyclopediaModule(),
new DatabaseModule(),
new ConfigurationModule());
injector.getInstance(EncyclopediaMain.class);
}
@Inject
public EncyclopediaMain(EncyclopediaService service) {
this.service = service;
}
}

View file

@ -0,0 +1,18 @@
package nu.marginalia.wmsa.encyclopedia;
import com.google.inject.AbstractModule;
import com.google.inject.name.Names;
import lombok.SneakyThrows;
import nu.marginalia.wmsa.configuration.WmsaHome;
import java.nio.file.Path;
public class EncyclopediaModule extends AbstractModule {
@SneakyThrows
@Override
public void configure() {
bind(Path.class)
.annotatedWith(Names.named("wiki-path"))
.toInstance(WmsaHome.getDisk("encyclopedia"));
}
}

View file

@ -0,0 +1,209 @@
package nu.marginalia.wmsa.encyclopedia;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import com.google.inject.Inject;
import com.google.inject.name.Named;
import lombok.SneakyThrows;
import nu.marginalia.wmsa.configuration.server.Context;
import nu.marginalia.wmsa.configuration.server.Initialization;
import nu.marginalia.wmsa.configuration.server.MetricsServer;
import nu.marginalia.wmsa.configuration.server.Service;
import nu.marginalia.wmsa.renderer.mustache.MustacheRenderer;
import nu.marginalia.wmsa.renderer.mustache.RendererFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import spark.Request;
import spark.Response;
import spark.Spark;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Map;
import java.util.Optional;
import java.util.stream.Collectors;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;
public class EncyclopediaService extends Service {
private static final Logger logger = LoggerFactory.getLogger(EncyclopediaService.class);
private final MustacheRenderer<String> wikiErrorPageRenderer;
private final MustacheRenderer<Object> wikiSearchResultRenderer;
private final Gson gson = new GsonBuilder().create();
private Path wikiPath;
private EncyclopediaDao encyclopediaDao;
@Inject
public EncyclopediaService(@Named("service-host") String ip,
@Named("service-port") Integer port,
@Named("wiki-path") Path wikiPath,
EncyclopediaDao encyclopediaDao,
RendererFactory rendererFactory,
Initialization initialization,
MetricsServer metricsServer)
throws IOException {
super(ip, port, initialization, metricsServer);
this.wikiPath = wikiPath;
this.encyclopediaDao = encyclopediaDao;
if (rendererFactory != null) {
wikiErrorPageRenderer = rendererFactory.renderer("encyclopedia/wiki-error");
wikiSearchResultRenderer = rendererFactory.renderer("encyclopedia/wiki-search");
}
else {
wikiErrorPageRenderer = null;
wikiSearchResultRenderer = null;
}
Spark.get("/public/wiki/*", this::getWikiPage);
Spark.get("/public/wiki-search", this::searchWikiPage);
Spark.get("/wiki/has", this::pathWikiHas);
Spark.post("/wiki/submit", this::pathWikiSubmit);
Spark.get("/encyclopedia/:term", (rq, rsp) -> encyclopediaDao.encyclopedia(rq.params("term")), gson::toJson);
Spark.awaitInitialization();
}
@SneakyThrows
private Object getWikiPage(Request req, Response rsp) {
final String[] splats = req.splat();
if (splats.length == 0) {
// Redirect and return; falling through would throw on splats[0].
rsp.redirect("https://encyclopedia.marginalia.nu/wiki-start.html");
return "";
}
final String name = splats[0];
String pageName = encyclopediaDao.resolveEncyclopediaRedirect(name).orElse(name);
logger.info("Resolved {} -> {}", name, pageName);
return wikiGet(pageName)
.or(() -> resolveWikiPageNameWrongCase(name))
.orElseGet(() -> renderSearchPage(name));
}
private Optional<String> resolveWikiPageNameWrongCase(String name) {
var rsp = encyclopediaDao.findEncyclopediaPageDirect(name);
if (rsp.isEmpty()) {
return Optional.of(renderSearchPage(name));
}
name = rsp.get().getInternalName();
return wikiGet(name);
}
private String renderSearchPage(String s) {
return wikiSearchResultRenderer.render(
Map.of("query", s,
"error", "true",
"results", encyclopediaDao.findEncyclopediaPages(s)));
}
@SneakyThrows
private Object searchWikiPage(Request req, Response rsp) {
String term = req.queryParams("query");
if (null == term) {
rsp.redirect("https://encyclopedia.marginalia.nu/wiki-start.html");
return "";
}
return wikiSearchResultRenderer.render(
Map.of("query", term,
"results",
encyclopediaDao.findEncyclopediaPages(term))
);
}
private Path getWikiFilename(Path base, String url) {
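// Spread articles over a four-level directory tree keyed on successive
// bytes of the URL hash (256-way fan-out per level), then store the
// gzip-compressed page under a filesystem-safe encoding of the URL.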
Path p = base;
int urlHash = url.hashCode();
p = p.resolve(Integer.toString(urlHash & 0xFF));
p = p.resolve(Integer.toString((urlHash>>>8) & 0xFF));
p = p.resolve(Integer.toString((urlHash>>>16) & 0xFF));
p = p.resolve(Integer.toString((urlHash>>>24) & 0xFF));
String fileName = url.chars()
.mapToObj(this::encodeUrlChar)
.collect(Collectors.joining());
if (fileName.length() > 128) {
fileName = fileName.substring(0, 128) + (((long)urlHash)&0xFFFFFFFFL);
}
return p.resolve(fileName + ".gz");
}
private String encodeUrlChar(int i) {
if (i >= 'a' && i <= 'z') {
return Character.toString(i);
}
if (i >= 'A' && i <= 'Z') {
return Character.toString(i);
}
if (i >= '0' && i <= '9') {
return Character.toString(i);
}
if (i == '.') {
return Character.toString(i);
}
else {
// %02X zero-pads the code point; the original "%2X" space-padded
// values below 0x10, yielding malformed escapes like "% A".
return String.format("%%%02X", i);
}
}
@SneakyThrows
private Object pathWikiHas(Request request, Response response) {
return Files.exists(getWikiFilename(wikiPath, request.queryParams("url")));
}
@SneakyThrows
private Optional<String> wikiGet(String name) {
var filename = getWikiFilename(wikiPath, name);
if (Files.exists(filename)) {
try (var stream = new GZIPInputStream(new FileInputStream(filename.toFile()))) {
return Optional.of(new String(stream.readAllBytes()));
}
} else {
return Optional.empty();
}
}
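// Stores the submitted article gzip-compressed at the hashed path;
// /wiki/has and page lookups recompute the same path to find it.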
@SneakyThrows
private Object pathWikiSubmit(Request request, Response response) {
byte[] data = request.bodyAsBytes();
String wikiUrl = request.queryParams("url");
Path filename = getWikiFilename(wikiPath, wikiUrl);
Files.createDirectories(filename.getParent());
logger.debug("Writing {} to {}", wikiUrl, filename);
try (var gos = new GZIPOutputStream(new FileOutputStream(filename.toFile()))) {
gos.write(data);
gos.flush();
}
return "ok";
}
}

View file

@ -254,4 +254,29 @@ CREATE INDEX IF NOT EXISTS EC_DOMAIN_TRIO ON EC_DOMAIN (STATE, DOMAIN_ALIAS, IND
CREATE INDEX IF NOT EXISTS EC_URL_VISITED ON EC_URL (VISITED);
CREATE INDEX IF NOT EXISTS EC_URL_VISITED_STATE ON EC_URL (VISITED, STATE);
CREATE INDEX IF NOT EXISTS EC_URL_IP ON EC_URL (IP);
---;
DROP TABLE IF EXISTS REF_DICTIONARY;
CREATE TABLE IF NOT EXISTS REF_DICTIONARY(
TYPE VARCHAR(16),
WORD VARCHAR(255),
DEFINITION VARCHAR(255)
)
CHARACTER SET utf8mb4
COLLATE utf8mb4_unicode_ci;
CREATE INDEX IF NOT EXISTS REF_DICTIONARY_WORD ON REF_DICTIONARY (WORD);
CREATE TABLE IF NOT EXISTS REF_WIKI_TITLE (
NAME VARCHAR(255),
NAME_LOWER VARCHAR(255) GENERATED ALWAYS AS (LOWER(NAME)),
REF_NAME VARCHAR(255)
)
CHARACTER SET utf8mb4
COLLATE utf8mb4_unicode_ci;
CREATE INDEX IF NOT EXISTS REF_WIKI_LOWER ON REF_WIKI_TITLE (NAME_LOWER);
CREATE INDEX IF NOT EXISTS REF_WIKI_NAME ON REF_WIKI_TITLE (NAME);

View file

@ -18,6 +18,5 @@ CREATE TABLE IF NOT EXISTS REF_WIKI_TITLE(
CHARACTER SET utf8mb4
COLLATE utf8mb4_unicode_ci;
CREATE INDEX IF NOT EXISTS REF_WIKI_LOWER ON REF_WIKI_TITLE (NAME_LOWER);
CREATE INDEX IF NOT EXISTS REF_WIKI_NAME ON REF_WIKI_TITLE (NAME);

View file

@ -50,8 +50,6 @@ class ServiceTest {
new DictionaryService(dataSource, new SpellChecker()),
new MathParser(),
new Units(new MathParser()),
null,
null,
new ScreenshotService(null), null);
Spark.awaitInitialization();

View file

@ -1,72 +0,0 @@
package nu.marginalia.wmsa.edge.archive;
import lombok.SneakyThrows;
import nu.marginalia.wmsa.configuration.server.Context;
import nu.marginalia.wmsa.configuration.server.Initialization;
import nu.marginalia.wmsa.edge.archive.archiver.Archiver;
import nu.marginalia.wmsa.edge.archive.client.ArchiveClient;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.parallel.Execution;
import org.junit.jupiter.api.parallel.ExecutionMode;
import spark.Spark;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import static nu.marginalia.util.TestUtil.getPort;
import static nu.marginalia.util.test.TestUtil.clearTempDir;
@Execution(ExecutionMode.SAME_THREAD)
public class ArchiveTest {
static EdgeArchiveService service;
static final int testPort = getPort();
private static Path tempPath;
private static Path tempPath2;
private static ArchiveClient archiveClient;
private static Archiver archiver;
@BeforeAll
public static void setUpClass() throws IOException {
Spark.port(testPort);
System.setProperty("service-name", "edge-archive");
archiveClient = new ArchiveClient();
archiveClient.setServiceRoute("127.0.0.1", testPort);
tempPath = Files.createTempDirectory("archiveTest");
tempPath2 = Files.createTempDirectory("wikiTest");
archiver = new Archiver(tempPath, 10);
service = new EdgeArchiveService("127.0.0.1", testPort,
tempPath,
archiver,
new Initialization(), null);
Spark.awaitInitialization();
}
@AfterAll
public static void tearDown() throws Exception {
archiver.close();
archiveClient.close();
clearTempDir(tempPath);
clearTempDir(tempPath2);
}
@SneakyThrows
@Test
public void testWiki() {
var url = "Plato_(Disambiguation)";
Assertions.assertFalse(archiveClient.hasWiki(Context.internal(), url).blockingFirst());
archiveClient.submitWiki(Context.internal(), url, "<h1>Hello</h1>").blockingFirst();
Assertions.assertTrue(archiveClient.hasWiki(Context.internal(), url).blockingFirst());
Assertions.assertEquals("<h1>Hello</h1>", archiveClient.getWiki(Context.internal(), url).blockingFirst());
}
}

View file

@ -1,17 +0,0 @@
package nu.marginalia.wmsa.edge.archive.archiver;
import org.junit.jupiter.api.*;
import java.nio.file.Path;
public class ArchiverTest {
@Test
public void testArchiver() throws Exception {
Archiver archiver = new Archiver(Path.of("/tmp/"), 3);
archiver.writeData(new ArchivedFile("file1", "Hey".getBytes()));
archiver.writeData(new ArchivedFile("file2", "Hey".getBytes()));
archiver.writeData(new ArchivedFile("file3", "Hey".getBytes()));
archiver.writeData(new ArchivedFile("file4", "Hey".getBytes()));
archiver.close();
}
}

View file

@ -60,7 +60,6 @@ class AssistantTest {
new DictionaryService(dataSource, new SpellChecker()),
new MathParser(),
new Units(new MathParser()),
null, null,
new ScreenshotService(null), null);
Spark.awaitInitialization();
@ -77,12 +76,6 @@ class AssistantTest {
Spark.awaitStop();
}
@Test
public void testEncyclopedia() {
var result = client.encyclopediaLookup(Context.internal(), "plato").blockingFirst();
System.out.println(result);
assertTrue(result.entries.size() >= 1);
}
@Test
public void testSpellCheck() {
var result = client.spellCheck(Context.internal(), "plato").blockingFirst();