Rewrote Encyclopedia loader, added functioning E2E test for new encyclopedia service

vlofgren 2022-05-28 13:51:29 +02:00
parent ad4521da9e
commit ac9064096d
10 changed files with 206 additions and 219 deletions

View file

@@ -2,7 +2,6 @@ package nu.marginalia.wmsa.edge;
import nu.marginalia.util.test.TestUtil;
import nu.marginalia.wmsa.configuration.ServiceDescriptor;
import nu.marginalia.wmsa.edge.crawling.CrawlJobExtractorMain;
import org.jsoup.Jsoup;
import org.junit.jupiter.api.Tag;
@@ -19,7 +18,6 @@ import org.testcontainers.junit.jupiter.Container;
import org.testcontainers.junit.jupiter.Testcontainers;
import org.testcontainers.utility.MountableFile;
import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
@@ -28,7 +26,6 @@ import java.util.ArrayList;
import java.util.List;
import static nu.marginalia.wmsa.configuration.ServiceDescriptor.*;
import static org.testcontainers.containers.BrowserWebDriverContainer.VncRecordingMode.RECORD_ALL;
@Tag("e2e")
@Testcontainers

View file

@@ -3,11 +3,21 @@ package nu.marginalia.wmsa.edge;
import org.junit.jupiter.api.Tag;
import org.junit.jupiter.api.Test;
import org.openqa.selenium.By;
import org.openqa.selenium.chrome.ChromeOptions;
import org.slf4j.LoggerFactory;
import org.testcontainers.containers.BindMode;
import org.testcontainers.containers.BrowserWebDriverContainer;
import org.testcontainers.containers.GenericContainer;
import org.testcontainers.containers.MariaDBContainer;
import org.testcontainers.containers.Network;
import org.testcontainers.containers.NginxContainer;
import org.testcontainers.containers.output.Slf4jLogConsumer;
import org.testcontainers.containers.wait.strategy.Wait;
import org.testcontainers.junit.jupiter.Container;
import org.testcontainers.junit.jupiter.Testcontainers;
import org.testcontainers.utility.MountableFile;
import java.nio.file.Path;
import java.time.Duration;
import static nu.marginalia.wmsa.configuration.ServiceDescriptor.ENCYCLOPEDIA;
@@ -19,9 +29,42 @@ public class EncyclopediaE2ETest extends E2ETestBase {
    @Container
    public GenericContainer<?> encyclopediaContainer = forService(ENCYCLOPEDIA, mariaDB);

    @Container
    public GenericContainer<?> encyclopediaLoader = new GenericContainer<>("openjdk:17-alpine")
            .dependsOn(encyclopediaContainer)
            .dependsOn(mariaDB)
            .withNetwork(network)
            .withLogConsumer(new Slf4jLogConsumer(LoggerFactory.getLogger("encyclopedia-loader")))
            .withCopyFileToContainer(jarFile(), "/WMSA.jar")
            .withCopyFileToContainer(MountableFile.forClasspathResource("load-encyclopedia.sh"), "/load-encyclopedia.sh")
            .withFileSystemBind(getModelData().toString(), "/data", BindMode.READ_ONLY)
            .withCommand("sh", "load-encyclopedia.sh")
            .waitingFor(Wait.forLogMessage(".*ALL DONE.*", 1).withStartupTimeout(Duration.ofMinutes(10)));

    @Container
    public NginxContainer<?> proxyNginx = new NginxContainer<>("nginx:stable")
            .dependsOn(encyclopediaLoader)
            .dependsOn(encyclopediaContainer)
            .withLogConsumer(new Slf4jLogConsumer(LoggerFactory.getLogger("nginx")))
            .withCopyFileToContainer(MountableFile.forClasspathResource("nginx/encyclopedia.conf"), "/etc/nginx/conf.d/default.conf")
            .withNetwork(network)
            .withNetworkAliases("proxyNginx");

    @Container
    public BrowserWebDriverContainer<?> chrome = new BrowserWebDriverContainer<>()
            .withNetwork(network)
            .withCapabilities(new ChromeOptions());

    private Path getModelData() {
        return Path.of(System.getProperty("user.dir")).resolve("data/test");
    }

    @Test
    public void run() {
        var driver = chrome.getWebDriver();

        driver.get("http://proxyNginx/wiki/Frog");
        System.out.println(driver.getTitle());
        System.out.println(driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML"));
    }
}
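
The test as committed only prints the proxied page; it passes as long as the loader finishes and the page is reachable through nginx. A minimal sketch of a stronger check that could sit at the end of run() (hypothetical, assuming JUnit 5's org.junit.jupiter.api.Assertions is on the test classpath):

    // Hypothetical assertion sketch, not part of the commit: verify the
    // encyclopedia actually served a cleaned "Frog" article rather than an
    // nginx error page.
    Assertions.assertTrue(driver.getTitle().toLowerCase().contains("frog"),
            "Expected the proxied /wiki/Frog page to have a Frog title");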

View file

@@ -0,0 +1,32 @@
#!/bin/bash

mkdir -p /var/lib/wmsa/conf/
mkdir -p /var/lib/wmsa/data/
mkdir -p /data

cat > /var/lib/wmsa/conf/db.properties <<EOF
db.user=wmsa
db.pass=wmsa
db.conn=jdbc:mariadb://mariadb:3306/WMSA_prod?rewriteBatchedStatements=true
EOF

cat > /var/lib/wmsa/conf/hosts <<EOF
# service-name host-name
resource-store resource-store
renderer renderer
auth auth
api api
smhi-scraper smhi-scraper
podcast-scraper podcast-scraper
edge-index edge-index
edge-search edge-search
encyclopedia encyclopedia
edge-assistant edge-assistant
memex memex
dating dating
EOF

java -cp WMSA.jar nu.marginalia.wmsa.edge.tools.EncyclopediaLoaderTool data/wikipedia_en_100_nopic.zim

echo "ALL DONE"

View file

@@ -0,0 +1,40 @@
server {
    listen 80;
    listen [::]:80;
    server_name nginx;

    location /wiki/ {
        rewrite ^ $request_uri;
        rewrite ^/(.*) /public/$1 break;
        return 400;
        proxy_pass http://encyclopedia:5040$uri;
        proxy_set_header X-Context $remote_addr-$connection;
        proxy_set_header X-Public "1";
        proxy_set_header X-Extern-Url $scheme://$host$request_uri;
        proxy_set_header X-Extern-Domain $scheme://$host;
        proxy_set_header X-User-Agent $http_user_agent;
        tcp_nodelay on;
    }

    location /wiki-search {
        rewrite ^ $request_uri;
        rewrite ^/(.*) /public/$1 break;
        return 400;
        proxy_pass http://encyclopedia:5040$uri;
        proxy_set_header X-Context $remote_addr-$connection;
        proxy_set_header X-Public "1";
        proxy_set_header X-Extern-Url $scheme://$host$request_uri;
        proxy_set_header X-Extern-Domain $scheme://$host;
        proxy_set_header X-User-Agent $http_user_agent;
        tcp_nodelay on;
    }

    location / {
        proxy_pass http://encyclopedia:5040/;
        tcp_nodelay on;
    }
}
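
In effect, the rewrite pair in the /wiki/ and /wiki-search locations appears to map a request such as GET /wiki/Frog first back onto the raw $request_uri and then, with break, onto /public/wiki/Frog, so proxy_pass forwards it to http://encyclopedia:5040/public/wiki/Frog with the X-Public and context headers set; the return 400 should only be reached if the second rewrite fails to match.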

View file

@@ -0,0 +1,59 @@
package nu.marginalia.wmsa.edge.tools;

import nu.marginalia.wmsa.configuration.server.Context;
import nu.marginalia.wmsa.edge.assistant.dict.WikiCleaner;
import nu.marginalia.wmsa.encyclopedia.EncyclopediaClient;
import org.openzim.ZIMTypes.ZIMFile;
import org.openzim.ZIMTypes.ZIMReader;

import java.io.IOException;
import java.util.concurrent.*;

public class EncyclopediaLoaderTool {

    static final EncyclopediaClient encyclopediaClient = new EncyclopediaClient();

    public static void main(String[] args) throws IOException, InterruptedException {
        convertAll(args);
        encyclopediaClient.close();
        System.exit(0);
    }

    private static void convertAll(String[] args) throws IOException, InterruptedException {
        var zr = new ZIMReader(new ZIMFile(args[0]));

        var pool = Executors.newFixedThreadPool(8);
        var sem = new Semaphore(12);

        zr.forEachArticles((url, art) -> {
            if (art != null) {
                try {
                    sem.acquire();
                    pool.execute(() -> {
                        try {
                            convert(url, art);
                        } finally {
                            sem.release();
                        }
                    });
                } catch (InterruptedException e) {
                    throw new RuntimeException(e);
                }
            }
        }, p -> true);

        sem.acquire(12);
        encyclopediaClient.close();
    }

    private static void convert(String url, String art) {
        String newData = new WikiCleaner().cleanWikiJunk("https://en.wikipedia.org/wiki/" + url, art);

        if (null != newData) {
            encyclopediaClient.submitWiki(Context.internal(), url, newData)
                    .retry(5)
                    .blockingSubscribe();
        }
    }
}
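
The loader throttles itself with a Semaphore rather than an unbounded work queue: a permit is taken before each article is handed to the fixed pool, and the trailing sem.acquire(12) only returns once every in-flight conversion has released its permit, i.e. once all submitted work has finished. A self-contained sketch of that idiom (the class name and work() method are made up for illustration; the real code calls convert(url, art)):

    import java.util.concurrent.ExecutorService;
    import java.util.concurrent.Executors;
    import java.util.concurrent.Semaphore;

    class BoundedConversionSketch {
        public static void main(String[] args) throws InterruptedException {
            ExecutorService pool = Executors.newFixedThreadPool(8);
            Semaphore permits = new Semaphore(12);  // at most 12 conversions queued or running

            for (int i = 0; i < 100; i++) {
                permits.acquire();                  // back-pressure: blocks once 12 tasks are in flight
                int n = i;
                pool.execute(() -> {
                    try {
                        work(n);                    // stand-in for convert(url, art)
                    } finally {
                        permits.release();
                    }
                });
            }

            permits.acquire(12);                    // returns only when all submitted tasks are done
            pool.shutdown();
        }

        static void work(int n) { /* placeholder */ }
    }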

View file

@@ -1,211 +0,0 @@
package nu.marginalia.wmsa.edge.tools;

import lombok.AllArgsConstructor;
import lombok.SneakyThrows;
import nu.marginalia.wmsa.edge.assistant.dict.NGramDict;
import nu.marginalia.wmsa.edge.assistant.dict.WikiCleaner;
import nu.marginalia.util.language.conf.LanguageModels;
import nu.marginalia.wmsa.encyclopedia.EncyclopediaClient;
import org.jsoup.Jsoup;
import org.openzim.ZIMTypes.ZIMFile;
import org.openzim.ZIMTypes.ZIMReader;

import java.io.FileOutputStream;
import java.io.IOException;
import java.io.PrintWriter;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Map;
import java.util.Objects;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.LinkedBlockingQueue;

public class ZimConverterMain {

    static final LinkedBlockingQueue<ConversionJob> jobQueue = new LinkedBlockingQueue<>(100);
    static final LinkedBlockingQueue<String> analysisQueue = new LinkedBlockingQueue<>(100);
    static boolean hasData = true;
    static final EncyclopediaClient encyclopediaClient = new EncyclopediaClient();
    static NGramDict dict = new NGramDict(new LanguageModels(
            Path.of("/var/lib/wmsa/model/ngrams-generous-emstr.bin"),
            Path.of("/var/lib/wmsa/model/tfreq-generous-emstr.bin"),
            Path.of("/var/lib/wmsa/model/opennlp-sentence.bin"),
            Path.of("/var/lib/wmsa/model/English.RDR"),
            Path.of("/var/lib/wmsa/model/English.DICT"),
            Path.of("/var/lib/wmsa/model/opennlp-tok.bin")
            )
    );

    public void extractUrlList() throws IOException {
        var zr = new ZIMReader(new ZIMFile("/home/vlofgren/Work/wikipedia_en_all_nopic_2021-01.zim"));

        var urlList = zr.getURLListByURL();

        try (PrintWriter pw = new PrintWriter(new FileOutputStream("/home/vlofgren/Work/wikiTitlesAndRedirects.sql"))) {
            zr.forEachTitles(
                    ae -> {
                        pw.printf("INSERT INTO REF_WIKI_TITLE(NAME) VALUES (\"%s\");\n", ae.getUrl().replace("\\", "\\\\").replace("\"", "\\\""));
                    },
                    re -> {
                        pw.printf("INSERT INTO REF_WIKI_TITLE(NAME, REF_NAME) VALUES (\"%s\",\"%s\");\n", re.getUrl().replace("\\", "\\\\").replace("\"", "\\\""), urlList.get(re.getRedirectIndex()).replace("\\", "\\\\").replace("\"", "\\\""));
                    }
            );
        }
    }

    public static void main(String[] args) throws IOException {
        // convertJust("Aleph_number");
        // convertJust("FloydSteinberg_dithering");
        // convertJust("Laplace's_equation");
        // convertJust("John_Fahey");
        // convertJust("Plotinus");
        // convertJust("C++");
        convertAll(args);
        encyclopediaClient.close();
    }

    @SneakyThrows
    private static void convertJust(String url) {
        String newData = new WikiCleaner().cleanWikiJunk("https://en.wikipedia.org/wiki/" + url,
                Files.readString(Path.of("/home/vlofgren/Work/wiki-convert/", "in-" + url + ".html")));
        Files.writeString(Path.of("/home/vlofgren/Work/wiki-convert/", "out-" + url + ".html"), newData);
    }

    private static void extractOne(String which, int clusterId) throws IOException {
        // var zr = new ZIMReader(new ZIMFile(args[1]));
        var zr = new ZIMReader(new ZIMFile("/home/vlofgren/Work/wikipedia_en_all_nopic_2021-01.zim"));

        int[] cluster = new int[] { clusterId };

        if (clusterId == -1) {
            zr.forEachTitles(ae -> {
                if (ae.getUrl().equals(which)) {
                    System.err.print(ae.getUrl() + " " + ae.getClusterNumber());
                    cluster[0] = ae.getClusterNumber();
                }
            }, re -> {
            });
        }

        System.err.println("Extracting cluster " + cluster[0]);

        if (cluster[0] == -1) {
            return;
        }

        zr.forEachArticles((url, art) -> {
            if (art != null) {
                if (which.equals(url)) {
                    try {
                        Files.writeString(Path.of("/home/vlofgren/Work/wiki-convert/", "in-" + url + ".html"), art);
                        String newData = new WikiCleaner().cleanWikiJunk("https://en.wikipedia.org/wiki/" + url, art);
                        Files.writeString(Path.of("/home/vlofgren/Work/wiki-convert/", "out-" + url + ".html"), newData);
                    } catch (IOException e) {
                        e.printStackTrace();
                    }
                }
                scheduleJob(url, art);
            }
        }, p -> p == cluster[0]);
    }

    private static void convertAll(String[] args) throws IOException {
        encyclopediaClient.setServiceRoute("127.0.0.1", Integer.parseInt(args[0]));
        var zr = new ZIMReader(new ZIMFile(args[1]));
        // var zr = new ZIMReader(new ZIMFile("/home/vlofgren/Work/wikipedia_en_all_nopic_2021-01.zim"));

        for (int i = 0; i < 8; i++) {
            Thread t = new Thread(ZimConverterMain::jobExecutor);
            t.setName("Converter");
            t.start();

            Thread t2 = new Thread(() -> {
                for (; ; ) {
                    String pt;
                    try {
                        pt = analysisQueue.take();
                    } catch (InterruptedException e) {
                        e.printStackTrace();
                        return;
                    }
                    // var topic = new TopicWordExtractor().extractWords(pt);
                    // var words = new NGramTextRankExtractor(dict, topic).extractWords(Collections.emptyList(), pt);
                    // System.out.println(Strings.join(words, ','));
                }
            });
            t2.setName("Analysis");
            t2.start();
        }

        zr.forEachArticles((url, art) -> {
            if (art != null) {
                scheduleJob(url, art);
            }
        }, p -> true);

        hasData = false;
        encyclopediaClient.close();
    }

    @SneakyThrows
    private static void jobExecutor() {
        while (hasData || !jobQueue.isEmpty()) {
            var job = jobQueue.take();
            try {
                job.convert();
            }
            catch (Exception ex) {
                System.err.println("Error in " + job.url);
                ex.printStackTrace();
            }
        }
    }

    @SneakyThrows
    private static void scheduleJob(String url, String art) {
        jobQueue.put(new ConversionJob(art, url));
    }

    static final Map<Long, Integer> wordCount = new ConcurrentHashMap<>();

    static boolean isKeyword(String word) {
        int limit = 100_000;

        long n = word.chars().filter(c -> c=='_').count();
        if (n == 0) limit = 2;
        if (n == 1) limit = 1;
        if (n == 2) limit = 1;
        if (n >= 3) limit = 1;

        long c = word.chars().filter(ch -> ch >= 'a' && ch <= 'z').count();
        if (c-2 <= n) {
            return false;
        }

        int hashA = word.hashCode();
        int hashB = Objects.hash(n, c, word.length(), word.charAt(0));
        long hash = (long) hashA + ((long) hashB << 32);

        return wordCount.compute(hash, (k, v) -> v == null ? 1 : v+1) == limit;
    }

    @AllArgsConstructor
    private static class ConversionJob {
        private final String data;
        private final String url;

        public void convert() throws InterruptedException {
            var page = new WikiCleaner().cleanWikiJunk("https://en.wikipedia.org/wiki/" + url, data);
            String pt = Jsoup.parse(page).text();
            analysisQueue.put(pt);

            /*
            String newData = new WikiCleaner().cleanWikiJunk("https://en.wikipedia.org/wiki/" + url, data);

            if (null != newData) {
                archiveClient.submitWiki(Context.internal(), url, newData)
                        .retry(5)
                        .blockingSubscribe();
            }*/
        }
    }
}

View file

@@ -6,6 +6,7 @@ import com.google.inject.Injector;
import nu.marginalia.wmsa.configuration.MainClass;
import nu.marginalia.wmsa.configuration.ServiceDescriptor;
import nu.marginalia.wmsa.configuration.module.ConfigurationModule;
import nu.marginalia.wmsa.configuration.module.DatabaseModule;

public class EncyclopediaMain extends MainClass {
    private final EncyclopediaService service;
@@ -15,6 +16,7 @@ public class EncyclopediaMain extends MainClass {
        Injector injector = Guice.createInjector(
                new EncyclopediaModule(),
                new DatabaseModule(),
                new ConfigurationModule());

        injector.getInstance(EncyclopediaMain.class);
    }

View file

@@ -62,6 +62,8 @@ public class EncyclopediaService extends Service {
        Spark.get("/wiki/has", this::pathWikiHas);
        Spark.post("/wiki/submit", this::pathWikiSubmit);

        Spark.awaitInitialization();
    }
@@ -190,7 +192,6 @@ public class EncyclopediaService extends Service {
        Files.createDirectories(filename.getParent());

        System.out.println(new String(data));
        logger.debug("Writing {} to {}", wikiUrl, filename);

        try (var gos = new GZIPOutputStream(new FileOutputStream(filename.toFile()))) {

View file

@@ -254,4 +254,29 @@ CREATE INDEX IF NOT EXISTS EC_DOMAIN_TRIO ON EC_DOMAIN (STATE, DOMAIN_ALIAS, IND
CREATE INDEX IF NOT EXISTS EC_URL_VISITED ON EC_URL (VISITED);
CREATE INDEX IF NOT EXISTS EC_URL_VISITED_STATE ON EC_URL (VISITED, STATE);
CREATE INDEX IF NOT EXISTS EC_URL_IP ON EC_URL (IP);
---;
---;
DROP TABLE IF EXISTS REF_DICTIONARY;
CREATE TABLE IF NOT EXISTS REF_DICTIONARY(
    TYPE VARCHAR(16),
    WORD VARCHAR(255),
    DEFINITION VARCHAR(255)
)
CHARACTER SET utf8mb4
COLLATE utf8mb4_unicode_ci;

CREATE INDEX IF NOT EXISTS REF_DICTIONARY_WORD ON REF_DICTIONARY (WORD);

CREATE TABLE IF NOT EXISTS REF_WIKI_TITLE(
    NAME VARCHAR(255),
    NAME_LOWER VARCHAR(255) GENERATED ALWAYS AS (LOWER(NAME)),
    REF_NAME VARCHAR(255)
)
CHARACTER SET utf8mb4
COLLATE utf8mb4_unicode_ci;

CREATE INDEX IF NOT EXISTS REF_WIKI_LOWER ON REF_WIKI_TITLE (NAME_LOWER);
CREATE INDEX IF NOT EXISTS REF_WIKI_NAME ON REF_WIKI_TITLE (NAME);

View file

@@ -18,6 +18,5 @@ CREATE TABLE IF NOT EXISTS REF_WIKI_TITLE(
CHARACTER SET utf8mb4
COLLATE utf8mb4_unicode_ci;
CREATE INDEX IF NOT EXISTS REF_WIKI_LOWER ON REF_WIKI_TITLE (NAME_LOWER);
CREATE INDEX IF NOT EXISTS REF_WIKI_NAME ON REF_WIKI_TITLE (NAME);