WIP: Killing off the Archive service; adding a new Encyclopedia service consisting largely of what Archive was, plus a few features from Assistant.

This commit is contained in:
vlofgren 2022-05-27 23:45:29 +02:00
parent 61ef2b06b0
commit e7b4ac0d34
24 changed files with 438 additions and 1333 deletions

View File

@@ -6,11 +6,11 @@ import nu.marginalia.wmsa.configuration.command.Command;
import nu.marginalia.wmsa.configuration.command.ListCommand;
import nu.marginalia.wmsa.configuration.command.StartCommand;
import nu.marginalia.wmsa.configuration.command.VersionCommand;
import nu.marginalia.wmsa.edge.archive.EdgeArchiveMain;
import nu.marginalia.wmsa.edge.assistant.EdgeAssistantMain;
import nu.marginalia.wmsa.edge.dating.DatingMain;
import nu.marginalia.wmsa.edge.index.EdgeIndexMain;
import nu.marginalia.wmsa.edge.search.EdgeSearchMain;
import nu.marginalia.wmsa.encyclopedia.EncyclopediaMain;
import nu.marginalia.wmsa.memex.MemexMain;
import nu.marginalia.wmsa.podcasts.PodcastScraperMain;
import nu.marginalia.wmsa.renderer.RendererMain;
@@ -33,11 +33,12 @@ public enum ServiceDescriptor {
EDGE_INDEX("edge-index", 5021, EdgeIndexMain.class),
EDGE_SEARCH("edge-search", 5023, EdgeSearchMain.class),
EDGE_ARCHIVE("edge-archive", 5024, EdgeArchiveMain.class),
EDGE_ASSISTANT("edge-assistant", 5025, EdgeAssistantMain.class),
EDGE_MEMEX("memex", 5030, MemexMain.class),
ENCYCLOPEDIA("encyclopedia", 5040, EncyclopediaMain.class),
DATING("dating", 5070, DatingMain.class),
TEST_1("test-1", 0, null),
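The new ENCYCLOPEDIA row is what clients resolve the service by. A minimal sketch of talking to it over the registered port, mirroring how the (now deleted) ArchiveTest wires its client; the hard-coded host is an assumption borrowed from that test:

// Point the EncyclopediaClient added later in this commit at port 5040 from the table above.
var client = new EncyclopediaClient();
client.setServiceRoute("127.0.0.1", 5040);
System.out.println(client.hasWiki(Context.internal(), "Plato").blockingFirst());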

View File

@@ -1,33 +0,0 @@
package nu.marginalia.wmsa.edge.archive;
import com.google.inject.Guice;
import com.google.inject.Inject;
import com.google.inject.Injector;
import nu.marginalia.wmsa.configuration.MainClass;
import nu.marginalia.wmsa.configuration.ServiceDescriptor;
import nu.marginalia.wmsa.configuration.module.ConfigurationModule;
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
import nu.marginalia.wmsa.configuration.server.Initialization;
public class EdgeArchiveMain extends MainClass {
private final EdgeArchiveService service;
@Inject
public EdgeArchiveMain(EdgeArchiveService service) {
this.service = service;
}
public static void main(String... args) {
init(ServiceDescriptor.EDGE_ARCHIVE, args);
Injector injector = Guice.createInjector(
new EdgeArchiveModule(),
new ConfigurationModule(),
new DatabaseModule()
);
injector.getInstance(EdgeArchiveMain.class);
injector.getInstance(Initialization.class).setReady();
}
}

View File

@@ -1,15 +0,0 @@
package nu.marginalia.wmsa.edge.archive;
import com.google.inject.AbstractModule;
import com.google.inject.name.Names;
import java.nio.file.Path;
public class EdgeArchiveModule extends AbstractModule {
public void configure() {
bind(Path.class).annotatedWith(Names.named("archive-path")).toInstance(Path.of("/var/lib/wmsa/archive/webpage/"));
bind(Path.class).annotatedWith(Names.named("wiki-path")).toInstance(Path.of("/var/lib/wmsa/archive.fast/wiki/"));
bind(Integer.class).annotatedWith(Names.named("archive-size")).toInstance(10_000);
}
}

View File

@@ -1,180 +0,0 @@
package nu.marginalia.wmsa.edge.archive;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import com.google.inject.Inject;
import com.google.inject.name.Named;
import io.prometheus.client.Histogram;
import lombok.SneakyThrows;
import nu.marginalia.wmsa.configuration.server.Initialization;
import nu.marginalia.wmsa.configuration.server.MetricsServer;
import nu.marginalia.wmsa.configuration.server.Service;
import nu.marginalia.wmsa.edge.archive.archiver.ArchivedFile;
import nu.marginalia.wmsa.edge.archive.archiver.Archiver;
import nu.marginalia.wmsa.edge.archive.request.EdgeArchiveSubmissionReq;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import spark.Request;
import spark.Response;
import spark.Spark;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.stream.Collectors;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;
public class EdgeArchiveService extends Service {
private final Logger logger = LoggerFactory.getLogger(getClass());
private final Gson gson = new GsonBuilder().create();
private static final Histogram wmsa_archive_store_time = Histogram.build().name("wmsa_archive_store_time").help("-").register();
private static final Histogram wmsa_archive_fetch_time = Histogram.build().name("wmsa_archive_fetch_time").help("-").register();
private final Path wikiPath;
private final Archiver archiver;
@SneakyThrows
@Inject
public EdgeArchiveService(@Named("service-host") String ip,
@Named("service-port") Integer port,
@Named("wiki-path") Path wikiPath,
Archiver archiver,
Initialization initialization,
MetricsServer metricsServer)
{
super(ip, port, initialization, metricsServer);
this.wikiPath = wikiPath;
this.archiver = archiver;
Spark.staticFiles.expireTime(600);
Spark.post("/page/submit", this::pathPageSubmit);
Spark.post("/wiki/submit", this::pathWikiSubmit);
Spark.get("/wiki/has", this::pathWikiHas);
Spark.get("/wiki/get", this::pathWikiGet);
Spark.awaitInitialization();
}
@SneakyThrows
private Object pathPageSubmit(Request request, Response response) {
var timer = wmsa_archive_store_time.startTimer();
try {
var body = request.body();
var data = gson.fromJson(body, EdgeArchiveSubmissionReq.class);
String domainNamePart = data.getUrl().domain.domain.length() > 32 ? data.getUrl().domain.domain.substring(0, 32) : data.getUrl().domain.domain;
String fileName = String.format("%s-%10d", domainNamePart, data.getUrl().hashCode());
archiver.writeData(new ArchivedFile(fileName, body.getBytes()));
return "ok";
} finally {
timer.observeDuration();
}
}
@SneakyThrows
private Object pathWikiSubmit(Request request, Response response) {
var timer = wmsa_archive_store_time.startTimer();
try {
byte[] data = request.bodyAsBytes();
String wikiUrl = request.queryParams("url");
Path filename = getWikiFilename(wikiPath, wikiUrl);
Files.createDirectories(filename.getParent());
System.out.println(new String(data));
logger.debug("Writing {} to {}", wikiUrl, filename);
try (var gos = new GZIPOutputStream(new FileOutputStream(filename.toFile()))) {
gos.write(data);
gos.flush();
}
return "ok";
} finally {
timer.observeDuration();
}
}
private Path getWikiFilename(Path base, String url) {
Path p = base;
int urlHash = url.hashCode();
p = p.resolve(Integer.toString(urlHash & 0xFF));
p = p.resolve(Integer.toString((urlHash>>>8) & 0xFF));
p = p.resolve(Integer.toString((urlHash>>>16) & 0xFF));
p = p.resolve(Integer.toString((urlHash>>>24) & 0xFF));
String fileName = url.chars()
.mapToObj(this::encodeUrlChar)
.collect(Collectors.joining());
if (fileName.length() > 128) {
fileName = fileName.substring(0, 128) + (((long)urlHash)&0xFFFFFFFFL);
}
return p.resolve(fileName + ".gz");
}
private String encodeUrlChar(int i) {
if (i >= 'a' && i <= 'z') {
return Character.toString(i);
}
if (i >= 'A' && i <= 'Z') {
return Character.toString(i);
}
if (i >= '0' && i <= '9') {
return Character.toString(i);
}
if (i == '.') {
return Character.toString(i);
}
else {
return String.format("%%%02X", i); // zero-pad so e.g. 0x0A encodes as %0A, not "% A"
}
}
@SneakyThrows
private Object pathWikiHas(Request request, Response response) {
return Files.exists(getWikiFilename(wikiPath, request.queryParams("url")));
}
@SneakyThrows
private String pathWikiGet(Request request, Response response) {
var timer = wmsa_archive_fetch_time.startTimer();
try {
String url = request.queryParams("url");
var filename = getWikiFilename(wikiPath, url);
if (Files.exists(filename)) {
try (var stream = new GZIPInputStream(new FileInputStream(filename.toFile()))) {
return new String(stream.readAllBytes());
}
} else {
Spark.halt(404);
return null;
}
}
finally {
timer.observeDuration();
}
}
}
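For reference, getWikiFilename spreads pages across four directory levels taken from successive bytes of the url's hashCode, then percent-encodes the name itself. A standalone sketch of the same scheme (illustration only, not part of the commit): "Plato".hashCode() is 77196032 (0x0499EB00), so the page lands at <wiki-path>/0/235/153/4/Plato.gz.

// Standalone illustration of the fan-out in getWikiFilename above.
int h = "Plato".hashCode(); // 77196032 == 0x0499EB00
Path p = Path.of("/var/lib/wmsa/archive.fast/wiki")
.resolve(Integer.toString(h & 0xFF)) // low byte 0x00 -> "0"
.resolve(Integer.toString((h >>> 8) & 0xFF)) // 0xEB -> "235"
.resolve(Integer.toString((h >>> 16) & 0xFF)) // 0x99 -> "153"
.resolve(Integer.toString((h >>> 24) & 0xFF)) // 0x04 -> "4"
.resolve("Plato.gz"); // all-alphanumeric names pass through encodeUrlChar unchanged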

View File

@@ -1,65 +0,0 @@
package nu.marginalia.wmsa.edge.archive.archiver;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import nu.marginalia.wmsa.edge.archive.request.EdgeArchiveSubmissionReq;
import nu.marginalia.wmsa.edge.model.crawl.EdgeRawPageContents;
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.BufferedInputStream;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.function.Consumer;
public class ArchiveExtractor {
private final Path archivePath;
private final String archivePattern = "archive-%04d.tar.gz";
private final Logger logger = LoggerFactory.getLogger(getClass());
private final Gson gson = new GsonBuilder().create();
public ArchiveExtractor(Path archivePath) {
this.archivePath = archivePath;
}
public void forEach(Consumer<EdgeRawPageContents> contents) {
for (int i = 0; ; ++i) {
var fn = getArchiveFile(i);
logger.info("{}", fn);
if (!Files.exists(fn)) {
break;
}
try (var stream = new TarArchiveInputStream(new GzipCompressorInputStream(new BufferedInputStream(new FileInputStream(fn.toFile()))))) {
TarArchiveEntry entry;
while ((entry = stream.getNextTarEntry()) != null) {
if (entry.isFile()) {
try {
var obj = gson.fromJson(new InputStreamReader(stream), EdgeArchiveSubmissionReq.class);
if (obj != null) {
contents.accept(obj.getData());
}
}
catch (Exception ex) {
logger.error("Could not unpack {} - {} {}", entry.getName(), ex.getClass().getSimpleName(), ex.getMessage());
}
}
}
} catch (Exception e) {
e.printStackTrace();
}
}
}
private Path getArchiveFile(int number) {
final String fileName = String.format(archivePattern, number);
return archivePath.resolve(fileName);
}
}
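Archiver (below) and ArchiveExtractor are symmetric: one drains a queue into numbered tar.gz volumes, the other replays every stored page in order. A minimal replay sketch, assuming the default path bound in EdgeArchiveModule above:

// Replay everything a previous Archiver run wrote under the default archive path.
new ArchiveExtractor(Path.of("/var/lib/wmsa/archive/webpage/"))
.forEach(page -> System.out.println(page.url));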

View File

@@ -1,5 +0,0 @@
package nu.marginalia.wmsa.edge.archive.archiver;
public record ArchivedFile(String filename, byte[] data) {
}

View File

@@ -1,113 +0,0 @@
package nu.marginalia.wmsa.edge.archive.archiver;
import com.google.inject.name.Named;
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
import org.apache.commons.compress.archivers.tar.TarArchiveOutputStream;
import org.apache.commons.compress.compressors.gzip.GzipCompressorOutputStream;
import org.apache.commons.io.IOUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.inject.Inject;
import javax.inject.Singleton;
import java.io.ByteArrayInputStream;
import java.io.FileOutputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.concurrent.LinkedBlockingDeque;
import java.util.concurrent.TimeUnit;
@Singleton
public class Archiver implements AutoCloseable {
private final Path archivePath;
private final int filesPerArchive;
private final String archivePattern = "archive-%04d.tar.gz";
private final LinkedBlockingDeque<ArchivedFile> writeQueue = new LinkedBlockingDeque<>(10);
private final Thread writeThread;
private volatile int archiveNumber;
private volatile boolean running;
private final Logger logger = LoggerFactory.getLogger(getClass());
@Inject
public Archiver(@Named("archive-path") Path archivePath, @Named("archive-size") Integer filesPerArchive) {
this.archivePath = archivePath;
this.filesPerArchive = filesPerArchive;
if (!Files.exists(archivePath)) {
throw new IllegalArgumentException("Archive path does not exist");
}
for (int i = 0;; ++i) {
if (!Files.exists(getArchiveFile(i))) {
archiveNumber = i;
break;
}
}
running = true;
writeThread = new Thread(this::writeThreadMain, "ArchiveWriteThread");
writeThread.start();
}
private Path getArchiveFile(int number) {
final String fileName = String.format(archivePattern, number);
return archivePath.resolve(fileName);
}
public void writeData(ArchivedFile file) throws InterruptedException {
if (!running) throw new IllegalStateException("Archiver is closing or closed");
writeQueue.put(file);
}
private void writeThreadMain() {
try {
while (running || !writeQueue.isEmpty()) {
writeToFile(archiveNumber);
archiveNumber++;
}
running = false;
}
catch (Exception ex) {
logger.error("Uncaught exception in writer thread!", ex);
}
}
private void writeToFile(int archiveNumber) {
var archiveFile = getArchiveFile(archiveNumber);
logger.info("Switching to file {}", archiveFile);
try (TarArchiveOutputStream taos = new TarArchiveOutputStream(new GzipCompressorOutputStream(new FileOutputStream(archiveFile.toFile())))) {
for (int i = 0; i < filesPerArchive; i++) {
ArchivedFile writeJob = null;
while (writeJob == null) {
writeJob = writeQueue.poll(1, TimeUnit.SECONDS);
if (writeJob == null && !running) return; // bail only when idle, so a freshly polled job is never dropped
}
var entry = new TarArchiveEntry(String.format("%06d-%s", i, writeJob.filename()));
entry.setSize(writeJob.data().length);
taos.putArchiveEntry(entry);
logger.debug("Writing {} to {}", writeJob.filename(), archiveFile);
try (var bais = new ByteArrayInputStream(writeJob.data())) {
IOUtils.copy(bais, taos);
}
taos.closeArchiveEntry();
}
taos.finish();
logger.debug("Finishing {}", archiveFile);
} catch (Exception e) {
logger.error("Error", e);
}
}
@Override
public void close() throws Exception {
running = false;
writeThread.join();
}
}

View File

@@ -1,56 +0,0 @@
package nu.marginalia.wmsa.edge.archive.client;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import io.reactivex.rxjava3.core.Observable;
import nu.marginalia.wmsa.client.AbstractDynamicClient;
import nu.marginalia.wmsa.client.HttpStatusCode;
import nu.marginalia.wmsa.configuration.ServiceDescriptor;
import nu.marginalia.wmsa.configuration.server.Context;
import nu.marginalia.wmsa.edge.archive.request.EdgeArchiveSubmissionReq;
import nu.marginalia.wmsa.edge.model.crawl.EdgeRawPageContents;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import okhttp3.MediaType;
import org.eclipse.jetty.util.UrlEncoded;
import javax.annotation.CheckReturnValue;
import java.util.concurrent.Semaphore;
@Singleton
public class ArchiveClient extends AbstractDynamicClient {
private final Semaphore submitPageSem = new Semaphore(3, true);
@Inject
public ArchiveClient() {
super(ServiceDescriptor.EDGE_ARCHIVE);
}
@CheckReturnValue
public void submitPage(Context ctx, EdgeUrl url, EdgeRawPageContents data) throws InterruptedException {
try {
submitPageSem.acquire();
super.post(ctx, "/page/submit", new EdgeArchiveSubmissionReq(url, data)).blockingSubscribe();
}
finally {
submitPageSem.release();
}
}
@CheckReturnValue
public Observable<HttpStatusCode> submitWiki(Context ctx, String url, String data) {
return super.post(ctx, "/wiki/submit?url="+UrlEncoded.encodeString(url), data, MediaType.parse("text/plain; charset=UTF-8"));
}
@CheckReturnValue
public Observable<Boolean> hasWiki(Context ctx, String url) {
return super.get(ctx, "/wiki/has?url="+UrlEncoded.encodeString(url), Boolean.class);
}
@CheckReturnValue
public Observable<String> getWiki(Context ctx, String url) {
return super.get(ctx, "/wiki/get?url="+UrlEncoded.encodeString(url));
}
}

View File

@@ -1,13 +0,0 @@
package nu.marginalia.wmsa.edge.archive.request;
import lombok.AllArgsConstructor;
import lombok.Getter;
import lombok.ToString;
import nu.marginalia.wmsa.edge.model.crawl.EdgeRawPageContents;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
@AllArgsConstructor @Getter @ToString
public class EdgeArchiveSubmissionReq {
EdgeUrl url;
EdgeRawPageContents data;
}

View File

@@ -4,36 +4,27 @@ import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import com.google.inject.Inject;
import com.google.inject.name.Named;
import io.reactivex.rxjava3.core.Observable;
import lombok.SneakyThrows;
import nu.marginalia.wmsa.configuration.server.*;
import nu.marginalia.wmsa.edge.archive.client.ArchiveClient;
import nu.marginalia.wmsa.configuration.server.Initialization;
import nu.marginalia.wmsa.configuration.server.MetricsServer;
import nu.marginalia.wmsa.configuration.server.Service;
import nu.marginalia.wmsa.edge.assistant.dict.DictionaryService;
import nu.marginalia.wmsa.edge.assistant.eval.MathParser;
import nu.marginalia.wmsa.edge.assistant.eval.Units;
import nu.marginalia.wmsa.edge.assistant.screenshot.ScreenshotService;
import nu.marginalia.wmsa.edge.assistant.suggest.Suggestions;
import nu.marginalia.wmsa.renderer.mustache.MustacheRenderer;
import nu.marginalia.wmsa.renderer.mustache.RendererFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import spark.Request;
import spark.Response;
import spark.Spark;
import java.util.Map;
public class EdgeAssistantService extends Service {
private final Logger logger = LoggerFactory.getLogger(getClass());
private final Gson gson = new GsonBuilder().create();
private final Units units;
private final DictionaryService dictionaryService;
private final MathParser mathParser;
private final ArchiveClient archiveClient;
private final ScreenshotService screenshotService;
private final MustacheRenderer<String> wikiErrorPageRenderer;
private final MustacheRenderer<Object> wikiSearchResultRenderer;
private final Suggestions suggestions;
@SneakyThrows
@@ -45,40 +36,22 @@ public class EdgeAssistantService extends Service {
DictionaryService dictionaryService,
MathParser mathParser,
Units units,
ArchiveClient archiveClient,
RendererFactory rendererFactory,
ScreenshotService screenshotService,
Suggestions suggestions
)
{
super(ip, port, initialization, metricsServer);
this.dictionaryService = dictionaryService;
this.mathParser = mathParser;
this.units = units;
this.archiveClient = archiveClient;
this.screenshotService = screenshotService;
this.suggestions = suggestions;
Spark.staticFiles.expireTime(600);
if (rendererFactory != null) {
wikiErrorPageRenderer = rendererFactory.renderer("encyclopedia/wiki-error");
wikiSearchResultRenderer = rendererFactory.renderer("encyclopedia/wiki-search");
}
else {
wikiErrorPageRenderer = null;
wikiSearchResultRenderer = null;
}
Spark.get("/public/wiki/*", this::getWikiPage);
Spark.get("/public/wiki-search", this::searchWikiPage);
Spark.get("/public/screenshot/:id", screenshotService::serveScreenshotRequest);
Spark.get("/screenshot/:id", screenshotService::serveScreenshotRequest);
Spark.get("/dictionary/:word", (req, rsp) -> dictionaryService.define(req.params("word")), this::convertToJson);
Spark.get("/spell-check/:term", (req, rsp) -> dictionaryService.spellCheck(req.params("term").toLowerCase()), this::convertToJson);
Spark.get("/encyclopedia/:term", (req, rsp) -> dictionaryService.encyclopedia(req.params("term")), this::convertToJson);
Spark.get("/unit-conversion", (req, rsp) -> unitConversion(
rsp,
req.queryParams("value"),
@@ -106,57 +79,6 @@ public class EdgeAssistantService extends Service {
return suggestions.getSuggestions(10, param);
}
@SneakyThrows
private Object getWikiPage(Request req, Response rsp) {
final var ctx = Context.fromRequest(req);
final String[] splats = req.splat();
if (splats.length == 0) {
rsp.redirect("https://encyclopedia.marginalia.nu/wiki-start.html");
return ""; // Spark's redirect does not halt the route, and splats[0] below would throw
}
final String s = splats[0];
String pageName = dictionaryService.resolveEncyclopediaRedirect(s).orElse(s);
logger.info("Resolved {} -> {}", s, pageName);
return archiveClient.getWiki(ctx, pageName)
.onErrorResumeWith(resolveWikiPageNameWrongCase(ctx, s))
.blockingFirst();
}
private Observable<String> resolveWikiPageNameWrongCase(Context ctx, String s) {
var rsp = dictionaryService.findEncyclopediaPageDirect(s);
if (rsp.isEmpty()) {
return renderSearchPage(s);
}
return archiveClient.getWiki(ctx, rsp.get().getInternalName())
.onErrorResumeWith(renderSearchPage(s));
}
private Observable<String> renderSearchPage(String s) {
return Observable.fromCallable(() -> wikiSearchResultRenderer.render(
Map.of("query", s,
"error", "true",
"results", dictionaryService.findEncyclopediaPages(s))));
}
@SneakyThrows
private Object searchWikiPage(Request req, Response rsp) {
final var ctx = Context.fromRequest(req);
String term = req.queryParams("query");
if (null == term) {
rsp.redirect("https://encyclopedia.marginalia.nu/wiki-start.html");
return "";
}
return wikiSearchResultRenderer.render(
Map.of("query", term,
"results",
dictionaryService.findEncyclopediaPages(term))
);
}
private Object evalExpression(Response rsp, String value) {
try {
var val = mathParser.evalFormatted(value);

View File

@@ -24,10 +24,6 @@ public class AssistantClient extends AbstractDynamicClient {
return super.get(ctx,"/dictionary/" + UrlEncoded.encodeString(word), DictionaryResponse.class);
}
public Observable<WikiArticles> encyclopediaLookup(Context ctx, String word) {
return super.get(ctx,"/encyclopedia/" + UrlEncoded.encodeString(word), WikiArticles.class);
}
@SuppressWarnings("unchecked")
public Observable<List<String>> spellCheck(Context ctx, String word) {
return (Observable<List<String>>) (Object) super.get(ctx,"/spell-check/" + UrlEncoded.encodeString(word), List.class);

View File

@@ -43,142 +43,6 @@ public class DictionaryService {
return response;
}
public WikiArticles encyclopedia(String term) {
WikiArticles response = new WikiArticles();
response.entries = new ArrayList<>();
try (var connection = dataSource.getConnection()) {
var stmt = connection.prepareStatement("SELECT DISTINCT(NAME_LOWER) FROM REF_WIKI_TITLE WHERE NAME_LOWER=?");
stmt.setString(1, term);
var rsp = stmt.executeQuery();
while (rsp.next()) {
response.entries.add(capitalizeWikiString(rsp.getString(1)));
}
}
catch (Exception ex) {
logger.error("Failed to fetch articles", ex);
return new WikiArticles();
}
return response;
}
public Optional<String> resolveEncyclopediaRedirect(String term) {
final List<String> matches = new ArrayList<>();
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement("SELECT NAME, REF_NAME FROM REF_WIKI_TITLE WHERE NAME_LOWER=LOWER(?)")) {
stmt.setString(1, term);
var rsp = stmt.executeQuery();
while (rsp.next()) {
if (term.equals(rsp.getString(1))
|| rsp.getString(2) == null) {
return Optional.ofNullable(rsp.getString(2));
} else {
matches.add(rsp.getString(2));
}
}
}
}
catch (Exception ex) {
throw new RuntimeException(ex);
}
if (!matches.isEmpty()) {
return Optional.of(matches.get(0));
}
return Optional.empty();
}
public Optional<WikiSearchResult> findEncyclopediaPageDirect(String term) {
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement("SELECT NAME, REF_NAME FROM REF_WIKI_TITLE WHERE NAME_LOWER=LOWER(?)")) {
stmt.setString(1, term.replace(' ', '_'));
var rsp = stmt.executeQuery();
while (rsp.next()) {
String name = rsp.getString(1);
String refName = rsp.getString(2);
if (refName == null) {
return Optional.of(new WikiSearchResult(name, null));
}
}
}
}
catch (Exception ex) {
throw new RuntimeException(ex);
}
return Optional.empty();
}
public List<WikiSearchResult> findEncyclopediaPages(String term) {
final List<WikiSearchResult> directMatches = new ArrayList<>();
final Set<WikiSearchResult> directSearchMatches = new HashSet<>();
final Set<WikiSearchResult> indirectMatches = new HashSet<>();
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement("SELECT NAME, REF_NAME FROM REF_WIKI_TITLE WHERE NAME_LOWER=LOWER(?)")) {
stmt.setString(1, term.replace(' ', '_'));
var rsp = stmt.executeQuery();
while (rsp.next()) {
String name = rsp.getString(1);
String refName = rsp.getString(2);
if (refName == null) {
directMatches.add(new WikiSearchResult(name, null));
} else {
indirectMatches.add(new WikiSearchResult(name, refName));
}
}
}
try (var stmt = connection.prepareStatement("SELECT NAME, REF_NAME FROM REF_WIKI_TITLE WHERE NAME_LOWER LIKE ? LIMIT 10")) {
stmt.setString(1, term.replace(' ', '_').replace("%", "\\%").toLowerCase() + "%"); // escape literal '%' so it is not treated as a LIKE wildcard
var rsp = stmt.executeQuery();
while (rsp.next()) {
String name = rsp.getString(1);
String refName = rsp.getString(2);
if (refName == null) {
directSearchMatches.add(new WikiSearchResult(name, null));
} else {
indirectMatches.add(new WikiSearchResult(name, refName));
}
}
}
}
catch (Exception ex) {
throw new RuntimeException(ex);
}
directMatches.forEach(indirectMatches::remove);
indirectMatches.removeAll(directSearchMatches);
directMatches.forEach(directSearchMatches::remove);
directMatches.addAll(indirectMatches);
directMatches.addAll(directSearchMatches);
return directMatches;
}
private String capitalizeWikiString(String string) {
if (string.contains("_")) {
return Arrays.stream(string.split("_")).map(this::capitalizeWikiString).collect(Collectors.joining("_"));
}
if (string.length() < 2) {
return string.toUpperCase();
}
return Character.toUpperCase(string.charAt(0)) + string.substring(1).toLowerCase();
}
public List<String> spellCheck(String word) {
return spellChecker.correct(word);
}

View File

@@ -20,6 +20,7 @@ import nu.marginalia.wmsa.edge.search.results.SearchResultValuator;
import nu.marginalia.wmsa.edge.search.results.model.AccumulatedQueryResults;
import nu.marginalia.wmsa.edge.search.results.SearchResultDecorator;
import nu.marginalia.wmsa.edge.search.results.UrlDeduplicator;
import nu.marginalia.wmsa.encyclopedia.EncyclopediaClient;
import org.apache.logging.log4j.util.Strings;
import org.jetbrains.annotations.NotNull;
import org.slf4j.Logger;
@@ -33,6 +34,7 @@ public class EdgeSearchOperator {
private static final Logger logger = LoggerFactory.getLogger(EdgeSearchOperator.class);
private final AssistantClient assistantClient;
private final EncyclopediaClient encyclopediaClient;
private final EdgeDataStoreDao edgeDataStoreDao;
private final EdgeIndexClient indexClient;
private final QueryFactory queryFactory;
@@ -42,6 +44,7 @@
@Inject
public EdgeSearchOperator(AssistantClient assistantClient,
EncyclopediaClient encyclopediaClient,
EdgeDataStoreDao edgeDataStoreDao,
EdgeIndexClient indexClient,
QueryFactory queryFactory,
@@ -50,6 +53,7 @@
) {
this.assistantClient = assistantClient;
this.encyclopediaClient = encyclopediaClient;
this.edgeDataStoreDao = edgeDataStoreDao;
this.indexClient = indexClient;
this.queryFactory = queryFactory;
@@ -220,7 +224,7 @@
@NotNull
private Observable<WikiArticles> getWikiArticle(Context ctx, String humanQuery) {
return assistantClient
return encyclopediaClient
.encyclopediaLookup(ctx,
humanQuery.replaceAll("\\s+", "_")
.replaceAll("\"", "")

View File

@@ -1,384 +0,0 @@
package nu.marginalia.wmsa.edge.tools;
import com.zaxxer.hikari.HikariDataSource;
import gnu.trove.map.hash.TIntObjectHashMap;
import gnu.trove.map.hash.TObjectIntHashMap;
import lombok.AllArgsConstructor;
import lombok.SneakyThrows;
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
import nu.marginalia.wmsa.edge.archive.archiver.ArchiveExtractor;
import nu.marginalia.wmsa.edge.assistant.dict.NGramDict;
import nu.marginalia.wmsa.edge.converting.processor.logic.LinkParser;
import nu.marginalia.util.language.LanguageFilter;
import nu.marginalia.util.language.WordPatterns;
import nu.marginalia.util.language.conf.LanguageModels;
import nu.marginalia.util.language.processing.DocumentKeywordExtractor;
import nu.marginalia.util.language.processing.SentenceExtractor;
import nu.marginalia.wmsa.edge.converting.processor.logic.HtmlFeature;
import nu.marginalia.wmsa.edge.converting.processor.logic.HtmlStandardExtractor;
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
import nu.marginalia.wmsa.edge.index.service.dictionary.DictionaryWriter;
import nu.marginalia.wmsa.edge.index.service.index.SearchIndexWriterImpl;
import nu.marginalia.wmsa.edge.model.*;
import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard;
import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet;
import nu.marginalia.wmsa.edge.model.crawl.EdgeRawPageContents;
import org.apache.commons.lang3.tuple.Pair;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.mariadb.jdbc.Driver;
import java.io.File;
import java.nio.file.Path;
import java.sql.SQLException;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.LinkedBlockingQueue;
import static nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard.UNKNOWN;
public class ConverterMain {
static final LinkedBlockingQueue<EdgeRawPageContents> processQueue = new LinkedBlockingQueue<>(20);
static final LinkedBlockingQueue<UploadJob> uploadQueue = new LinkedBlockingQueue<>(2);
static final TObjectIntHashMap<String> urlToIdMap = new TObjectIntHashMap<>(50_000_000, 0.5f, -1);
static final TObjectIntHashMap<String> domainToIdMap = new TObjectIntHashMap<>(5_000_000, 0.5f, -1);
static final TIntObjectHashMap<String> idToDomainMap = new TIntObjectHashMap<>(5_000_000, 0.5f, -1);
static HikariDataSource conn;
private static SearchIndexWriterImpl indexWriter;
private static DictionaryWriter dictionaryWriter;
@AllArgsConstructor
static class UploadJob {
EdgeId<EdgeDomain> domainId;
EdgeId<EdgeUrl> urlId;
EdgePageWordSet words;
int wordCount;
}
static volatile boolean running = true;
public static void main(String... args) {
org.mariadb.jdbc.Driver driver = new Driver();
dictionaryWriter = new DictionaryWriter(new File(args[0]), 1L << 30, true);
indexWriter = new SearchIndexWriterImpl(dictionaryWriter, new File(args[1]));
new Thread(ConverterMain::uploadThread, "Uploader").start();
for (int i = 0; i < 24; i++) {
new Thread(ConverterMain::processorThread, "Processor-"+i).start();
}
conn = new DatabaseModule().provideConnection();
System.out.println("Loading URLs and domains");
try (var c = conn.getConnection();
var getUrlsStmt = c.prepareStatement("SELECT EC_URL.ID, DOMAIN_ID, PROTO, URL FROM EC_URL WHERE VISITED");
var getDomainsStmt = c.prepareStatement("SELECT ID, URL_PART FROM EC_DOMAIN WHERE INDEXED>0")
) {
getUrlsStmt.setFetchSize(10_000);
getDomainsStmt.setFetchSize(10_000);
System.out.println("Fetch domains");
var domainRsp = getDomainsStmt.executeQuery();
while (domainRsp.next()) {
domainToIdMap.put(domainRsp.getString(2), domainRsp.getInt(1));
idToDomainMap.put(domainRsp.getInt(1), domainRsp.getString(2));
}
System.out.println("Fetch URLs");
var urlRsp = getUrlsStmt.executeQuery();
while (urlRsp.next()) {
String urlStr = urlRsp.getString(3) + "://" + idToDomainMap.get(urlRsp.getInt(2)) + urlRsp.getString(4);
urlToIdMap.put(urlStr, urlRsp.getInt(1));
}
} catch (SQLException throwables) {
throwables.printStackTrace();
}
// new Thread(ConverterMain::uploadThread, "Uploader").start();
//
// for (int i = 0; i < 24; i++) {
// new Thread(ConverterMain::processorThread, "Processor-"+i).start();
// }
System.out.println("Loaded URLs and domains");
new ArchiveExtractor(Path.of(args[2])).forEach(
page -> {
if (page.contentType.contentType.startsWith("application/xhtml")
|| page.contentType.contentType.startsWith("text/html")) {
try {
int domainId = domainToIdMap.get(page.url.domain.toString());
if (domainId >= 0 && page.redirectUrl == null) {
int urlId = urlToIdMap.get(page.url.toString());
int dataHash = page.data.hashCode();
try (var c = conn.getConnection();
var updateHash = c.prepareStatement("UPDATE EC_URL SET DATA_HASH=? WHERE ID=?"))
{
updateHash.setInt(1, dataHash);
updateHash.setInt(2, urlId);
updateHash.executeUpdate();
}
catch (Exception ex) {
ex.printStackTrace();
}
}
} catch (Exception e) {
e.printStackTrace();
}
}
});
running = false;
}
static final LanguageModels lm = new LanguageModels(
Path.of("/var/lib/wmsa/model/ngrams-generous-emstr.bin"),
Path.of("/var/lib/wmsa/model/tfreq-new-algo3.bin"),
Path.of("/var/lib/wmsa/model/opennlp-sentence.bin"),
Path.of("/var/lib/wmsa/model/English.RDR"),
Path.of("/var/lib/wmsa/model/English.DICT"),
Path.of("/var/lib/wmsa/model/opennlp-tok.bin")
);
static final NGramDict dict = new NGramDict(lm);
private static final LanguageFilter languageFilter = new LanguageFilter();
private static final LinkParser linkParser = new LinkParser();
public static void processorThread() {
SentenceExtractor newSe = new SentenceExtractor(lm);
DocumentKeywordExtractor documentKeywordExtractor = new DocumentKeywordExtractor(dict);
try {
while (running || !processQueue.isEmpty()) {
var job = processQueue.take();
if (job.data.length() > 512*1024) {
System.out.println(job.url + " too big, skipping");
continue;
}
var parsed = Jsoup.parse(job.data);
var text = parsed.text();
if (languageFilter.isBlockedUnicodeRange(text)) {
continue;
}
var dld = newSe.extractSentences(parsed.clone());
var keywords = documentKeywordExtractor.extractKeywords(dld);
int wc = dld.totalNumWords();
if (wc > 100) {
double languageAgreement = languageFilter.dictionaryAgreement(dld);
if (languageAgreement < 0.05) {
continue;
}
}
EdgeHtmlStandard htmlStandard = HtmlStandardExtractor.parseDocType(parsed.documentType());
if (UNKNOWN.equals(htmlStandard)) {
htmlStandard = HtmlStandardExtractor.sniffHtmlStandard(parsed);
}
int scriptTags = getScriptPenalty(parsed);
var featureSet = getFeatureSet(parsed, scriptTags, job.hasCookies);
addTags(keywords, htmlStandard, job.url, featureSet);
extractLinkWords(keywords, job.getUrl(), parsed);
uploadQueue.put(new UploadJob(
new EdgeId<>(domainToIdMap.get(job.url.domain.toString())),
new EdgeId<>(urlToIdMap.get(job.url.toString())),
keywords,
0
));
}
}
catch (InterruptedException ex) {
ex.printStackTrace();
}
}
private static Map<EdgeUrl, Set<String>> extractLinkWords(EdgePageWordSet keywords, EdgeUrl pageUrl, Document parsed) {
List<Pair<EdgeUrl, String>> urls = new ArrayList<>();
Set<String> linkKeywords = new HashSet<>();
Map<EdgeUrl, Set<String>> linkTextWords = new ConcurrentHashMap<>();
for (var tag : parsed.getElementsByTag("a")) {
if (!tag.hasAttr("href")) {
continue;
}
if (urls.size() > 100) {
break;
}
var linkOpt = linkParser.parseLink(pageUrl, tag);
if (linkOpt.isEmpty())
continue;
var link = linkOpt.get();
urls.add(Pair.of(link, tag.text()));
if (!Objects.equals(link.domain.domain, pageUrl.domain.domain)
&& linkKeywords.size() <= 25)
{
linkKeywords.add("links:" + link.domain.domain);
}
//
// Set<String> words = new HashSet<>();
//
// for (var sent : sentenceExtractor.extractSentencesFromString(tag.text())) {
// for (var keyword : keywordExtractor.getWordsFromSentence(sent)) {
// words.add(sent.constructWordFromSpan(keyword));
// }
// }
//
// linkTextWords.compute(link, (k, set) -> {
// if (set == null) return words;
// else { set.addAll(words); return set; }
// });
}
keywords.get(IndexBlock.Meta).addAll(linkKeywords);
if (WordPatterns.wordQualitiesPredicate.test(pageUrl.domain.domain.toLowerCase())) {
keywords.get(IndexBlock.Link).addJust(pageUrl.domain.domain.toLowerCase());
}
return linkTextWords;
}
private static int getScriptPenalty(Document parsed) {
var scriptTags = parsed.getElementsByTag("script");
String scriptText = scriptTags.html();
int badScript = 0;
if (scriptText.contains(".createElement(")) {
badScript = 1;
}
return scriptTags.size() + badScript + (scriptText.length())/1000;
}
static final List<String> trackers = List.of("adform.net",
"connect.facebook",
"googletagmanager.com",
"googlesyndication.com",
"google.com",
"twitter.com",
"smartadserver.com",
"doubleclick.com",
"2mdn.com",
"dmtry.com",
"bing.com",
"msn.com",
"amazon-adsystem.com",
"alexametrics.com",
"rubiconproject.com",
"chango.com",
"d5nxst8fruw4z.cloudfront.net",
"d31qbv1cthcecs.cloudfront.net",
"linkedin.com");
private static Set<HtmlFeature> getFeatureSet(Document parsed, int scriptTags, boolean cookies) {
Set<HtmlFeature> features = new HashSet<>();
if (scriptTags > 0) {
features.add(HtmlFeature.JS);
}
if (!parsed.getElementsByTag("object").isEmpty()
|| !parsed.getElementsByTag("audio").isEmpty()
|| !parsed.getElementsByTag("video").isEmpty()) {
features.add(HtmlFeature.MEDIA);
}
if (parsed.getElementsByTag("script").stream()
.filter(tag -> tag.attr("src") != null)
.anyMatch(tag -> trackers.stream().anyMatch(tracker -> tag.attr("src").contains(tracker)))) {
features.add(HtmlFeature.TRACKING);
}
if (parsed.getElementsByTag("script").html().contains("google-analytics.com")) {
features.add(HtmlFeature.TRACKING);
}
if (parsed.getElementsByTag("a").stream().map(e -> e.attr("href"))
.filter(Objects::nonNull)
.map(String::toLowerCase)
.anyMatch(href ->
href.contains("amzn.to/") || href.contains("amazon.com/"))) {
features.add(HtmlFeature.AFFILIATE_LINK);
}
if (cookies) {
features.add(HtmlFeature.COOKIES);
}
return features;
}
private static void addTags(EdgePageWordSet wordSet, EdgeHtmlStandard htmlStandard, EdgeUrl url, Set<HtmlFeature> features) {
List<String> tagWords = new ArrayList<>();
tagWords.add("format:"+htmlStandard.toString().toLowerCase());
tagWords.add("site:"+url.domain.toString().toLowerCase());
tagWords.add("proto:"+url.proto.toLowerCase());
tagWords.add("js:" + Boolean.toString(features.contains(HtmlFeature.JS)).toLowerCase());
if (features.contains(HtmlFeature.MEDIA)) {
tagWords.add("special:media");
}
if (features.contains(HtmlFeature.TRACKING)) {
tagWords.add("special:tracking");
}
if (features.contains(HtmlFeature.AFFILIATE_LINK)) {
tagWords.add("special:affiliate");
}
if (features.contains(HtmlFeature.COOKIES)) {
tagWords.add("special:cookies");
}
wordSet.append(IndexBlock.Meta, tagWords);
wordSet.append(IndexBlock.Words, tagWords);
}
@SneakyThrows
public static void uploadThread() {
while (running || !processQueue.isEmpty() || !uploadQueue.isEmpty()) {
var data = uploadQueue.take();
if (!data.words.isEmpty()) {
for (var words : data.words.values()) {
if (!words.getWords().isEmpty()) {
if (words.size() < 1000) {
indexWriter.put(data.domainId, data.urlId, words.block, words.words);
} else {
chunks(words.words, 1000).forEach(chunk -> {
indexWriter.put(data.domainId, data.urlId, words.block, chunk);
});
}
}
}
}
}
System.out.println("Closing");
dictionaryWriter.commitToDisk();
indexWriter.forceWrite();
dictionaryWriter.close();
indexWriter.close();
System.out.println("Done");
}
private static <T> List<List<T>> chunks(Collection<T> coll, int size) {
List<List<T>> ret = new ArrayList<>();
List<T> data = List.copyOf(coll);
for (int i = 0; i < data.size(); i+=size) {
ret.add(data.subList(i, Math.min(data.size(), i+size)));
}
return ret;
}
}

View File

@@ -1,142 +0,0 @@
package nu.marginalia.wmsa.edge.tools;
import gnu.trove.set.hash.TLongHashSet;
import lombok.SneakyThrows;
import nu.marginalia.wmsa.edge.archive.archiver.ArchiveExtractor;
import nu.marginalia.wmsa.edge.assistant.dict.NGramDict;
import nu.marginalia.util.language.conf.LanguageModels;
import nu.marginalia.util.language.processing.KeywordExtractor;
import nu.marginalia.util.language.processing.SentenceExtractor;
import nu.marginalia.wmsa.edge.model.crawl.EdgeRawPageContents;
import opennlp.tools.stemmer.PorterStemmer;
import org.jsoup.Jsoup;
import java.io.BufferedOutputStream;
import java.io.DataOutputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.nio.file.Path;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.atomic.AtomicLong;
public class TermFrequencyCounterMain {
static final LinkedBlockingQueue<EdgeRawPageContents> processQueue = new LinkedBlockingQueue<>(20);
public static final String OUTPUT_FILE = "/var/lib/wmsa/archive/tfreq-2022-04-04.bin";
public static final String ARCHIVE_PATH = "/var/lib/wmsa/archive/webpage"; // "/mnt/storage/wmsa/archive/webpage/"
@SneakyThrows
public static void main(String... args) {
List<Thread> pt = new ArrayList<>();
for (int i = 0; i < 20; i++) {
pt.add(new Thread(TermFrequencyCounterMain::processorThread));
}
pt.forEach(Thread::start);
AtomicLong docsTotal = new AtomicLong();
new ArchiveExtractor(Path.of(ARCHIVE_PATH)).forEach(
page -> {
if (page.contentType.contentType.contains("html")
&& page.isAfter("2022-03-15T")) {
try {
long dt = docsTotal.incrementAndGet();
if ((dt % 10_000) == 0) { // log progress periodically
System.out.println(docsTotal.get() + " - " + termFreq.size());
}
if ((dt % 5) != 0) {
processQueue.put(page);
}
} catch (InterruptedException e) {
e.printStackTrace();
}
}
});
running = false;
System.out.println("Waiting for wrap-up");
Thread.sleep(36000);
for (Thread thread : pt) {
thread.interrupt();
}
for (Thread thread : pt) {
thread.join();
}
System.out.println("Total documents = " + docsTotal.get());
System.out.println("Writing Frequencies");
try (var dos = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(OUTPUT_FILE)))
) {
synchronized (termFreq) {
for (var entry : termFreq.entrySet()) {
if (entry.getValue() > 5) {
dos.writeLong(entry.getKey());
dos.writeLong(entry.getValue());
}
}
}
} catch (IOException e) {
e.printStackTrace();
}
System.out.println("All done!");
}
public static final ConcurrentHashMap<Long, Integer> termFreq = new ConcurrentHashMap<>();
public static final LanguageModels lm = new LanguageModels(
Path.of("/var/lib/wmsa/model/ngrams-generous-emstr.bin"),
Path.of("/var/lib/wmsa/model/tfreq-generous-emstr.bin"),
Path.of("/var/lib/wmsa/model/opennlp-sentence.bin"),
Path.of("/var/lib/wmsa/model/English.RDR"),
Path.of("/var/lib/wmsa/model/English.DICT"),
Path.of("/var/lib/wmsa/model/opennlp-tok.bin")
);
public static volatile boolean running = true;
public static void processorThread() {
var ke = new KeywordExtractor();
var se = new SentenceExtractor(lm);
var ps = new PorterStemmer();
try {
TLongHashSet words = new TLongHashSet(10000);
while (running || !processQueue.isEmpty()) {
var job = processQueue.take();
var sentence = se.extractSentences(Jsoup.parse(job.data));
for (var sent : sentence.sentences) {
var keywords = ke.getKeywordsFromSentence(sent);
for (int i = 0; i < keywords.length; i++) {
if (keywords[i].size() > 1) {
words.add(NGramDict.longHash(sent.constructStemmedWordFromSpan(keywords[i]).getBytes()));
}
}
for (String word : sent.wordsLowerCase) {
words.add(NGramDict.longHash(ps.stem(word).getBytes()));
}
words.forEach(l -> {
termFreq.merge(l, 1, Integer::sum);
return true;
});
words.clear();
}
}
}
catch (InterruptedException ex) {
ex.printStackTrace();
}
}
}
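The output file is nothing more than consecutive (hash, count) pairs of big-endian longs, as written by the DataOutputStream above. A hypothetical spot-check reader under that assumption:

// Hypothetical reader for the (long hash, long count) pairs in OUTPUT_FILE.
try (var dis = new DataInputStream(new BufferedInputStream(new FileInputStream(OUTPUT_FILE)))) {
while (dis.available() > 0) {
System.out.println(dis.readLong() + "\t" + dis.readLong()); // hash, then count
}
}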

View File

@@ -2,10 +2,10 @@ package nu.marginalia.wmsa.edge.tools;
import lombok.AllArgsConstructor;
import lombok.SneakyThrows;
import nu.marginalia.wmsa.edge.archive.client.ArchiveClient;
import nu.marginalia.wmsa.edge.assistant.dict.NGramDict;
import nu.marginalia.wmsa.edge.assistant.dict.WikiCleaner;
import nu.marginalia.util.language.conf.LanguageModels;
import nu.marginalia.wmsa.encyclopedia.EncyclopediaClient;
import org.jsoup.Jsoup;
import org.openzim.ZIMTypes.ZIMFile;
import org.openzim.ZIMTypes.ZIMReader;
@@ -25,7 +25,7 @@ public class ZimConverterMain {
static final LinkedBlockingQueue<ConversionJob> jobQueue = new LinkedBlockingQueue<>(100);
static final LinkedBlockingQueue<String> analysisQueue = new LinkedBlockingQueue<>(100);
static boolean hasData = true;
static final ArchiveClient archiveClient = new ArchiveClient();
static final EncyclopediaClient encyclopediaClient = new EncyclopediaClient();
static NGramDict dict = new NGramDict(new LanguageModels(
Path.of("/var/lib/wmsa/model/ngrams-generous-emstr.bin"),
Path.of("/var/lib/wmsa/model/tfreq-generous-emstr.bin"),
@@ -60,7 +60,7 @@
// convertJust("Plotinus");
// convertJust("C++");
convertAll(args);
archiveClient.close();
encyclopediaClient.close();
}
@SneakyThrows
@@ -108,7 +108,7 @@
}
private static void convertAll(String[] args) throws IOException {
archiveClient.setServiceRoute("127.0.0.1", Integer.parseInt(args[0]));
encyclopediaClient.setServiceRoute("127.0.0.1", Integer.parseInt(args[0]));
var zr = new ZIMReader(new ZIMFile(args[1]));
// var zr = new ZIMReader(new ZIMFile("/home/vlofgren/Work/wikipedia_en_all_nopic_2021-01.zim"));
@@ -142,7 +142,7 @@
}, p -> true);
hasData = false;
archiveClient.close();
encyclopediaClient.close();
}
@SneakyThrows

View File

@@ -0,0 +1,34 @@
package nu.marginalia.wmsa.encyclopedia;
import io.reactivex.rxjava3.core.Observable;
import nu.marginalia.wmsa.client.AbstractDynamicClient;
import nu.marginalia.wmsa.client.HttpStatusCode;
import nu.marginalia.wmsa.configuration.ServiceDescriptor;
import nu.marginalia.wmsa.configuration.server.Context;
import nu.marginalia.wmsa.edge.assistant.dict.WikiArticles;
import okhttp3.MediaType;
import org.eclipse.jetty.util.UrlEncoded;
import javax.annotation.CheckReturnValue;
public class EncyclopediaClient extends AbstractDynamicClient {
public EncyclopediaClient() {
super(ServiceDescriptor.ENCYCLOPEDIA);
}
@CheckReturnValue
public Observable<HttpStatusCode> submitWiki(Context ctx, String url, String data) {
return super.post(ctx, "/wiki/submit?url="+UrlEncoded.encodeString(url), data, MediaType.parse("text/plain; charset=UTF-8"));
}
@CheckReturnValue
public Observable<Boolean> hasWiki(Context ctx, String url) {
return super.get(ctx, "/wiki/has?url="+ UrlEncoded.encodeString(url), Boolean.class);
}
@CheckReturnValue
public Observable<WikiArticles> encyclopediaLookup(Context ctx, String word) {
return super.get(ctx,"/encyclopedia/" + UrlEncoded.encodeString(word), WikiArticles.class);
}
}
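encyclopediaLookup is the method carried over from AssistantClient; the wiki endpoints come from the old ArchiveClient. A round-trip sketch in the style of the deleted ArchiveTest, given a client whose route has already been set with setServiceRoute:

// Submit a rendered page under a url key, then probe for it through the same service.
var ctx = Context.internal();
client.submitWiki(ctx, "Plato_(Disambiguation)", "<h1>Hello</h1>").blockingFirst();
System.out.println(client.hasWiki(ctx, "Plato_(Disambiguation)").blockingFirst()); // true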

View File

@@ -0,0 +1,160 @@
package nu.marginalia.wmsa.encyclopedia;
import com.google.inject.Inject;
import com.zaxxer.hikari.HikariDataSource;
import nu.marginalia.wmsa.edge.assistant.dict.WikiArticles;
import nu.marginalia.wmsa.edge.assistant.dict.WikiSearchResult;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.*;
import java.util.stream.Collectors;
public class EncyclopediaDao {
private final HikariDataSource dataSource;
private static final Logger logger = LoggerFactory.getLogger(EncyclopediaDao.class);
@Inject
public EncyclopediaDao(HikariDataSource dataSource) {
this.dataSource = dataSource;
}
public WikiArticles encyclopedia(String term) {
WikiArticles response = new WikiArticles();
response.entries = new ArrayList<>();
try (var connection = dataSource.getConnection()) {
var stmt = connection.prepareStatement("SELECT DISTINCT(NAME_LOWER) FROM REF_WIKI_TITLE WHERE NAME_LOWER=?");
stmt.setString(1, term);
var rsp = stmt.executeQuery();
while (rsp.next()) {
response.entries.add(capitalizeWikiString(rsp.getString(1)));
}
}
catch (Exception ex) {
logger.error("Failed to fetch articles", ex);
return new WikiArticles();
}
return response;
}
public Optional<String> resolveEncyclopediaRedirect(String term) {
final List<String> matches = new ArrayList<>();
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement("SELECT NAME, REF_NAME FROM REF_WIKI_TITLE WHERE NAME_LOWER=LOWER(?)")) {
stmt.setString(1, term);
var rsp = stmt.executeQuery();
while (rsp.next()) {
if (term.equals(rsp.getString(1))
|| rsp.getString(2) == null) {
return Optional.ofNullable(rsp.getString(2));
} else {
matches.add(rsp.getString(2));
}
}
}
}
catch (Exception ex) {
throw new RuntimeException(ex);
}
if (!matches.isEmpty()) {
return Optional.of(matches.get(0));
}
return Optional.empty();
}
public Optional<WikiSearchResult> findEncyclopediaPageDirect(String term) {
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement("SELECT NAME, REF_NAME FROM REF_WIKI_TITLE WHERE NAME_LOWER=LOWER(?)")) {
stmt.setString(1, term.replace(' ', '_'));
var rsp = stmt.executeQuery();
while (rsp.next()) {
String name = rsp.getString(1);
String refName = rsp.getString(2);
if (refName == null) {
return Optional.of(new WikiSearchResult(name, null));
}
}
}
}
catch (Exception ex) {
throw new RuntimeException(ex);
}
return Optional.empty();
}
public List<WikiSearchResult> findEncyclopediaPages(String term) {
final List<WikiSearchResult> directMatches = new ArrayList<>();
final Set<WikiSearchResult> directSearchMatches = new HashSet<>();
final Set<WikiSearchResult> indirectMatches = new HashSet<>();
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement("SELECT NAME, REF_NAME FROM REF_WIKI_TITLE WHERE NAME_LOWER=LOWER(?)")) {
stmt.setString(1, term.replace(' ', '_'));
var rsp = stmt.executeQuery();
while (rsp.next()) {
String name = rsp.getString(1);
String refName = rsp.getString(2);
if (refName == null) {
directMatches.add(new WikiSearchResult(name, null));
} else {
indirectMatches.add(new WikiSearchResult(name, refName));
}
}
}
try (var stmt = connection.prepareStatement("SELECT NAME, REF_NAME FROM REF_WIKI_TITLE WHERE NAME_LOWER LIKE ? LIMIT 10")) {
stmt.setString(1, term.replace(' ', '_').replace("%", "\\%").toLowerCase() + "%"); // escape literal '%' so it is not treated as a LIKE wildcard
var rsp = stmt.executeQuery();
while (rsp.next()) {
String name = rsp.getString(1);
String refName = rsp.getString(2);
if (refName == null) {
directSearchMatches.add(new WikiSearchResult(name, null));
} else {
indirectMatches.add(new WikiSearchResult(name, refName));
}
}
}
}
catch (Exception ex) {
throw new RuntimeException(ex);
}
directMatches.forEach(indirectMatches::remove);
indirectMatches.removeAll(directSearchMatches);
directMatches.forEach(directSearchMatches::remove);
directMatches.addAll(indirectMatches);
directMatches.addAll(directSearchMatches);
return directMatches;
}
private String capitalizeWikiString(String string) {
if (string.contains("_")) {
return Arrays.stream(string.split("_")).map(this::capitalizeWikiString).collect(Collectors.joining("_"));
}
if (string.length() < 2) {
return string.toUpperCase();
}
return Character.toUpperCase(string.charAt(0)) + string.substring(1).toLowerCase();
}
}
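EncyclopediaDao needs only a HikariDataSource, so it can be exercised standalone. A minimal wiring sketch reusing DatabaseModule, the same provider the deleted ConverterMain used:

// Resolve a title against REF_WIKI_TITLE outside the service (sketch only).
var dao = new EncyclopediaDao(new DatabaseModule().provideConnection());
dao.resolveEncyclopediaRedirect("plato").ifPresent(System.out::println);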

View File

@@ -0,0 +1,26 @@
package nu.marginalia.wmsa.encyclopedia;
import com.google.inject.Guice;
import com.google.inject.Inject;
import com.google.inject.Injector;
import nu.marginalia.wmsa.configuration.MainClass;
import nu.marginalia.wmsa.configuration.ServiceDescriptor;
import nu.marginalia.wmsa.configuration.module.ConfigurationModule;
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
public class EncyclopediaMain extends MainClass {
private final EncyclopediaService service;
public static void main(String... args) {
init(ServiceDescriptor.ENCYCLOPEDIA, args);
Injector injector = Guice.createInjector(
new ConfigurationModule(),
new DatabaseModule()); // EncyclopediaDao injects a HikariDataSource
injector.getInstance(EncyclopediaMain.class);
}
@Inject
public EncyclopediaMain(EncyclopediaService service) {
this.service = service;
}
}

View File

@@ -0,0 +1,202 @@
package nu.marginalia.wmsa.encyclopedia;
import com.google.inject.Inject;
import com.google.inject.name.Named;
import lombok.SneakyThrows;
import nu.marginalia.wmsa.configuration.server.Context;
import nu.marginalia.wmsa.configuration.server.Initialization;
import nu.marginalia.wmsa.configuration.server.MetricsServer;
import nu.marginalia.wmsa.configuration.server.Service;
import nu.marginalia.wmsa.renderer.mustache.MustacheRenderer;
import nu.marginalia.wmsa.renderer.mustache.RendererFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import spark.Request;
import spark.Response;
import spark.Spark;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Map;
import java.util.Optional;
import java.util.stream.Collectors;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;
public class EncyclopediaService extends Service {
private static final Logger logger = LoggerFactory.getLogger(EncyclopediaService.class);
private final MustacheRenderer<String> wikiErrorPageRenderer;
private final MustacheRenderer<Object> wikiSearchResultRenderer;
private final Path wikiPath;
private final EncyclopediaDao encyclopediaDao;
@Inject
public EncyclopediaService(@Named("service-host") String ip,
@Named("service-port") Integer port,
@Named("wiki-path") Path wikiPath,
EncyclopediaDao encyclopediaDao,
RendererFactory rendererFactory,
Initialization initialization,
MetricsServer metricsServer)
throws IOException {
super(ip, port, initialization, metricsServer);
this.wikiPath = wikiPath;
this.encyclopediaDao = encyclopediaDao;
if (rendererFactory != null) {
wikiErrorPageRenderer = rendererFactory.renderer("encyclopedia/wiki-error");
wikiSearchResultRenderer = rendererFactory.renderer("encyclopedia/wiki-search");
}
else {
wikiErrorPageRenderer = null;
wikiSearchResultRenderer = null;
}
Spark.get("/public/wiki/*", this::getWikiPage);
Spark.get("/public/wiki-search", this::searchWikiPage);
Spark.get("/wiki/has", this::pathWikiHas);
Spark.post("/wiki/submit", this::pathWikiSubmit);
}
@SneakyThrows
private Object getWikiPage(Request req, Response rsp) {
final String[] splats = req.splat();
if (splats.length == 0) {
rsp.redirect("https://encyclopedia.marginalia.nu/wiki-start.html");
return ""; // redirect alone does not halt the route, and splats[0] below would throw
}
final String name = splats[0];
String pageName = encyclopediaDao.resolveEncyclopediaRedirect(name).orElse(name);
logger.info("Resolved {} -> {}", name, pageName);
return wikiGet(pageName)
.or(() -> resolveWikiPageNameWrongCase(name))
.orElseGet(() -> renderSearchPage(name));
}
private Optional<String> resolveWikiPageNameWrongCase(String name) {
var rsp = encyclopediaDao.findEncyclopediaPageDirect(name);
if (rsp.isEmpty()) {
return Optional.of(renderSearchPage(name));
}
name = rsp.get().getInternalName();
return wikiGet(name);
}
private String renderSearchPage(String s) {
return wikiSearchResultRenderer.render(
Map.of("query", s,
"error", "true",
"results", encyclopediaDao.findEncyclopediaPages(s)));
}
@SneakyThrows
private Object searchWikiPage(Request req, Response rsp) {
final var ctx = Context.fromRequest(req);
String term = req.queryParams("query");
if (null == term) {
rsp.redirect("https://encyclopedia.marginalia.nu/wiki-start.html");
return "";
}
return wikiSearchResultRenderer.render(
Map.of("query", term,
"results",
encyclopediaDao.findEncyclopediaPages(term))
);
}
private Path getWikiFilename(Path base, String url) {
Path p = base;
int urlHash = url.hashCode();
p = p.resolve(Integer.toString(urlHash & 0xFF));
p = p.resolve(Integer.toString((urlHash>>>8) & 0xFF));
p = p.resolve(Integer.toString((urlHash>>>16) & 0xFF));
p = p.resolve(Integer.toString((urlHash>>>24) & 0xFF));
String fileName = url.chars()
.mapToObj(this::encodeUrlChar)
.collect(Collectors.joining());
if (fileName.length() > 128) {
fileName = fileName.substring(0, 128) + (((long)urlHash)&0xFFFFFFFFL);
}
return p.resolve(fileName + ".gz");
}
private String encodeUrlChar(int i) {
if (i >= 'a' && i <= 'z') {
return Character.toString(i);
}
if (i >= 'A' && i <= 'Z') {
return Character.toString(i);
}
if (i >= '0' && i <= '9') {
return Character.toString(i);
}
if (i == '.') {
return Character.toString(i);
}
else {
return String.format("%%%02X", i); // zero-pad so e.g. 0x0A encodes as %0A, not "% A"
}
}
@SneakyThrows
private Object pathWikiHas(Request request, Response response) {
return Files.exists(getWikiFilename(wikiPath, request.queryParams("url")));
}
@SneakyThrows
private Optional<String> wikiGet(String name) {
var filename = getWikiFilename(wikiPath, name);
if (Files.exists(filename)) {
try (var stream = new GZIPInputStream(new FileInputStream(filename.toFile()))) {
return Optional.of(new String(stream.readAllBytes()));
}
} else {
return Optional.empty();
}
}
@SneakyThrows
private Object pathWikiSubmit(Request request, Response response) {
byte[] data = request.bodyAsBytes();
String wikiUrl = request.queryParams("url");
Path filename = getWikiFilename(wikiPath, wikiUrl);
Files.createDirectories(filename.getParent());
System.out.println(new String(data));
logger.debug("Writing {} to {}", wikiUrl, filename);
try (var gos = new GZIPOutputStream(new FileOutputStream(filename.toFile()))) {
gos.write(data);
gos.flush();
}
return "ok";
}
}

View File

@@ -50,8 +50,6 @@ class ServiceTest {
new DictionaryService(dataSource, new SpellChecker()),
new MathParser(),
new Units(new MathParser()),
null,
null,
new ScreenshotService(null), null);
Spark.awaitInitialization();

View File

@@ -1,72 +0,0 @@
package nu.marginalia.wmsa.edge.archive;
import lombok.SneakyThrows;
import nu.marginalia.wmsa.configuration.server.Context;
import nu.marginalia.wmsa.configuration.server.Initialization;
import nu.marginalia.wmsa.edge.archive.archiver.Archiver;
import nu.marginalia.wmsa.edge.archive.client.ArchiveClient;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.parallel.Execution;
import org.junit.jupiter.api.parallel.ExecutionMode;
import spark.Spark;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import static nu.marginalia.util.TestUtil.getPort;
import static nu.marginalia.util.test.TestUtil.clearTempDir;
@Execution(ExecutionMode.SAME_THREAD)
public class ArchiveTest {
static EdgeArchiveService service;
static final int testPort = getPort();
private static Path tempPath;
private static Path tempPath2;
private static ArchiveClient archiveClient;
private static Archiver archiver;
@BeforeAll
public static void setUpClass() throws IOException {
Spark.port(testPort);
System.setProperty("service-name", "edge-archive");
archiveClient = new ArchiveClient();
archiveClient.setServiceRoute("127.0.0.1", testPort);
tempPath = Files.createTempDirectory("archiveTest");
tempPath2 = Files.createTempDirectory("wikiTest");
archiver = new Archiver(tempPath, 10);
service = new EdgeArchiveService("127.0.0.1", testPort,
tempPath2, // the wiki path; tempPath is the archiver's directory
archiver,
new Initialization(), null);
Spark.awaitInitialization();
}
@AfterAll
public static void tearDown() throws Exception {
archiver.close();
archiveClient.close();
clearTempDir(tempPath);
clearTempDir(tempPath2);
}
@SneakyThrows
@Test
public void testWiki() {
var url = "Plato_(Disambiguation)";
Assertions.assertFalse(archiveClient.hasWiki(Context.internal(), url).blockingFirst());
archiveClient.submitWiki(Context.internal(), url, "<h1>Hello</h1>").blockingFirst();
Assertions.assertTrue(archiveClient.hasWiki(Context.internal(), url).blockingFirst());
Assertions.assertEquals("<h1>Hello</h1>", archiveClient.getWiki(Context.internal(), url).blockingFirst());
}
}

View File

@@ -1,17 +0,0 @@
package nu.marginalia.wmsa.edge.archive.archiver;
import org.junit.jupiter.api.*;
import java.nio.file.Path;
public class ArchiverTest {
@Test
public void testArchiver() throws Exception {
Archiver archiver = new Archiver(Path.of("/tmp/"), 3);
archiver.writeData(new ArchivedFile("file1", "Hey".getBytes()));
archiver.writeData(new ArchivedFile("file2", "Hey".getBytes()));
archiver.writeData(new ArchivedFile("file3", "Hey".getBytes()));
archiver.writeData(new ArchivedFile("file4", "Hey".getBytes()));
archiver.close();
}
}

View File

@@ -60,7 +60,6 @@ class AssistantTest {
new DictionaryService(dataSource, new SpellChecker()),
new MathParser(),
new Units(new MathParser()),
null, null,
new ScreenshotService(null), null);
Spark.awaitInitialization();
@@ -77,12 +76,6 @@ class AssistantTest {
Spark.awaitStop();
}
@Test
public void testEncyclopedia() {
var result = client.encyclopediaLookup(Context.internal(), "plato").blockingFirst();
System.out.println(result);
assertTrue(result.entries.size() >= 1);
}
@Test
public void testSpellCheck() {
var result = client.spellCheck(Context.internal(), "plato").blockingFirst();