WIP: Killing off the Archive service and adding a new Encyclopedia service, consisting largely of what Archive was plus a few features from Assistant.
parent 61ef2b06b0
commit e7b4ac0d34
@@ -6,11 +6,11 @@ import nu.marginalia.wmsa.configuration.command.Command;
import nu.marginalia.wmsa.configuration.command.ListCommand;
import nu.marginalia.wmsa.configuration.command.StartCommand;
import nu.marginalia.wmsa.configuration.command.VersionCommand;
import nu.marginalia.wmsa.edge.archive.EdgeArchiveMain;
import nu.marginalia.wmsa.edge.assistant.EdgeAssistantMain;
import nu.marginalia.wmsa.edge.dating.DatingMain;
import nu.marginalia.wmsa.edge.index.EdgeIndexMain;
import nu.marginalia.wmsa.edge.search.EdgeSearchMain;
import nu.marginalia.wmsa.encyclopedia.EncyclopediaMain;
import nu.marginalia.wmsa.memex.MemexMain;
import nu.marginalia.wmsa.podcasts.PodcastScraperMain;
import nu.marginalia.wmsa.renderer.RendererMain;
@@ -33,11 +33,12 @@ public enum ServiceDescriptor {

    EDGE_INDEX("edge-index", 5021, EdgeIndexMain.class),
    EDGE_SEARCH("edge-search", 5023, EdgeSearchMain.class),
    EDGE_ARCHIVE("edge-archive", 5024, EdgeArchiveMain.class),
    EDGE_ASSISTANT("edge-assistant", 5025, EdgeAssistantMain.class),

    EDGE_MEMEX("memex", 5030, MemexMain.class),

    ENCYCLOPEDIA("encyclopedia", 5040, EncyclopediaMain.class),

    DATING("dating", 5070, DatingMain.class),

    TEST_1("test-1", 0, null),
@@ -1,33 +0,0 @@
package nu.marginalia.wmsa.edge.archive;

import com.google.inject.Guice;
import com.google.inject.Inject;
import com.google.inject.Injector;
import nu.marginalia.wmsa.configuration.MainClass;
import nu.marginalia.wmsa.configuration.ServiceDescriptor;
import nu.marginalia.wmsa.configuration.module.ConfigurationModule;
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
import nu.marginalia.wmsa.configuration.server.Initialization;

public class EdgeArchiveMain extends MainClass {
    private final EdgeArchiveService service;

    @Inject
    public EdgeArchiveMain(EdgeArchiveService service) {
        this.service = service;
    }

    public static void main(String... args) {
        init(ServiceDescriptor.EDGE_ARCHIVE, args);

        Injector injector = Guice.createInjector(
                new EdgeArchiveModule(),
                new ConfigurationModule(),
                new DatabaseModule()
        );

        injector.getInstance(EdgeArchiveMain.class);
        injector.getInstance(Initialization.class).setReady();

    }
}
@@ -1,15 +0,0 @@
package nu.marginalia.wmsa.edge.archive;

import com.google.inject.AbstractModule;
import com.google.inject.name.Names;

import java.nio.file.Path;

public class EdgeArchiveModule extends AbstractModule {
    public void configure() {
        bind(Path.class).annotatedWith(Names.named("archive-path")).toInstance(Path.of("/var/lib/wmsa/archive/webpage/"));
        bind(Path.class).annotatedWith(Names.named("wiki-path")).toInstance(Path.of("/var/lib/wmsa/archive.fast/wiki/"));
        bind(Integer.class).annotatedWith(Names.named("archive-size")).toInstance(10_000);
    }

}
@@ -1,180 +0,0 @@
package nu.marginalia.wmsa.edge.archive;

import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import com.google.inject.Inject;
import com.google.inject.name.Named;
import io.prometheus.client.Histogram;
import lombok.SneakyThrows;
import nu.marginalia.wmsa.configuration.server.Initialization;
import nu.marginalia.wmsa.configuration.server.MetricsServer;
import nu.marginalia.wmsa.configuration.server.Service;
import nu.marginalia.wmsa.edge.archive.archiver.ArchivedFile;
import nu.marginalia.wmsa.edge.archive.archiver.Archiver;
import nu.marginalia.wmsa.edge.archive.request.EdgeArchiveSubmissionReq;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import spark.Request;
import spark.Response;
import spark.Spark;

import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.stream.Collectors;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;

public class EdgeArchiveService extends Service {

    private final Logger logger = LoggerFactory.getLogger(getClass());
    private final Gson gson = new GsonBuilder().create();

    private static final Histogram wmsa_archive_store_time = Histogram.build().name("wmsa_archive_store_time").help("-").register();
    private static final Histogram wmsa_archive_fetch_time = Histogram.build().name("wmsa_archive_fetch_time").help("-").register();

    private final Path wikiPath;
    private final Archiver archiver;

    @SneakyThrows
    @Inject
    public EdgeArchiveService(@Named("service-host") String ip,
                              @Named("service-port") Integer port,
                              @Named("wiki-path") Path wikiPath,
                              Archiver archiver,
                              Initialization initialization,
                              MetricsServer metricsServer)
    {
        super(ip, port, initialization, metricsServer);
        this.wikiPath = wikiPath;
        this.archiver = archiver;

        Spark.staticFiles.expireTime(600);

        Spark.post("/page/submit", this::pathPageSubmit);

        Spark.post("/wiki/submit", this::pathWikiSubmit);
        Spark.get("/wiki/has", this::pathWikiHas);
        Spark.get("/wiki/get", this::pathWikiGet);

        Spark.awaitInitialization();
    }

    @SneakyThrows
    private Object pathPageSubmit(Request request, Response response) {
        var timer = wmsa_archive_store_time.startTimer();
        try {
            var body = request.body();
            var data = gson.fromJson(body, EdgeArchiveSubmissionReq.class);

            String domainNamePart = data.getUrl().domain.domain.length() > 32 ? data.getUrl().domain.domain.substring(0, 32) : data.getUrl().domain.domain;
            String fileName = String.format("%s-%10d", domainNamePart, data.getUrl().hashCode());

            archiver.writeData(new ArchivedFile(fileName, body.getBytes()));

            return "ok";
        } finally {
            timer.observeDuration();
        }

    }


    @SneakyThrows
    private Object pathWikiSubmit(Request request, Response response) {
        var timer = wmsa_archive_store_time.startTimer();

        try {
            byte[] data = request.bodyAsBytes();

            String wikiUrl = request.queryParams("url");
            Path filename = getWikiFilename(wikiPath, wikiUrl);

            Files.createDirectories(filename.getParent());

            System.out.println(new String(data));
            logger.debug("Writing {} to {}", wikiUrl, filename);

            try (var gos = new GZIPOutputStream(new FileOutputStream(filename.toFile()))) {
                gos.write(data);
                gos.flush();
            }

            return "ok";
        } finally {
            timer.observeDuration();
        }

    }


    private Path getWikiFilename(Path base, String url) {
        Path p = base;

        int urlHash = url.hashCode();

        p = p.resolve(Integer.toString(urlHash & 0xFF));
        p = p.resolve(Integer.toString((urlHash>>>8) & 0xFF));
        p = p.resolve(Integer.toString((urlHash>>>16) & 0xFF));
        p = p.resolve(Integer.toString((urlHash>>>24) & 0xFF));

        String fileName = url.chars()
                .mapToObj(this::encodeUrlChar)
                .collect(Collectors.joining());

        if (fileName.length() > 128) {
            fileName = fileName.substring(0, 128) + (((long)urlHash)&0xFFFFFFFFL);
        }

        return p.resolve(fileName + ".gz");
    }


    private String encodeUrlChar(int i) {
        if (i >= 'a' && i <= 'z') {
            return Character.toString(i);
        }
        if (i >= 'A' && i <= 'Z') {
            return Character.toString(i);
        }
        if (i >= '0' && i <= '9') {
            return Character.toString(i);
        }
        if (i == '.') {
            return Character.toString(i);
        }
        else {
            return String.format("%%%2X", i);
        }
    }

    @SneakyThrows
    private Object pathWikiHas(Request request, Response response) {
        return Files.exists(getWikiFilename(wikiPath, request.queryParams("url")));
    }


    @SneakyThrows
    private String pathWikiGet(Request request, Response response) {
        var timer = wmsa_archive_fetch_time.startTimer();

        try {
            String url = request.queryParams("url");

            var filename = getWikiFilename(wikiPath, url);

            if (Files.exists(filename)) {
                try (var stream = new GZIPInputStream(new FileInputStream(filename.toFile()))) {
                    return new String(stream.readAllBytes());
                }
            } else {
                Spark.halt(404);
                return null;
            }
        }
        finally {
            timer.observeDuration();
        }
    }
}
@@ -1,65 +0,0 @@
package nu.marginalia.wmsa.edge.archive.archiver;

import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import nu.marginalia.wmsa.edge.archive.request.EdgeArchiveSubmissionReq;
import nu.marginalia.wmsa.edge.model.crawl.EdgeRawPageContents;
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.BufferedInputStream;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.function.Consumer;

public class ArchiveExtractor {
    private final Path archivePath;
    private final String arhivePattern = "archive-%04d.tar.gz";

    private final Logger logger = LoggerFactory.getLogger(getClass());
    private final Gson gson = new GsonBuilder().create();

    public ArchiveExtractor(Path archivePath) {
        this.archivePath = archivePath;

    }

    public void forEach(Consumer<EdgeRawPageContents> contents) {
        for (int i = 0; ; ++i) {
            var fn = getArchiveFile(i);
            logger.info("{}", fn);
            if (!Files.exists(fn)) {
                break;
            }
            try (var stream = new TarArchiveInputStream(new GzipCompressorInputStream(new BufferedInputStream(new FileInputStream(fn.toFile()))))) {
                TarArchiveEntry entry;
                while ((entry = stream.getNextTarEntry()) != null) {
                    if (entry.isFile()) {
                        try {
                            var obj = gson.fromJson(new InputStreamReader(stream), EdgeArchiveSubmissionReq.class);
                            if (obj != null) {
                                contents.accept(obj.getData());
                            }
                        }
                        catch (Exception ex) {
                            logger.error("Could not unpack {} - {} {}", entry.getName(), ex.getClass().getSimpleName(), ex.getMessage());
                        }
                    }
                }
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }

    private Path getArchiveFile(int number) {
        final String fileName = String.format(arhivePattern, number);
        return archivePath.resolve(fileName);
    }
}
@@ -1,5 +0,0 @@
package nu.marginalia.wmsa.edge.archive.archiver;


public record ArchivedFile(String filename,byte[] data ) {
}
@@ -1,113 +0,0 @@
package nu.marginalia.wmsa.edge.archive.archiver;

import com.google.inject.name.Named;
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
import org.apache.commons.compress.archivers.tar.TarArchiveOutputStream;
import org.apache.commons.compress.compressors.gzip.GzipCompressorOutputStream;
import org.apache.commons.io.IOUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import javax.inject.Inject;
import javax.inject.Singleton;
import java.io.ByteArrayInputStream;
import java.io.FileOutputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.concurrent.LinkedBlockingDeque;
import java.util.concurrent.TimeUnit;

@Singleton
public class Archiver implements AutoCloseable {
    private final Path archivePath;
    private final int filesPerArchive;
    private final String arhivePattern = "archive-%04d.tar.gz";

    private final LinkedBlockingDeque<ArchivedFile> writeQueue = new LinkedBlockingDeque<>(10);
    private final Thread writeThread;

    private volatile int archiveNumber;
    private volatile boolean running;

    private final Logger logger = LoggerFactory.getLogger(getClass());

    @Inject
    public Archiver(@Named("archive-path") Path archivePath, @Named("archive-size") Integer filesPerArchive) {
        this.archivePath = archivePath;
        this.filesPerArchive = filesPerArchive;

        if (!Files.exists(archivePath)) {
            throw new IllegalArgumentException("Archive path does not exist");
        }
        for (int i = 0;; ++i) {
            if (!Files.exists(getArchiveFile(i))) {
                archiveNumber = i;
                break;
            }
        }

        running = true;
        writeThread = new Thread(this::writeThreadMain, "ArchiveWriteThread");
        writeThread.start();
    }

    private Path getArchiveFile(int number) {
        final String fileName = String.format(arhivePattern, number);
        return archivePath.resolve(fileName);
    }

    public void writeData(ArchivedFile file) throws InterruptedException {
        if (!running) throw new IllegalStateException("Archiver is closing or closed");
        writeQueue.put(file);
    }

    private void writeThreadMain() {
        try {
            while (running || !writeQueue.isEmpty()) {
                writeToFile(archiveNumber);
                archiveNumber++;
            }
            running = false;
        }
        catch (Exception ex) {
            logger.error("Uncaught exception in writer thread!!");
        }
    }

    private void writeToFile(int archiveNumber) {
        var archiveFile = getArchiveFile(archiveNumber);

        logger.info("Switching to file {}", archiveFile);

        try (TarArchiveOutputStream taos = new TarArchiveOutputStream(new GzipCompressorOutputStream(new FileOutputStream(archiveFile.toFile())))) {
            for (int i = 0; i < filesPerArchive; i++) {

                ArchivedFile writeJob = null;
                while (writeJob == null) {
                    writeJob = writeQueue.poll(1, TimeUnit.SECONDS);
                    if (!running) return;
                }

                var entry = new TarArchiveEntry(String.format("%06d-%s", i, writeJob.filename()));
                entry.setSize(writeJob.data().length);
                taos.putArchiveEntry(entry);
                logger.debug("Writing {} to {}", writeJob.filename(), archiveFile);
                try (var bais = new ByteArrayInputStream(writeJob.data())) {
                    IOUtils.copy(bais, taos);
                }
                taos.closeArchiveEntry();
            }
            taos.finish();
            logger.debug("Finishing {}", archiveFile);
        } catch (Exception e) {
            logger.error("Error", e);
        }

    }

    @Override
    public void close() throws Exception {
        running = false;
        writeThread.join();
    }
}
@@ -1,56 +0,0 @@
package nu.marginalia.wmsa.edge.archive.client;

import com.google.inject.Inject;
import com.google.inject.Singleton;
import io.reactivex.rxjava3.core.Observable;
import nu.marginalia.wmsa.client.AbstractDynamicClient;
import nu.marginalia.wmsa.client.HttpStatusCode;
import nu.marginalia.wmsa.configuration.ServiceDescriptor;
import nu.marginalia.wmsa.configuration.server.Context;
import nu.marginalia.wmsa.edge.archive.request.EdgeArchiveSubmissionReq;
import nu.marginalia.wmsa.edge.model.crawl.EdgeRawPageContents;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import okhttp3.MediaType;
import org.eclipse.jetty.util.UrlEncoded;

import javax.annotation.CheckReturnValue;
import java.util.concurrent.Semaphore;

@Singleton
public class ArchiveClient extends AbstractDynamicClient {

    private final Semaphore submitPageSem = new Semaphore(3, true);

    @Inject
    public ArchiveClient() {
        super(ServiceDescriptor.EDGE_ARCHIVE);
    }

    @CheckReturnValue
    public void submitPage(Context ctx, EdgeUrl url, EdgeRawPageContents data) throws InterruptedException {
        try {
            submitPageSem.acquire();
            super.post(ctx, "/page/submit", new EdgeArchiveSubmissionReq(url, data)).blockingSubscribe();
        }
        finally {
            submitPageSem.release();
        }

    }

    @CheckReturnValue
    public Observable<HttpStatusCode> submitWiki(Context ctx, String url, String data) {
        return super.post(ctx, "/wiki/submit?url="+UrlEncoded.encodeString(url), data, MediaType.parse("text/plain; charset=UTF-8"));
    }

    @CheckReturnValue
    public Observable<Boolean> hasWiki(Context ctx, String url) {
        return super.get(ctx, "/wiki/has?url="+UrlEncoded.encodeString(url), Boolean.class);
    }

    @CheckReturnValue
    public Observable<String> getWiki(Context ctx, String url) {
        return super.get(ctx, "/wiki/get?url="+UrlEncoded.encodeString(url));
    }

}
@@ -1,13 +0,0 @@
package nu.marginalia.wmsa.edge.archive.request;

import lombok.AllArgsConstructor;
import lombok.Getter;
import lombok.ToString;
import nu.marginalia.wmsa.edge.model.crawl.EdgeRawPageContents;
import nu.marginalia.wmsa.edge.model.EdgeUrl;

@AllArgsConstructor @Getter @ToString
public class EdgeArchiveSubmissionReq {
    EdgeUrl url;
    EdgeRawPageContents data;
}
@@ -4,36 +4,27 @@ import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import com.google.inject.Inject;
import com.google.inject.name.Named;
import io.reactivex.rxjava3.core.Observable;
import lombok.SneakyThrows;
import nu.marginalia.wmsa.configuration.server.*;
import nu.marginalia.wmsa.edge.archive.client.ArchiveClient;
import nu.marginalia.wmsa.configuration.server.Initialization;
import nu.marginalia.wmsa.configuration.server.MetricsServer;
import nu.marginalia.wmsa.configuration.server.Service;
import nu.marginalia.wmsa.edge.assistant.dict.DictionaryService;
import nu.marginalia.wmsa.edge.assistant.eval.MathParser;
import nu.marginalia.wmsa.edge.assistant.eval.Units;
import nu.marginalia.wmsa.edge.assistant.screenshot.ScreenshotService;
import nu.marginalia.wmsa.edge.assistant.suggest.Suggestions;
import nu.marginalia.wmsa.renderer.mustache.MustacheRenderer;
import nu.marginalia.wmsa.renderer.mustache.RendererFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import spark.Request;
import spark.Response;
import spark.Spark;

import java.util.Map;

public class EdgeAssistantService extends Service {

    private final Logger logger = LoggerFactory.getLogger(getClass());
    private final Gson gson = new GsonBuilder().create();
    private final Units units;
    private final DictionaryService dictionaryService;
    private final MathParser mathParser;
    private final ArchiveClient archiveClient;
    private final ScreenshotService screenshotService;
    private final MustacheRenderer<String> wikiErrorPageRenderer;
    private final MustacheRenderer<Object> wikiSearchResultRenderer;
    private final Suggestions suggestions;

    @SneakyThrows
@@ -45,40 +36,22 @@ public class EdgeAssistantService extends Service {
                                DictionaryService dictionaryService,
                                MathParser mathParser,
                                Units units,
                                ArchiveClient archiveClient,
                                RendererFactory rendererFactory,
                                ScreenshotService screenshotService,
                                Suggestions suggestions
                                )
    {
        super(ip, port, initialization, metricsServer);
        this.dictionaryService = dictionaryService;
        this.mathParser = mathParser;
        this.units = units;
        this.archiveClient = archiveClient;
        this.screenshotService = screenshotService;
        this.suggestions = suggestions;

        Spark.staticFiles.expireTime(600);

        if (rendererFactory != null) {
            wikiErrorPageRenderer = rendererFactory.renderer("encyclopedia/wiki-error");
            wikiSearchResultRenderer = rendererFactory.renderer("encyclopedia/wiki-search");
        }
        else {
            wikiErrorPageRenderer = null;
            wikiSearchResultRenderer = null;
        }

        Spark.get("/public/wiki/*", this::getWikiPage);
        Spark.get("/public/wiki-search", this::searchWikiPage);

        Spark.get("/public/screenshot/:id", screenshotService::serveScreenshotRequest);
        Spark.get("/screenshot/:id", screenshotService::serveScreenshotRequest);

        Spark.get("/dictionary/:word", (req, rsp) -> dictionaryService.define(req.params("word")), this::convertToJson);
        Spark.get("/spell-check/:term", (req, rsp) -> dictionaryService.spellCheck(req.params("term").toLowerCase()), this::convertToJson);
        Spark.get("/encyclopedia/:term", (req, rsp) -> dictionaryService.encyclopedia(req.params("term")), this::convertToJson);
        Spark.get("/unit-conversion", (req, rsp) -> unitConversion(
                rsp,
                req.queryParams("value"),
@@ -106,57 +79,6 @@ public class EdgeAssistantService extends Service {
        return suggestions.getSuggestions(10, param);
    }

    @SneakyThrows
    private Object getWikiPage(Request req, Response rsp) {
        final var ctx = Context.fromRequest(req);

        final String[] splats = req.splat();
        if (splats.length == 0)
            rsp.redirect("https://encyclopedia.marginalia.nu/wiki-start.html");


        final String s = splats[0];

        String pageName = dictionaryService.resolveEncylopediaRedirect(s).orElse(s);
        logger.info("Resolved {} -> {}", s, pageName);
        return archiveClient.getWiki(ctx, pageName)
                .onErrorResumeWith(resolveWikiPageNameWrongCase(ctx, s))
                .blockingFirst();
    }

    private Observable<String> resolveWikiPageNameWrongCase(Context ctx, String s) {
        var rsp = dictionaryService.findEncyclopediaPageDirect(s);
        if (rsp.isEmpty()) {
            return renderSearchPage(s);
        }
        return archiveClient.getWiki(ctx, rsp.get().getInternalName())
                .onErrorResumeWith(renderSearchPage(s));
    }

    private Observable<String> renderSearchPage(String s) {
        return Observable.fromCallable(() -> wikiSearchResultRenderer.render(
                Map.of("query", s,
                        "error", "true",
                        "results", dictionaryService.findEncyclopediaPages(s))));
    }

    @SneakyThrows
    private Object searchWikiPage(Request req, Response rsp) {
        final var ctx = Context.fromRequest(req);

        String term = req.queryParams("query");
        if (null == term) {
            rsp.redirect("https://encyclopedia.marginalia.nu/wiki-start.html");
            return "";
        }

        return wikiSearchResultRenderer.render(
                Map.of("query", term,
                        "results",
                        dictionaryService.findEncyclopediaPages(term))
        );
    }

    private Object evalExpression(Response rsp, String value) {
        try {
            var val = mathParser.evalFormatted(value);
@@ -24,10 +24,6 @@ public class AssistantClient extends AbstractDynamicClient {
        return super.get(ctx,"/dictionary/" + UrlEncoded.encodeString(word), DictionaryResponse.class);
    }

    public Observable<WikiArticles> encyclopediaLookup(Context ctx, String word) {
        return super.get(ctx,"/encyclopedia/" + UrlEncoded.encodeString(word), WikiArticles.class);
    }

    @SuppressWarnings("unchecked")
    public Observable<List<String>> spellCheck(Context ctx, String word) {
        return (Observable<List<String>>) (Object) super.get(ctx,"/spell-check/" + UrlEncoded.encodeString(word), List.class);
@@ -43,142 +43,6 @@ public class DictionaryService {
        return response;
    }

    public WikiArticles encyclopedia(String term) {
        WikiArticles response = new WikiArticles();
        response.entries = new ArrayList<>();

        try (var connection = dataSource.getConnection()) {
            var stmt = connection.prepareStatement("SELECT DISTINCT(NAME_LOWER) FROM REF_WIKI_TITLE WHERE NAME_LOWER=?");
            stmt.setString(1, term);

            var rsp = stmt.executeQuery();
            while (rsp.next()) {
                response.entries.add(capitalizeWikiString(rsp.getString(1)));
            }
        }
        catch (Exception ex) {
            logger.error("Failed to fetch articles", ex);
            return new WikiArticles();
        }

        return response;
    }

    public Optional<String> resolveEncylopediaRedirect(String term) {
        final List<String> matches = new ArrayList<>();

        try (var connection = dataSource.getConnection()) {
            try (var stmt = connection.prepareStatement("SELECT NAME, REF_NAME FROM REF_WIKI_TITLE WHERE NAME_LOWER=LOWER(?)")) {
                stmt.setString(1, term);

                var rsp = stmt.executeQuery();
                while (rsp.next()) {
                    if (term.equals(rsp.getString(1))
                            || rsp.getString(2) == null) {
                        return Optional.ofNullable(rsp.getString(2));
                    } else {
                        matches.add(rsp.getString(2));
                    }
                }
            }
        }
        catch (Exception ex) {
            throw new RuntimeException(ex);
        }

        if (!matches.isEmpty()) {
            return Optional.of(matches.get(0));
        }
        return Optional.empty();
    }


    public Optional<WikiSearchResult> findEncyclopediaPageDirect(String term) {

        try (var connection = dataSource.getConnection()) {

            try (var stmt = connection.prepareStatement("SELECT NAME, REF_NAME FROM REF_WIKI_TITLE WHERE NAME_LOWER=LOWER(?)")) {
                stmt.setString(1, term.replace(' ', '_'));

                var rsp = stmt.executeQuery();
                while (rsp.next()) {
                    String name = rsp.getString(1);
                    String refName = rsp.getString(2);

                    if (refName == null) {
                        return Optional.of(new WikiSearchResult(name, null));
                    }
                }
            }
        }
        catch (Exception ex) {
            throw new RuntimeException(ex);
        }

        return Optional.empty();
    }

    public List<WikiSearchResult> findEncyclopediaPages(String term) {
        final List<WikiSearchResult> directMatches = new ArrayList<>();
        final Set<WikiSearchResult> directSearchMatches = new HashSet<>();
        final Set<WikiSearchResult> indirectMatches = new HashSet<>();

        try (var connection = dataSource.getConnection()) {

            try (var stmt = connection.prepareStatement("SELECT NAME, REF_NAME FROM REF_WIKI_TITLE WHERE NAME_LOWER=LOWER(?)")) {
                stmt.setString(1, term.replace(' ', '_'));

                var rsp = stmt.executeQuery();
                while (rsp.next()) {
                    String name = rsp.getString(1);
                    String refName = rsp.getString(2);

                    if (refName == null) {
                        directMatches.add(new WikiSearchResult(name, null));
                    } else {
                        indirectMatches.add(new WikiSearchResult(name, refName));
                    }
                }
            }

            try (var stmt = connection.prepareStatement("SELECT NAME, REF_NAME FROM REF_WIKI_TITLE WHERE NAME_LOWER LIKE ? LIMIT 10")) {
                stmt.setString(1, term.replace(' ', '_').replaceAll("%", "\\%").toLowerCase() + "%");

                var rsp = stmt.executeQuery();
                while (rsp.next()) {
                    String name = rsp.getString(1);
                    String refName = rsp.getString(2);

                    if (refName == null) {
                        directSearchMatches.add(new WikiSearchResult(name, null));
                    } else {
                        indirectMatches.add(new WikiSearchResult(name, refName));
                    }
                }
            }
        }
        catch (Exception ex) {
            throw new RuntimeException(ex);
        }

        directMatches.forEach(indirectMatches::remove);
        indirectMatches.removeAll(directSearchMatches);
        directMatches.forEach(directSearchMatches::remove);
        directMatches.addAll(indirectMatches);
        directMatches.addAll(directSearchMatches);
        return directMatches;
    }

    private String capitalizeWikiString(String string) {
        if (string.contains("_")) {
            return Arrays.stream(string.split("_")).map(this::capitalizeWikiString).collect(Collectors.joining("_"));
        }
        if (string.length() < 2) {
            return string.toUpperCase();
        }
        return Character.toUpperCase(string.charAt(0)) + string.substring(1).toLowerCase();
    }

    public List<String> spellCheck(String word) {
        return spellChecker.correct(word);
    }
@@ -20,6 +20,7 @@ import nu.marginalia.wmsa.edge.search.results.SearchResultValuator;
import nu.marginalia.wmsa.edge.search.results.model.AccumulatedQueryResults;
import nu.marginalia.wmsa.edge.search.results.SearchResultDecorator;
import nu.marginalia.wmsa.edge.search.results.UrlDeduplicator;
import nu.marginalia.wmsa.encyclopedia.EncyclopediaClient;
import org.apache.logging.log4j.util.Strings;
import org.jetbrains.annotations.NotNull;
import org.slf4j.Logger;
@@ -33,6 +34,7 @@ public class EdgeSearchOperator {

    private static final Logger logger = LoggerFactory.getLogger(EdgeSearchOperator.class);
    private final AssistantClient assistantClient;
    private final EncyclopediaClient encyclopediaClient;
    private final EdgeDataStoreDao edgeDataStoreDao;
    private final EdgeIndexClient indexClient;
    private final QueryFactory queryFactory;
@@ -42,6 +44,7 @@ public class EdgeSearchOperator {

    @Inject
    public EdgeSearchOperator(AssistantClient assistantClient,
                              EncyclopediaClient encyclopediaClient,
                              EdgeDataStoreDao edgeDataStoreDao,
                              EdgeIndexClient indexClient,
                              QueryFactory queryFactory,
@@ -50,6 +53,7 @@ public class EdgeSearchOperator {
                              ) {

        this.assistantClient = assistantClient;
        this.encyclopediaClient = encyclopediaClient;
        this.edgeDataStoreDao = edgeDataStoreDao;
        this.indexClient = indexClient;
        this.queryFactory = queryFactory;
@@ -220,7 +224,7 @@ public class EdgeSearchOperator {

    @NotNull
    private Observable<WikiArticles> getWikiArticle(Context ctx, String humanQuery) {
        return assistantClient
        return encyclopediaClient
                .encyclopediaLookup(ctx,
                        humanQuery.replaceAll("\\s+", "_")
                                .replaceAll("\"", "")
@@ -1,384 +0,0 @@
package nu.marginalia.wmsa.edge.tools;


import com.zaxxer.hikari.HikariDataSource;
import gnu.trove.map.hash.TIntObjectHashMap;
import gnu.trove.map.hash.TObjectIntHashMap;
import lombok.AllArgsConstructor;
import lombok.SneakyThrows;
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
import nu.marginalia.wmsa.edge.archive.archiver.ArchiveExtractor;
import nu.marginalia.wmsa.edge.assistant.dict.NGramDict;
import nu.marginalia.wmsa.edge.converting.processor.logic.LinkParser;
import nu.marginalia.util.language.LanguageFilter;
import nu.marginalia.util.language.WordPatterns;
import nu.marginalia.util.language.conf.LanguageModels;
import nu.marginalia.util.language.processing.DocumentKeywordExtractor;
import nu.marginalia.util.language.processing.SentenceExtractor;
import nu.marginalia.wmsa.edge.converting.processor.logic.HtmlFeature;
import nu.marginalia.wmsa.edge.converting.processor.logic.HtmlStandardExtractor;
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
import nu.marginalia.wmsa.edge.index.service.dictionary.DictionaryWriter;
import nu.marginalia.wmsa.edge.index.service.index.SearchIndexWriterImpl;
import nu.marginalia.wmsa.edge.model.*;
import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard;
import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet;
import nu.marginalia.wmsa.edge.model.crawl.EdgeRawPageContents;
import org.apache.commons.lang3.tuple.Pair;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.mariadb.jdbc.Driver;

import java.io.File;
import java.nio.file.Path;
import java.sql.SQLException;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.LinkedBlockingQueue;

import static nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard.UNKNOWN;

public class ConverterMain {
    static final LinkedBlockingQueue<EdgeRawPageContents> processQueue = new LinkedBlockingQueue<>(20);
    static final LinkedBlockingQueue<UploadJob> uploadQueue = new LinkedBlockingQueue<>(2);

    static final TObjectIntHashMap<String> urlToIdMap = new TObjectIntHashMap<>(50_000_000, 0.5f, -1);
    static final TObjectIntHashMap<String> domainToIdMap = new TObjectIntHashMap<>(5_000_000, 0.5f, -1);
    static final TIntObjectHashMap<String> idToDomainMap = new TIntObjectHashMap<>(5_000_000, 0.5f, -1);
    static HikariDataSource conn;

    private static SearchIndexWriterImpl indexWriter;
    private static DictionaryWriter dictionaryWriter;

    @AllArgsConstructor
    static class UploadJob {
        EdgeId<EdgeDomain> domainId;
        EdgeId<EdgeUrl> urlId;
        EdgePageWordSet words;
        int wordCount;
    }

    static volatile boolean running = true;

    public static void main(String... args) {
        org.mariadb.jdbc.Driver driver = new Driver();

        dictionaryWriter = new DictionaryWriter(new File(args[0]), 1L << 30, true);
        indexWriter = new SearchIndexWriterImpl(dictionaryWriter, new File(args[1]));

        new Thread(ConverterMain::uploadThread, "Uploader").start();

        for (int i = 0; i < 24; i++) {
            new Thread(ConverterMain::processorThread, "Processor-"+i).start();
        }

        conn = new DatabaseModule().provideConnection();

        System.out.println("Loading URLs and domains");
        try (var c = conn.getConnection();
             var getUrlsStmt = c.prepareStatement("SELECT EC_URL.ID, DOMAIN_ID, PROTO, URL FROM EC_URL WHERE VISITED");
             var getDomainsStmt = c.prepareStatement("SELECT ID, URL_PART FROM EC_DOMAIN WHERE INDEXED>0")
        ) {
            getUrlsStmt.setFetchSize(10_000);
            getDomainsStmt.setFetchSize(10_000);

            System.out.println("Fetch domains");
            var domainRsp = getDomainsStmt.executeQuery();
            while (domainRsp.next()) {
                domainToIdMap.put(domainRsp.getString(2), domainRsp.getInt(1));
                idToDomainMap.put(domainRsp.getInt(1), domainRsp.getString(2));
            }

            System.out.println("Fetch URLs");
            var urlRsp = getUrlsStmt.executeQuery();
            while (urlRsp.next()) {
                String urlStr = urlRsp.getString(3) + "://" + idToDomainMap.get(urlRsp.getInt(2)) + urlRsp.getString(4);
                urlToIdMap.put(urlStr, urlRsp.getInt(1));
            }
        } catch (SQLException throwables) {
            throwables.printStackTrace();
        }

//        new Thread(ConverterMain::uploadThread, "Uploader").start();
//
//        for (int i = 0; i < 24; i++) {
//            new Thread(ConverterMain::processorThread, "Processor-"+i).start();
//        }

        System.out.println("Loaded URLs and domains");

        new ArchiveExtractor(Path.of(args[2])).forEach(
                page -> {
                    if (page.contentType.contentType.startsWith("application/xhtml")
                            || page.contentType.contentType.startsWith("text/html")) {
                        try {
                            int domainId = domainToIdMap.get(page.url.domain.toString());
                            if (domainId >= 0 && page.redirectUrl == null) {
                                int urlId = urlToIdMap.get(page.url.toString());
                                int dataHash = page.data.hashCode();
                                try (var c = conn.getConnection();
                                     var updateHash = c.prepareStatement("UPDATE EC_URL SET DATA_HASH=? WHERE ID=?"))
                                {
                                    updateHash.setInt(1, dataHash);
                                    updateHash.setInt(2, urlId);
                                    updateHash.executeUpdate();
                                }
                                catch (Exception ex) {
                                    ex.printStackTrace();
                                }
                            }
                        } catch (Exception e) {
                            e.printStackTrace();
                        }
                    }
                });

        running = false;
    }

    static final LanguageModels lm = new LanguageModels(
            Path.of("/var/lib/wmsa/model/ngrams-generous-emstr.bin"),
            Path.of("/var/lib/wmsa/model/tfreq-new-algo3.bin"),
            Path.of("/var/lib/wmsa/model/opennlp-sentence.bin"),
            Path.of("/var/lib/wmsa/model/English.RDR"),
            Path.of("/var/lib/wmsa/model/English.DICT"),
            Path.of("/var/lib/wmsa/model/opennlp-tok.bin")
    );
    static final NGramDict dict = new NGramDict(lm);

    private static final LanguageFilter languageFilter = new LanguageFilter();
    private static final LinkParser linkParser = new LinkParser();
    public static void processorThread() {
        SentenceExtractor newSe = new SentenceExtractor(lm);
        DocumentKeywordExtractor documentKeywordExtractor = new DocumentKeywordExtractor(dict);

        try {
            while (running || !processQueue.isEmpty()) {
                var job = processQueue.take();
                if (job.data.length() > 512*1024) {
                    System.out.println(job.url + " too big, skipping");
                }

                var parsed = Jsoup.parse(job.data);
                var text = parsed.text();

                if (languageFilter.isBlockedUnicodeRange(text)) {
                    continue;
                }

                var dld = newSe.extractSentences(parsed.clone());
                var keywords = documentKeywordExtractor.extractKeywords(dld);
                int wc = dld.totalNumWords();

                if (wc > 100) {
                    double languageAgreement = languageFilter.dictionaryAgreement(dld);
                    if (languageAgreement < 0.05) {
                        continue;
                    }
                }


                EdgeHtmlStandard htmlStandard = HtmlStandardExtractor.parseDocType(parsed.documentType());
                if (UNKNOWN.equals(htmlStandard)) {
                    htmlStandard = HtmlStandardExtractor.sniffHtmlStandard(parsed);
                }

                int scriptTags = getScriptPenalty(parsed);
                var featureSet = getFeatureSet(parsed, scriptTags, job.hasCookies);
                addTags(keywords, htmlStandard, job.url, featureSet);

                extractLinkWords(keywords, job.getUrl(), parsed);

                uploadQueue.put(new UploadJob(
                        new EdgeId<>(domainToIdMap.get(job.url.domain.toString())),
                        new EdgeId<>(urlToIdMap.get(job.url.toString())),
                        keywords,
                        0
                ));

            }
        }
        catch (InterruptedException ex) {
            ex.printStackTrace();
        }
    }


    private static Map<EdgeUrl, Set<String>> extractLinkWords(EdgePageWordSet keywords, EdgeUrl pageUrl, Document parsed) {

        List<Pair<EdgeUrl, String>> urls = new ArrayList<>();
        Set<String> linkKeywords = new HashSet<>();
        Map<EdgeUrl, Set<String>> linkTextWords = new ConcurrentHashMap<>();

        for (var tag : parsed.getElementsByTag("a")) {
            if (!tag.hasAttr("href")) {
                continue;
            }
            if (urls.size() > 100) {
                break;
            }

            var linkOpt = linkParser.parseLink(pageUrl, tag);
            if (linkOpt.isEmpty())
                continue;

            var link = linkOpt.get();

            urls.add(Pair.of(link, tag.text()));

            if (!Objects.equals(link.domain.domain, pageUrl.domain.domain)
                    && linkKeywords.size() <= 25)
            {
                linkKeywords.add("links:" + link.domain.domain);
            }
//
//            Set<String> words = new HashSet<>();
//
//            for (var sent : sentenceExtractor.extractSentencesFromString(tag.text())) {
//                for (var keyword : keywordExtractor.getWordsFromSentence(sent)) {
//                    words.add(sent.constructWordFromSpan(keyword));
//                }
//            }
//
//            linkTextWords.compute(link, (k, set) -> {
//                if (set == null) return words;
//                else { set.addAll(words); return set; }
//            });

        }

        keywords.get(IndexBlock.Meta).addAll(linkKeywords);

        if (WordPatterns.wordQualitiesPredicate.test(pageUrl.domain.domain.toLowerCase())) {
            keywords.get(IndexBlock.Link).addJust(pageUrl.domain.domain.toLowerCase());
        }

        return linkTextWords;
    }

    private static int getScriptPenalty(Document parsed) {
        var scriptTags = parsed.getElementsByTag("script");
        String scriptText = scriptTags.html();
        int badScript = 0;
        if (scriptText.contains(".createElement(")) {
            badScript = 1;
        }
        return scriptTags.size() + badScript + (scriptText.length())/1000;
    }

    static final List<String> trackers = List.of("adform.net",
            "connect.facebook",
            "googletagmanager.com",
            "googlesyndication.com",
            "google.com",
            "twitter.com",
            "smartadserver.com",
            "doubleclick.com",
            "2mdn.com",
            "dmtry.com",
            "bing.com",
            "msn.com",
            "amazon-adsystem.com",
            "alexametrics.com",
            "rubiconproject.com",
            "chango.com",
            "d5nxst8fruw4z.cloudfront.net",
            "d31qbv1cthcecs.cloudfront.net",
            "linkedin.com");

    private static Set<HtmlFeature> getFeatureSet(Document parsed, int scriptTags, boolean cookies) {
        Set<HtmlFeature> features = new HashSet<>();

        if (scriptTags > 0) {
            features.add(HtmlFeature.JS);
        }
        if (!parsed.getElementsByTag("object").isEmpty()
                || !parsed.getElementsByTag("audio").isEmpty()
                || !parsed.getElementsByTag("video").isEmpty()) {
            features.add(HtmlFeature.MEDIA);
        }
        if (parsed.getElementsByTag("script").stream()
                .filter(tag -> tag.attr("src") != null)
                .anyMatch(tag -> trackers.stream().anyMatch(tracker -> tag.attr("src").contains(tracker)))) {
            features.add(HtmlFeature.TRACKING);
        }
        if (parsed.getElementsByTag("script").html().contains("google-analytics.com")) {
            features.add(HtmlFeature.TRACKING);
        }
        if (parsed.getElementsByTag("a").stream().map(e -> e.attr("href"))
                .filter(Objects::nonNull)
                .map(String::toLowerCase)
                .anyMatch(href ->
                        href.contains("amzn.to/") || href.contains("amazon.com/"))) {
            features.add(HtmlFeature.AFFILIATE_LINK);
        }
        if (cookies) {
            features.add(HtmlFeature.COOKIES);
        }

        return features;
    }

    private static void addTags(EdgePageWordSet wordSet, EdgeHtmlStandard htmlStandard, EdgeUrl url, Set<HtmlFeature> features) {
        List<String> tagWords = new ArrayList<>();
        tagWords.add("format:"+htmlStandard.toString().toLowerCase());
        tagWords.add("site:"+url.domain.toString().toLowerCase());
        tagWords.add("proto:"+url.proto.toLowerCase());
        tagWords.add("js:" + Boolean.toString(features.contains(HtmlFeature.JS)).toLowerCase());
        if (features.contains(HtmlFeature.MEDIA)) {
            tagWords.add("special:media");
        }
        if (features.contains(HtmlFeature.TRACKING)) {
            tagWords.add("special:tracking");
        }
        if (features.contains(HtmlFeature.AFFILIATE_LINK)) {
            tagWords.add("special:affiliate");
        }
        if (features.contains(HtmlFeature.COOKIES)) {
            tagWords.add("special:cookies");
        }
        wordSet.append(IndexBlock.Meta, tagWords);
        wordSet.append(IndexBlock.Words, tagWords);
    }

    @SneakyThrows
    public static void uploadThread() {

        while (running || !processQueue.isEmpty() || !uploadQueue.isEmpty()) {
            var data = uploadQueue.take();

            if (!data.words.isEmpty()) {
                for (var words : data.words.values()) {
                    if (!words.getWords().isEmpty()) {
                        if (words.size() < 1000) {
                            indexWriter.put(data.domainId, data.urlId, words.block, words.words);
                        } else {
                            chunks(words.words, 1000).forEach(chunk -> {
                                indexWriter.put(data.domainId, data.urlId, words.block, chunk);
                            });
                        }
                    }
                }
            }
        }

        System.out.println("Closing");
        dictionaryWriter.commitToDisk();
        indexWriter.forceWrite();
        dictionaryWriter.close();
        indexWriter.close();
        System.out.println("Done");
    }

    private static <T> List<List<T>> chunks(Collection<T> coll, int size) {
        List<List<T>> ret = new ArrayList<>();
        List<T> data = List.copyOf(coll);

        for (int i = 0; i < data.size(); i+=size) {
            ret.add(data.subList(i, Math.min(data.size(), i+size)));
        }

        return ret;
    }

}
@@ -1,142 +0,0 @@
package nu.marginalia.wmsa.edge.tools;


import gnu.trove.set.hash.TLongHashSet;
import lombok.SneakyThrows;
import nu.marginalia.wmsa.edge.archive.archiver.ArchiveExtractor;
import nu.marginalia.wmsa.edge.assistant.dict.NGramDict;
import nu.marginalia.util.language.conf.LanguageModels;
import nu.marginalia.util.language.processing.KeywordExtractor;
import nu.marginalia.util.language.processing.SentenceExtractor;
import nu.marginalia.wmsa.edge.model.crawl.EdgeRawPageContents;
import opennlp.tools.stemmer.PorterStemmer;
import org.jsoup.Jsoup;

import java.io.BufferedOutputStream;
import java.io.DataOutputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.nio.file.Path;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.atomic.AtomicLong;

public class TermFrequencyCounterMain {

    static final LinkedBlockingQueue<EdgeRawPageContents> processQueue = new LinkedBlockingQueue<>(20);

    public static final String OUTPUT_FILE = "/var/lib/wmsa/archive/tfreq-2022-04-04.bin";
    public static final String ARCHIVE_PATH = "/var/lib/wmsa/archive/webpage"; // "/mnt/storage/wmsa/archive/webpage/"

    @SneakyThrows
    public static void main(String... args) {

        List<Thread> pt = new ArrayList<>();
        for (int i = 0; i < 20; i++) {
            pt.add(new Thread(TermFrequencyCounterMain::processorThread));
        }
        pt.forEach(Thread::start);

        AtomicLong docsTotal = new AtomicLong();
        new ArchiveExtractor(Path.of(ARCHIVE_PATH)).forEach(
                page -> {
                    if (page.contentType.contentType.contains("html")
                            && page.isAfter("2022-03-15T")) {
                        try {
                            long dt = docsTotal.incrementAndGet();
                            if (dt == 0) {
                                System.out.println(docsTotal.get() + " - " + termFreq.size());
                            }
                            if ((dt % 5) != 0) {
                                processQueue.put(page);
                            }
                        } catch (InterruptedException e) {
                            e.printStackTrace();
                        }
                    }
                });
        running = false;


        System.out.println("Waiting for wrap-up");

        Thread.sleep(36000);

        for (Thread thread : pt) {
            thread.interrupt();
        }
        for (Thread thread : pt) {
            thread.join();
        }
        System.out.println("Total documents = " + docsTotal.get());

        System.out.println("Writing Frequencies");

        try (var dos = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(OUTPUT_FILE)))
        ) {
            synchronized (termFreq) {
                for (var entry : termFreq.entrySet()) {

                    if (entry.getValue() > 5) {
                        dos.writeLong(entry.getKey());
                        dos.writeLong(entry.getValue());
                    }
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        }


        System.out.println("All done!");
    }

    public static final ConcurrentHashMap<Long, Integer> termFreq = new ConcurrentHashMap<>();

    public static final LanguageModels lm = new LanguageModels(
            Path.of("/var/lib/wmsa/model/ngrams-generous-emstr.bin"),
            Path.of("/var/lib/wmsa/model/tfreq-generous-emstr.bin"),
            Path.of("/var/lib/wmsa/model/opennlp-sentence.bin"),
            Path.of("/var/lib/wmsa/model/English.RDR"),
            Path.of("/var/lib/wmsa/model/English.DICT"),
            Path.of("/var/lib/wmsa/model/opennlp-tok.bin")
    );
    public static volatile boolean running = true;

    public static void processorThread() {
        var ke = new KeywordExtractor();
        var se = new SentenceExtractor(lm);
        var ps = new PorterStemmer();
        try {
            TLongHashSet words = new TLongHashSet(10000);
            while (running || !processQueue.isEmpty()) {
                var job = processQueue.take();
                var sentence = se.extractSentences(Jsoup.parse(job.data));

                for (var sent : sentence.sentences) {
                    var keywords = ke.getKeywordsFromSentence(sent);
                    for (int i = 0; i < keywords.length; i++) {
                        if (keywords[i].size() > 1) {
                            words.add(NGramDict.longHash(sent.constructStemmedWordFromSpan(keywords[i]).getBytes()));
                        }
                    }

                    for (String word : sent.wordsLowerCase) {
                        words.add(NGramDict.longHash(ps.stem(word).getBytes()));
                    }

                    words.forEach(l -> {
                        termFreq.merge(l, 1, Integer::sum);
                        return true;
                    });
                    words.clear();
                }
            }
        }
        catch (InterruptedException ex) {
            ex.printStackTrace();
        }
    }

}
@@ -2,10 +2,10 @@ package nu.marginalia.wmsa.edge.tools;

import lombok.AllArgsConstructor;
import lombok.SneakyThrows;
import nu.marginalia.wmsa.edge.archive.client.ArchiveClient;
import nu.marginalia.wmsa.edge.assistant.dict.NGramDict;
import nu.marginalia.wmsa.edge.assistant.dict.WikiCleaner;
import nu.marginalia.util.language.conf.LanguageModels;
import nu.marginalia.wmsa.encyclopedia.EncyclopediaClient;
import org.jsoup.Jsoup;
import org.openzim.ZIMTypes.ZIMFile;
import org.openzim.ZIMTypes.ZIMReader;
@@ -25,7 +25,7 @@ public class ZimConverterMain {
    static final LinkedBlockingQueue<ConversionJob> jobQueue = new LinkedBlockingQueue<>(100);
    static final LinkedBlockingQueue<String> analysisQueue = new LinkedBlockingQueue<>(100);
    static boolean hasData = true;
    static final ArchiveClient archiveClient = new ArchiveClient();
    static final EncyclopediaClient encyclopediaClient = new EncyclopediaClient();
    static NGramDict dict = new NGramDict(new LanguageModels(
            Path.of("/var/lib/wmsa/model/ngrams-generous-emstr.bin"),
            Path.of("/var/lib/wmsa/model/tfreq-generous-emstr.bin"),
@@ -60,7 +60,7 @@ public class ZimConverterMain {
//        convertJust("Plotinus");
//        convertJust("C++");
        convertAll(args);
        archiveClient.close();
        encyclopediaClient.close();
    }

    @SneakyThrows
@@ -108,7 +108,7 @@ public class ZimConverterMain {
    }

    private static void convertAll(String[] args) throws IOException {
        archiveClient.setServiceRoute("127.0.0.1", Integer.parseInt(args[0]));
        encyclopediaClient.setServiceRoute("127.0.0.1", Integer.parseInt(args[0]));
        var zr = new ZIMReader(new ZIMFile(args[1]));
//        var zr = new ZIMReader(new ZIMFile("/home/vlofgren/Work/wikipedia_en_all_nopic_2021-01.zim"));

@@ -142,7 +142,7 @@ public class ZimConverterMain {
        }, p -> true);

        hasData = false;
        archiveClient.close();
        encyclopediaClient.close();
    }

    @SneakyThrows
@@ -0,0 +1,34 @@
package nu.marginalia.wmsa.encyclopedia;

import io.reactivex.rxjava3.core.Observable;
import nu.marginalia.wmsa.client.AbstractDynamicClient;
import nu.marginalia.wmsa.client.HttpStatusCode;
import nu.marginalia.wmsa.configuration.ServiceDescriptor;
import nu.marginalia.wmsa.configuration.server.Context;
import nu.marginalia.wmsa.edge.assistant.dict.WikiArticles;
import okhttp3.MediaType;
import org.eclipse.jetty.util.UrlEncoded;

import javax.annotation.CheckReturnValue;

public class EncyclopediaClient extends AbstractDynamicClient {
    public EncyclopediaClient() {
        super(ServiceDescriptor.ENCYCLOPEDIA);
    }

    @CheckReturnValue
    public Observable<HttpStatusCode> submitWiki(Context ctx, String url, String data) {
        return super.post(ctx, "/wiki/submit?url="+UrlEncoded.encodeString(url), data, MediaType.parse("text/plain; charset=UTF-8"));
    }

    @CheckReturnValue
    public Observable<Boolean> hasWiki(Context ctx, String url) {
        return super.get(ctx, "/wiki/has?url="+ UrlEncoded.encodeString(url), Boolean.class);
    }

    @CheckReturnValue
    public Observable<WikiArticles> encyclopediaLookup(Context ctx, String word) {
        return super.get(ctx,"/encyclopedia/" + UrlEncoded.encodeString(word), WikiArticles.class);
    }

}
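For orientation, a minimal usage sketch of the new client follows. It is not part of this commit; the service address, the port, the lookup term, and the pre-existing request Context are illustrative assumptions, and only methods visible in the diff above are used.

    // Hypothetical caller, wired against a locally running Encyclopedia service.
    EncyclopediaClient encyclopediaClient = new EncyclopediaClient();
    encyclopediaClient.setServiceRoute("127.0.0.1", 5040); // 5040 = ENCYCLOPEDIA in ServiceDescriptor

    // Given an existing request Context ctx (e.g. Context.fromRequest(req) in a Spark handler):
    boolean present = encyclopediaClient.hasWiki(ctx, "Plotinus").blockingFirst();
    WikiArticles articles = encyclopediaClient.encyclopediaLookup(ctx, "Plotinus").blockingFirst();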
@ -0,0 +1,160 @@
|
||||
package nu.marginalia.wmsa.encyclopedia;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import nu.marginalia.wmsa.edge.assistant.dict.WikiArticles;
|
||||
import nu.marginalia.wmsa.edge.assistant.dict.WikiSearchResult;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
public class EncyclopediaDao {
|
||||
|
||||
private HikariDataSource dataSource;
|
||||
private static final Logger logger = LoggerFactory.getLogger(EncyclopediaDao.class);
|
||||
|
||||
@Inject
|
||||
public EncyclopediaDao(HikariDataSource dataSource) {
|
||||
this.dataSource = dataSource;
|
||||
}
|
||||
|
||||
public WikiArticles encyclopedia(String term) {
|
||||
WikiArticles response = new WikiArticles();
|
||||
response.entries = new ArrayList<>();
|
||||
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
var stmt = connection.prepareStatement("SELECT DISTINCT(NAME_LOWER) FROM REF_WIKI_TITLE WHERE NAME_LOWER=?");
|
||||
stmt.setString(1, term);
|
||||
|
||||
var rsp = stmt.executeQuery();
|
||||
while (rsp.next()) {
|
||||
response.entries.add(capitalizeWikiString(rsp.getString(1)));
|
||||
}
|
||||
}
|
||||
catch (Exception ex) {
|
||||
logger.error("Failed to fetch articles", ex);
|
||||
return new WikiArticles();
|
||||
}
|
||||
|
||||
return response;
|
||||
}
|
||||
|
||||
    public Optional<String> resolveEncylopediaRedirect(String term) {
        final List<String> matches = new ArrayList<>();

        try (var connection = dataSource.getConnection()) {
            try (var stmt = connection.prepareStatement("SELECT NAME, REF_NAME FROM REF_WIKI_TITLE WHERE NAME_LOWER=LOWER(?)")) {
                stmt.setString(1, term);

                var rsp = stmt.executeQuery();
                while (rsp.next()) {
                    if (term.equals(rsp.getString(1))
                            || rsp.getString(2) == null) {
                        return Optional.ofNullable(rsp.getString(2));
                    } else {
                        matches.add(rsp.getString(2));
                    }
                }
            }
        }
        catch (Exception ex) {
            throw new RuntimeException(ex);
        }

        if (!matches.isEmpty()) {
            return Optional.of(matches.get(0));
        }
        return Optional.empty();
    }
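
    /**
     * Returns the page whose title matches the term exactly (case-insensitively,
     * with spaces mapped to underscores) and is not a redirect.
     */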
    public Optional<WikiSearchResult> findEncyclopediaPageDirect(String term) {

        try (var connection = dataSource.getConnection()) {

            try (var stmt = connection.prepareStatement("SELECT NAME, REF_NAME FROM REF_WIKI_TITLE WHERE NAME_LOWER=LOWER(?)")) {
                stmt.setString(1, term.replace(' ', '_'));

                var rsp = stmt.executeQuery();
                while (rsp.next()) {
                    String name = rsp.getString(1);
                    String refName = rsp.getString(2);

                    if (refName == null) {
                        return Optional.of(new WikiSearchResult(name, null));
                    }
                }
            }
        }
        catch (Exception ex) {
            throw new RuntimeException(ex);
        }

        return Optional.empty();
    }
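
    /**
     * Combines exact title matches with up to ten prefix (LIKE) matches and
     * de-duplicates them, keeping exact direct hits at the front of the list.
     */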
    public List<WikiSearchResult> findEncyclopediaPages(String term) {
        final List<WikiSearchResult> directMatches = new ArrayList<>();
        final Set<WikiSearchResult> directSearchMatches = new HashSet<>();
        final Set<WikiSearchResult> indirectMatches = new HashSet<>();

        try (var connection = dataSource.getConnection()) {

            try (var stmt = connection.prepareStatement("SELECT NAME, REF_NAME FROM REF_WIKI_TITLE WHERE NAME_LOWER=LOWER(?)")) {
                stmt.setString(1, term.replace(' ', '_'));

                var rsp = stmt.executeQuery();
                while (rsp.next()) {
                    String name = rsp.getString(1);
                    String refName = rsp.getString(2);

                    if (refName == null) {
                        directMatches.add(new WikiSearchResult(name, null));
                    } else {
                        indirectMatches.add(new WikiSearchResult(name, refName));
                    }
                }
            }

            try (var stmt = connection.prepareStatement("SELECT NAME, REF_NAME FROM REF_WIKI_TITLE WHERE NAME_LOWER LIKE ? LIMIT 10")) {
                stmt.setString(1, term.replace(' ', '_').replaceAll("%", "\\%").toLowerCase() + "%");

                var rsp = stmt.executeQuery();
                while (rsp.next()) {
                    String name = rsp.getString(1);
                    String refName = rsp.getString(2);

                    if (refName == null) {
                        directSearchMatches.add(new WikiSearchResult(name, null));
                    } else {
                        indirectMatches.add(new WikiSearchResult(name, refName));
                    }
                }
            }
        }
        catch (Exception ex) {
            throw new RuntimeException(ex);
        }

        directMatches.forEach(indirectMatches::remove);
        indirectMatches.removeAll(directSearchMatches);
        directMatches.forEach(directSearchMatches::remove);
        directMatches.addAll(indirectMatches);
        directMatches.addAll(directSearchMatches);
        return directMatches;
    }
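
    /** Title-cases each underscore-separated token so results match wiki article naming. */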
    private String capitalizeWikiString(String string) {
        if (string.contains("_")) {
            return Arrays.stream(string.split("_")).map(this::capitalizeWikiString).collect(Collectors.joining("_"));
        }
        if (string.length() < 2) {
            return string.toUpperCase();
        }
        return Character.toUpperCase(string.charAt(0)) + string.substring(1).toLowerCase();
    }

}
@ -0,0 +1,26 @@
package nu.marginalia.wmsa.encyclopedia;

import com.google.inject.Guice;
import com.google.inject.Inject;
import com.google.inject.Injector;
import nu.marginalia.wmsa.configuration.MainClass;
import nu.marginalia.wmsa.configuration.ServiceDescriptor;
import nu.marginalia.wmsa.configuration.module.ConfigurationModule;
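
// WIP note: only the ConfigurationModule is installed here; constructing EncyclopediaService
// presumably also needs a data source for EncyclopediaDao and a binding for the
// @Named("wiki-path") Path before the service can actually start.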
public class EncyclopediaMain extends MainClass {
    private final EncyclopediaService service;

    public static void main(String... args) {
        init(ServiceDescriptor.ENCYCLOPEDIA, args);

        Injector injector = Guice.createInjector(
                new ConfigurationModule());
        injector.getInstance(EncyclopediaMain.class);
    }

    @Inject
    public EncyclopediaMain(EncyclopediaService service) {
        this.service = service;
    }
}
@ -0,0 +1,202 @@
package nu.marginalia.wmsa.encyclopedia;

import com.google.inject.name.Named;
import lombok.SneakyThrows;
import nu.marginalia.wmsa.configuration.server.Context;
import nu.marginalia.wmsa.configuration.server.Initialization;
import nu.marginalia.wmsa.configuration.server.MetricsServer;
import nu.marginalia.wmsa.configuration.server.Service;
import nu.marginalia.wmsa.renderer.mustache.MustacheRenderer;
import nu.marginalia.wmsa.renderer.mustache.RendererFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import spark.Request;
import spark.Response;
import spark.Spark;

import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Map;
import java.util.Optional;
import java.util.stream.Collectors;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;
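
/**
 * HTTP service that renders locally stored, gzip-compressed wiki pages,
 * offers a title search, and accepts page submissions via /wiki/submit.
 */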
public class EncyclopediaService extends Service {

    private static final Logger logger = LoggerFactory.getLogger(EncyclopediaService.class);
    private final MustacheRenderer<String> wikiErrorPageRenderer;
    private final MustacheRenderer<Object> wikiSearchResultRenderer;
    private Path wikiPath;
    private EncyclopediaDao encyclopediaDao;

    public EncyclopediaService(@Named("service-host") String ip,
                               @Named("service-port") Integer port,
                               @Named("wiki-path") Path wikiPath,
                               EncyclopediaDao encyclopediaDao,
                               RendererFactory rendererFactory,
                               Initialization initialization,
                               MetricsServer metricsServer)
            throws IOException {
        super(ip, port, initialization, metricsServer);
        this.wikiPath = wikiPath;
        this.encyclopediaDao = encyclopediaDao;

        if (rendererFactory != null) {
            wikiErrorPageRenderer = rendererFactory.renderer("encyclopedia/wiki-error");
            wikiSearchResultRenderer = rendererFactory.renderer("encyclopedia/wiki-search");
        }
        else {
            wikiErrorPageRenderer = null;
            wikiSearchResultRenderer = null;
        }

        Spark.get("/public/wiki/*", this::getWikiPage);
        Spark.get("/public/wiki-search", this::searchWikiPage);

        Spark.get("/wiki/has", this::pathWikiHas);
        Spark.post("/wiki/submit", this::pathWikiSubmit);
    }
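
    /**
     * Resolves redirects and case mismatches for the requested article name,
     * then serves the stored page, falling back to a search results page.
     */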
    @SneakyThrows
    private Object getWikiPage(Request req, Response rsp) {
        final String[] splats = req.splat();
        if (splats.length == 0)
            rsp.redirect("https://encyclopedia.marginalia.nu/wiki-start.html");

        final String name = splats[0];

        String pageName = encyclopediaDao.resolveEncylopediaRedirect(name).orElse(name);

        logger.info("Resolved {} -> {}", name, pageName);

        return wikiGet(pageName)
                .or(() -> resolveWikiPageNameWrongCase(name))
                .orElseGet(() -> renderSearchPage(name));
    }
    private Optional<String> resolveWikiPageNameWrongCase(String name) {
        var rsp = encyclopediaDao.findEncyclopediaPageDirect(name);

        if (rsp.isEmpty()) {
            return Optional.of(renderSearchPage(name));
        }

        name = rsp.get().getInternalName();
        return wikiGet(name);
    }

    private String renderSearchPage(String s) {
        return wikiSearchResultRenderer.render(
                Map.of("query", s,
                        "error", "true",
                        "results", encyclopediaDao.findEncyclopediaPages(s)));
    }

    @SneakyThrows
    private Object searchWikiPage(Request req, Response rsp) {
        final var ctx = Context.fromRequest(req);

        String term = req.queryParams("query");
        if (null == term) {
            rsp.redirect("https://encyclopedia.marginalia.nu/wiki-start.html");
            return "";
        }

        return wikiSearchResultRenderer.render(
                Map.of("query", term,
                        "results",
                        encyclopediaDao.findEncyclopediaPages(term))
        );
    }
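
    /**
     * Maps a page URL to its on-disk location: four directory levels taken from the bytes
     * of the URL's hashCode, then a filename built from the percent-encoded URL (long names
     * are truncated to 128 characters and suffixed with the hash) plus a ".gz" extension.
     */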
    private Path getWikiFilename(Path base, String url) {
        Path p = base;

        int urlHash = url.hashCode();

        p = p.resolve(Integer.toString(urlHash & 0xFF));
        p = p.resolve(Integer.toString((urlHash>>>8) & 0xFF));
        p = p.resolve(Integer.toString((urlHash>>>16) & 0xFF));
        p = p.resolve(Integer.toString((urlHash>>>24) & 0xFF));

        String fileName = url.chars()
                .mapToObj(this::encodeUrlChar)
                .collect(Collectors.joining());

        if (fileName.length() > 128) {
            fileName = fileName.substring(0, 128) + (((long)urlHash)&0xFFFFFFFFL);
        }

        return p.resolve(fileName + ".gz");
    }
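
    // Keeps [A-Za-z0-9.] as-is and percent-encodes everything else.
    // Note: "%2X" space-pads values below 0x10; "%02X" was presumably intended.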
    private String encodeUrlChar(int i) {
        if (i >= 'a' && i <= 'z') {
            return Character.toString(i);
        }
        if (i >= 'A' && i <= 'Z') {
            return Character.toString(i);
        }
        if (i >= '0' && i <= '9') {
            return Character.toString(i);
        }
        if (i == '.') {
            return Character.toString(i);
        }
        else {
            return String.format("%%%2X", i);
        }
    }
    @SneakyThrows
    private Object pathWikiHas(Request request, Response response) {
        return Files.exists(getWikiFilename(wikiPath, request.queryParams("url")));
    }

    @SneakyThrows
    private Optional<String> wikiGet(String name) {

        var filename = getWikiFilename(wikiPath, name);

        if (Files.exists(filename)) {
            try (var stream = new GZIPInputStream(new FileInputStream(filename.toFile()))) {
                return Optional.of(new String(stream.readAllBytes()));
            }
        } else {
            return Optional.empty();
        }
    }
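
    /**
     * Accepts a page body via POST and stores it gzip-compressed under the path
     * derived from the "url" query parameter, creating directories as needed.
     */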
    @SneakyThrows
    private Object pathWikiSubmit(Request request, Response response) {
        byte[] data = request.bodyAsBytes();

        String wikiUrl = request.queryParams("url");
        Path filename = getWikiFilename(wikiPath, wikiUrl);

        Files.createDirectories(filename.getParent());

        System.out.println(new String(data));
        logger.debug("Writing {} to {}", wikiUrl, filename);

        try (var gos = new GZIPOutputStream(new FileOutputStream(filename.toFile()))) {
            gos.write(data);
            gos.flush();
        }

        return "ok";

    }
}
@ -50,8 +50,6 @@ class ServiceTest {
new DictionaryService(dataSource, new SpellChecker()),
new MathParser(),
new Units(new MathParser()),
null,
null,
new ScreenshotService(null), null);

Spark.awaitInitialization();
@ -1,72 +0,0 @@
package nu.marginalia.wmsa.edge.archive;

import lombok.SneakyThrows;
import nu.marginalia.wmsa.configuration.server.Context;
import nu.marginalia.wmsa.configuration.server.Initialization;
import nu.marginalia.wmsa.edge.archive.archiver.Archiver;
import nu.marginalia.wmsa.edge.archive.client.ArchiveClient;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.parallel.Execution;
import org.junit.jupiter.api.parallel.ExecutionMode;
import spark.Spark;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;

import static nu.marginalia.util.TestUtil.getPort;
import static nu.marginalia.util.test.TestUtil.clearTempDir;

@Execution(ExecutionMode.SAME_THREAD)
public class ArchiveTest {
    static EdgeArchiveService service;

    static final int testPort = getPort();
    private static Path tempPath;
    private static Path tempPath2;
    private static ArchiveClient archiveClient;
    private static Archiver archiver;

    @BeforeAll
    public static void setUpClass() throws IOException {
        Spark.port(testPort);
        System.setProperty("service-name", "edge-archive");
        archiveClient = new ArchiveClient();
        archiveClient.setServiceRoute("127.0.0.1", testPort);

        tempPath = Files.createTempDirectory("archiveTest");
        tempPath2 = Files.createTempDirectory("wikiTest");

        archiver = new Archiver(tempPath, 10);
        service = new EdgeArchiveService("127.0.0.1", testPort,
                tempPath,
                archiver,
                new Initialization(), null);

        Spark.awaitInitialization();
    }

    @AfterAll
    public static void tearDown() throws Exception {
        archiver.close();
        archiveClient.close();
        clearTempDir(tempPath);
        clearTempDir(tempPath2);
    }

    @SneakyThrows
    @Test
    public void testWiki() {
        var url = "Plato_(Disambiguation)";

        Assertions.assertFalse(archiveClient.hasWiki(Context.internal(), url).blockingFirst());

        archiveClient.submitWiki(Context.internal(), url, "<h1>Hello</h1>").blockingFirst();
        Assertions.assertTrue(archiveClient.hasWiki(Context.internal(), url).blockingFirst());
        Assertions.assertEquals("<h1>Hello</h1>", archiveClient.getWiki(Context.internal(), url).blockingFirst());
    }

}
@ -1,17 +0,0 @@
package nu.marginalia.wmsa.edge.archive.archiver;
import org.junit.jupiter.api.*;

import java.nio.file.Path;

public class ArchiverTest {

    @Test
    public void testArchiver() throws Exception {
        Archiver archiver = new Archiver(Path.of("/tmp/"), 3);
        archiver.writeData(new ArchivedFile("file1", "Hey".getBytes()));
        archiver.writeData(new ArchivedFile("file2", "Hey".getBytes()));
        archiver.writeData(new ArchivedFile("file3", "Hey".getBytes()));
        archiver.writeData(new ArchivedFile("file4", "Hey".getBytes()));
        archiver.close();
    }
}
@ -60,7 +60,6 @@ class AssistantTest {
new DictionaryService(dataSource, new SpellChecker()),
new MathParser(),
new Units(new MathParser()),
null, null,
new ScreenshotService(null), null);

Spark.awaitInitialization();
@ -77,12 +76,6 @@ class AssistantTest {
Spark.awaitStop();
}

@Test
public void testEncyclopedia() {
var result = client.encyclopediaLookup(Context.internal(), "plato").blockingFirst();
System.out.println(result);
assertTrue(result.entries.size() >= 1);
}
@Test
public void testSpellCheck() {
var result = client.spellCheck(Context.internal(), "plato").blockingFirst();