Merge branch 'master' into converter-optimizations
commit 5ce46a61d4

code/features-search/feedlot-client/build.gradle (new file, 22 lines)
@@ -0,0 +1,22 @@
plugins {
    id 'java'
    id 'jvm-test-suite'
}

java {
    toolchain {
        languageVersion.set(JavaLanguageVersion.of(21))
    }
}

dependencies {
    implementation libs.bundles.slf4j

    implementation libs.notnull
    implementation libs.gson

    testImplementation libs.bundles.slf4j.test
    testImplementation libs.bundles.junit
    testImplementation libs.mockito

}
code/features-search/feedlot-client/readme.md (new file, 20 lines)
@@ -0,0 +1,20 @@
Client for [FeedlotTheFeedBot](https://github.com/MarginaliaSearch/FeedLotTheFeedBot),
the RSS/Atom feed fetcher and cache for Marginalia Search.

This service is external to the Marginalia Search codebase,
as it is not a core part of the search engine and has other
utilities.

## Example

```java

import java.time.Duration;

var client = new FeedlotClient("localhost", 8080,
        gson,
        Duration.ofMillis(100), // connect timeout
        Duration.ofMillis(100)); // request timeout

CompletableFuture<FeedItems> items = client.getFeedItems("www.marginalia.nu");
```
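The call returns a `CompletableFuture`, so the caller decides whether to block or chain a callback. A minimal sketch of consuming the result, reusing the `items` variable from the example above (illustrative only, not part of the committed readme):

```java
// Block until the feed arrives; get() throws ExecutionException if the fetch failed.
FeedItems feedItems = items.get();

// ...or handle it asynchronously without blocking.
items.thenAccept(feed ->
        System.out.println(feed.domain() + ": " + feed.items().size() + " items"));
```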
@@ -0,0 +1,58 @@
package nu.marginalia.feedlot;

import com.google.gson.Gson;
import nu.marginalia.feedlot.model.FeedItems;

import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.time.Duration;
import java.util.concurrent.Executors;
import java.util.concurrent.CompletableFuture;

public class FeedlotClient {
    private final String feedlotHost;
    private final int feedlotPort;
    private final Gson gson;
    private final HttpClient httpClient;
    private final Duration requestTimeout;

    public FeedlotClient(String feedlotHost,
                         int feedlotPort,
                         Gson gson,
                         Duration connectTimeout,
                         Duration requestTimeout
                         )
    {
        this.feedlotHost = feedlotHost;
        this.feedlotPort = feedlotPort;
        this.gson = gson;

        httpClient = HttpClient.newBuilder()
                .executor(Executors.newVirtualThreadPerTaskExecutor())
                .connectTimeout(connectTimeout)
                .build();
        this.requestTimeout = requestTimeout;
    }

    public CompletableFuture<FeedItems> getFeedItems(String domainName) {
        return httpClient.sendAsync(
                HttpRequest.newBuilder()
                        .uri(URI.create("http://%s:%d/feed/%s".formatted(feedlotHost, feedlotPort, domainName)))
                        .GET()
                        .timeout(requestTimeout)
                        .build(),
                HttpResponse.BodyHandlers.ofString()
        ).thenApply(HttpResponse::body)
         .thenApply(this::parseFeedItems);
    }

    private FeedItems parseFeedItems(String s) {
        return gson.fromJson(s, FeedItems.class);
    }

    public void stop() {
        httpClient.close();
    }
}
@@ -0,0 +1,17 @@
package nu.marginalia.feedlot.model;

public record FeedItem(String title, String date, String description, String url) {

    public String pubDay() { // Extract the date from an ISO style date string
        if (date.length() > 10) {
            return date.substring(0, 10);
        }
        return date;
    }

    public String descriptionSafe() {
        return description
                .replace("<", "&lt;")
                .replace(">", "&gt;");
    }
}
@@ -0,0 +1,6 @@
package nu.marginalia.feedlot.model;

import java.util.List;

public record FeedItems(String domain, String feedUrl, String updated, List<FeedItem> items) {
}
@@ -40,8 +40,8 @@ public class CrawlerWarcResynchronizer {
            for (var item : reader) {
                accept(item);
            }
        } catch (IOException e) {
            logger.info(STR."Failed read full warc file \{tempFile}", e);
        } catch (Exception e) {
            logger.info(STR."(Expected) Failed read full warc file \{tempFile}: \{e.getClass().getSimpleName()} \{e.getMessage()}");
        }

        // Second pass, copy records to the new warc file
@@ -49,8 +49,8 @@ public class CrawlerWarcResynchronizer {
            for (var item : reader) {
                recorder.resync(item);
            }
        } catch (IOException e) {
            logger.info(STR."Failed read full warc file \{tempFile}", e);
        } catch (Exception e) {
            logger.info(STR."(Expected) Failed read full warc file \{tempFile}: \{e.getClass().getSimpleName()} \{e.getMessage()}");
        }
    }

@@ -37,7 +37,7 @@ public class WarcRecorder implements AutoCloseable {
    private final Path warcFile;
    private static final Logger logger = LoggerFactory.getLogger(WarcRecorder.class);

    private final ThreadLocal<byte[]> bufferThreadLocal = ThreadLocal.withInitial(() -> new byte[MAX_SIZE]);
    private final static ThreadLocal<byte[]> bufferThreadLocal = ThreadLocal.withInitial(() -> new byte[MAX_SIZE]);

    private boolean temporaryFile = false;

@@ -4,23 +4,24 @@ import lombok.SneakyThrows;
import nu.marginalia.UserAgent;
import nu.marginalia.WmsaHome;
import nu.marginalia.atags.model.DomainLinks;
import nu.marginalia.crawl.retreival.CrawlDataReference;
import nu.marginalia.crawl.retreival.CrawlerRetreiver;
import nu.marginalia.crawl.retreival.DomainProber;
import nu.marginalia.crawl.retreival.*;
import nu.marginalia.crawl.retreival.fetcher.HttpFetcher;
import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl;
import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
import nu.marginalia.crawling.io.CrawledDomainReader;
import nu.marginalia.crawling.io.SerializableCrawlDataStream;
import nu.marginalia.crawling.model.CrawledDocument;
import nu.marginalia.crawling.model.CrawledDomain;
import nu.marginalia.crawling.model.SerializableCrawlData;
import nu.marginalia.crawling.parquet.CrawledDocumentParquetRecordFileWriter;
import nu.marginalia.io.crawlspec.CrawlSpecRecordParquetFileWriter;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawlspec.CrawlSpecRecord;
import org.junit.jupiter.api.*;
import org.netpreserve.jwarc.*;

import java.io.IOException;
import java.io.RandomAccessFile;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.*;
@@ -37,6 +38,7 @@ class CrawlerRetreiverTest {
    Path tempFileParquet1;
    Path tempFileWarc2;
    Path tempFileParquet2;
    Path tempFileWarc3;
    @BeforeEach
    public void setUp() throws IOException {
        httpFetcher = new HttpFetcherImpl("search.marginalia.nu; testing a bit :D");
@@ -66,7 +68,11 @@ class CrawlerRetreiverTest {
        if (tempFileParquet2 != null) {
            Files.deleteIfExists(tempFileParquet2);
        }
        if (tempFileWarc3 != null) {
            Files.deleteIfExists(tempFileWarc3);
        }
    }

    @Test
    public void testWarcOutput() throws IOException {
        var specs = CrawlSpecRecord
@@ -79,11 +85,7 @@ class CrawlerRetreiverTest {
        try {
            tempFile = Files.createTempFile("crawling-process", "warc");

            try (var recorder = new WarcRecorder(tempFile)) {
                new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder).fetch();
            } catch (IOException ex) {
                Assertions.fail(ex);
            }
            doCrawl(tempFile, specs);

            Set<String> requests = new HashSet<>();
            Set<String> responses = new HashSet<>();
@@ -112,6 +114,57 @@ class CrawlerRetreiverTest {
            Files.deleteIfExists(tempFile);
        }
    }

    @SneakyThrows
    @Test
    public void testResync() throws IOException {
        var specs = CrawlSpecRecord
                .builder()
                .crawlDepth(5)
                .domain("www.marginalia.nu")
                .urls(List.of("https://www.marginalia.nu/misc/debian-laptop-install-log/"))
                .build();
        tempFileWarc1 = Files.createTempFile("crawling-process", "warc");
        tempFileWarc2 = Files.createTempFile("crawling-process", "warc");

        doCrawl(tempFileWarc1, specs);

        Set<String> requests = new HashSet<>();
        Set<String> responses = new HashSet<>();

        var revisitCrawlFrontier = new DomainCrawlFrontier(
                new EdgeDomain("www.marginalia.nu"),
                List.of(), 100);
        var resync = new CrawlerWarcResynchronizer(revisitCrawlFrontier,
                new WarcRecorder(tempFileWarc2)
        );

        // truncate the size of the file to simulate a crash
        simulatePartialWrite(tempFileWarc1);

        resync.run(tempFileWarc1);
        assertTrue(revisitCrawlFrontier.addKnown(new EdgeUrl("https://www.marginalia.nu/misc/debian-laptop-install-log/")));

        try (var reader = new WarcReader(tempFileWarc2)) {
            reader.forEach(record -> {
                if (record instanceof WarcRequest req) {
                    requests.add(req.target());
                    System.out.println(req.type() + ":" + req.target());
                }
                else if (record instanceof WarcResponse rsp) {
                    responses.add(rsp.target());
                    System.out.println(rsp.type() + ":" + rsp.target());
                }
                else {
                    System.out.println(record.type());
                }
            });
        }

        assertTrue(requests.contains("https://www.marginalia.nu/misc/debian-laptop-install-log/"));
        assertEquals(requests, responses);
    }

    @Test
    public void testWithKnownDomains() throws IOException {
        var specs = CrawlSpecRecord
@@ -125,15 +178,9 @@ class CrawlerRetreiverTest {

        tempFileWarc1 = Files.createTempFile("crawling-process", ".warc");

        try (var recorder = new WarcRecorder(tempFileWarc1)) {
            new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder).fetch();
        }
        catch (IOException ex) {
            Assertions.fail(ex);
        }
        doCrawl(tempFileWarc1, specs);

        CrawledDocumentParquetRecordFileWriter.convertWarc("www.marginalia.nu",
                new UserAgent("test"), tempFileWarc1, tempFileParquet1);
        convertToParquet(tempFileWarc1, tempFileParquet1);

        try (var stream = CrawledDomainReader.createDataStream(tempFileParquet1)) {
            while (stream.hasNext()) {
@@ -177,16 +224,8 @@ class CrawlerRetreiverTest {

        tempFileWarc1 = Files.createTempFile("crawling-process", ".warc");

        try (var recorder = new WarcRecorder(tempFileWarc1)) {
            new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder).fetch();
        }
        catch (IOException ex) {
            Assertions.fail(ex);
        }

        CrawledDocumentParquetRecordFileWriter.convertWarc("www.marginalia.nu",
                new UserAgent("test"), tempFileWarc1, tempFileParquet1);

        doCrawl(tempFileWarc1, specs);
        convertToParquet(tempFileWarc1, tempFileParquet1);

        try (var stream = CrawledDomainReader.createDataStream(tempFileParquet1)) {
            while (stream.hasNext()) {
@@ -232,46 +271,11 @@ class CrawlerRetreiverTest {
        tempFileWarc1 = Files.createTempFile("crawling-process", ".warc.gz");
        tempFileWarc2 = Files.createTempFile("crawling-process", ".warc.gz");

        Map<Class<? extends SerializableCrawlData>, List<SerializableCrawlData>> data = new HashMap<>();

        try (var recorder = new WarcRecorder(tempFileWarc1)) {
            new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder).fetch();
        }
        catch (IOException ex) {
            Assertions.fail(ex);
        }

        CrawledDocumentParquetRecordFileWriter.convertWarc("www.marginalia.nu",
                new UserAgent("test"), tempFileWarc1, tempFileParquet1);

        try (var stream = CrawledDomainReader.createDataStream(tempFileParquet1)) {
            while (stream.hasNext()) {
                var doc = stream.next();
                data.computeIfAbsent(doc.getClass(), c -> new ArrayList<>()).add(doc);
            }
        } catch (Exception e) {
            throw new RuntimeException(e);
        }


        var stream = CrawledDomainReader.createDataStream(tempFileParquet1);

        System.out.println("---");

        CrawledDomain domain = (CrawledDomain) data.get(CrawledDomain.class).get(0);
        domain.doc = data.get(CrawledDocument.class).stream().map(CrawledDocument.class::cast).collect(Collectors.toList());
        try (var recorder = new WarcRecorder(tempFileWarc2)) {
            new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder).fetch(new DomainLinks(),
                    new CrawlDataReference(stream));
        }
        catch (IOException ex) {
            Assertions.fail(ex);
        }


        CrawledDocumentParquetRecordFileWriter.convertWarc("www.marginalia.nu",
                new UserAgent("test"), tempFileWarc2, tempFileParquet2);

        doCrawl(tempFileWarc1, specs);
        doCrawlWithReferenceStream(specs,
                CrawledDomainReader.createDataStream(tempFileParquet1)
        );
        convertToParquet(tempFileWarc2, tempFileParquet2);

        try (var reader = new WarcReader(tempFileWarc2)) {
            WarcXResponseReference.register(reader);
@@ -304,4 +308,120 @@ class CrawlerRetreiverTest {
            throw new RuntimeException(e);
        }
    }

    private void convertToParquet(Path tempFileWarc2, Path tempFileParquet2) {
        CrawledDocumentParquetRecordFileWriter.convertWarc("www.marginalia.nu",
                new UserAgent("test"), tempFileWarc2, tempFileParquet2);
    }


    @SneakyThrows
    @Test
    public void testRecrawlWithResync() throws IOException {

        var specs = CrawlSpecRecord
                .builder()
                .crawlDepth(12)
                .domain("www.marginalia.nu")
                .urls(List.of("https://www.marginalia.nu/some-dead-link"))
                .build();


        tempFileWarc1 = Files.createTempFile("crawling-process", ".warc.gz");
        tempFileWarc2 = Files.createTempFile("crawling-process", ".warc.gz");
        tempFileWarc3 = Files.createTempFile("crawling-process", ".warc.gz");

        Map<Class<? extends SerializableCrawlData>, List<SerializableCrawlData>> data = new HashMap<>();

        doCrawl(tempFileWarc1, specs);

        convertToParquet(tempFileWarc1, tempFileParquet1);

        try (var stream = CrawledDomainReader.createDataStream(tempFileParquet1)) {
            while (stream.hasNext()) {
                var doc = stream.next();
                data.computeIfAbsent(doc.getClass(), c -> new ArrayList<>()).add(doc);
            }
        } catch (Exception e) {
            throw new RuntimeException(e);
        }

        var stream = CrawledDomainReader.createDataStream(tempFileParquet1);

        System.out.println("---");

        doCrawlWithReferenceStream(specs, stream);

        var revisitCrawlFrontier = new DomainCrawlFrontier(
                new EdgeDomain("www.marginalia.nu"),
                List.of(), 100);

        var resync = new CrawlerWarcResynchronizer(revisitCrawlFrontier,
                new WarcRecorder(tempFileWarc3)
        );

        // truncate the size of the file to simulate a crash
        simulatePartialWrite(tempFileWarc2);

        resync.run(tempFileWarc2);

        assertTrue(revisitCrawlFrontier.addKnown(new EdgeUrl("https://www.marginalia.nu/")));
        convertToParquet(tempFileWarc3, tempFileParquet2);


        try (var reader = new WarcReader(tempFileWarc3)) {
            WarcXResponseReference.register(reader);

            reader.forEach(record -> {
                if (record instanceof WarcResponse rsp) {
                    try {
                        System.out.println(rsp.type() + ":" + rsp.target() + "/" + rsp.http().status());
                    } catch (IOException e) {
                        throw new RuntimeException(e);
                    }
                }
                if (record instanceof WarcMetadata rsp) {
                    System.out.println("meta:" + rsp.target());
                }
            });
        }

        try (var ds = CrawledDomainReader.createDataStream(tempFileParquet2)) {
            while (ds.hasNext()) {
                var doc = ds.next();
                if (doc instanceof CrawledDomain dr) {
                    System.out.println(dr.domain + "/" + dr.crawlerStatus);
                }
                else if (doc instanceof CrawledDocument dc) {
                    System.out.println(dc.url + "/" + dc.crawlerStatus + "/" + dc.httpStatus + "/" + dc.timestamp);
                }
            }
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
    }

    private void simulatePartialWrite(Path tempFileWarc2) throws IOException {
        try (var raf = new RandomAccessFile(tempFileWarc2.toFile(), "rw")) {
            raf.setLength(raf.length() - 10);
        }
    }

    private void doCrawlWithReferenceStream(CrawlSpecRecord specs, SerializableCrawlDataStream stream) {
        try (var recorder = new WarcRecorder(tempFileWarc2)) {
            new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder).fetch(new DomainLinks(),
                    new CrawlDataReference(stream));
        }
        catch (IOException ex) {
            Assertions.fail(ex);
        }
    }

    private void doCrawl(Path tempFileWarc1, CrawlSpecRecord specs) {
        try (var recorder = new WarcRecorder(tempFileWarc1)) {
            new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder).fetch();
        } catch (IOException ex) {
            Assertions.fail(ex);
        }
    }
}
@@ -47,6 +47,7 @@ dependencies {

    implementation project(':code:features-search:screenshots')
    implementation project(':code:features-search:random-websites')
    implementation project(':code:features-search:feedlot-client')

    implementation libs.bundles.slf4j

@@ -1,10 +1,15 @@
package nu.marginalia.search;

import com.google.inject.AbstractModule;
import com.google.inject.Provides;
import nu.marginalia.LanguageModels;
import nu.marginalia.WebsiteUrl;
import nu.marginalia.WmsaHome;
import nu.marginalia.model.gson.GsonFactory;
import nu.marginalia.renderer.config.HandlebarsConfigurator;
import nu.marginalia.feedlot.FeedlotClient;

import java.time.Duration;

public class SearchModule extends AbstractModule {

@@ -17,4 +22,14 @@ public class SearchModule extends AbstractModule {
                System.getProperty("website-url", "https://search.marginalia.nu/")));
    }

    @Provides
    public FeedlotClient provideFeedlotClient() {
        return new FeedlotClient(
                System.getProperty("ext-svc-feedlot-host", "feedlot"),
                Integer.getInteger("ext-svc-feedlot-port", 80),
                GsonFactory.get(),
                Duration.ofMillis(250),
                Duration.ofMillis(100)
        );
    }
}
@@ -5,13 +5,17 @@ import nu.marginalia.assistant.client.AssistantClient;
import nu.marginalia.assistant.client.model.SimilarDomain;
import nu.marginalia.client.Context;
import nu.marginalia.db.DbDomainQueries;
import nu.marginalia.feedlot.model.FeedItems;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.renderer.MustacheRenderer;
import nu.marginalia.renderer.RendererFactory;
import nu.marginalia.search.SearchOperator;
import nu.marginalia.assistant.client.model.DomainInformation;
import nu.marginalia.feedlot.FeedlotClient;
import nu.marginalia.search.model.UrlDetails;
import nu.marginalia.search.svc.SearchFlagSiteService.FlagSiteFormData;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import spark.Request;
import spark.Response;

@@ -21,19 +25,23 @@ import java.util.List;
import java.util.Map;

public class SearchSiteInfoService {
    private static final Logger logger = LoggerFactory.getLogger(SearchSiteInfoService.class);

    private final SearchOperator searchOperator;
    private final AssistantClient assistantClient;
    private final SearchFlagSiteService flagSiteService;
    private final DbDomainQueries domainQueries;
    private final MustacheRenderer<Object> renderer;
    private final FeedlotClient feedlotClient;

    @Inject
    public SearchSiteInfoService(SearchOperator searchOperator,
                                 AssistantClient assistantClient,
                                 RendererFactory rendererFactory,
                                 SearchFlagSiteService flagSiteService,
                                 DbDomainQueries domainQueries) throws IOException {
                                 DbDomainQueries domainQueries,
                                 FeedlotClient feedlotClient) throws IOException
    {
        this.searchOperator = searchOperator;
        this.assistantClient = assistantClient;
        this.flagSiteService = flagSiteService;
@@ -41,6 +49,7 @@ public class SearchSiteInfoService {

        this.renderer = rendererFactory.renderer("search/site-info/site-info");

        this.feedlotClient = feedlotClient;
    }

    public Object handle(Request request, Response response) throws SQLException {
@@ -121,6 +130,7 @@ public class SearchSiteInfoService {
        final List<SimilarDomain> linkingDomains;
        String url = "https://" + domainName + "/";;

        var feedItemsFuture = feedlotClient.getFeedItems(domainName);
        if (domainId < 0 || !assistantClient.isAccepting()) {
            domainInfo = createDummySiteInfo(domainName);
            similarSet = List.of();
@@ -134,11 +144,18 @@ public class SearchSiteInfoService {
            linkingDomains = assistantClient
                    .linkedDomains(ctx, domainId, 100)
                    .blockingFirst();
        }

        List<UrlDetails> sampleResults = searchOperator.doSiteSearch(ctx, domainName, 1);
        if (!sampleResults.isEmpty()) {
            url = sampleResults.getFirst().url.withPathAndParam("/", null).toString();
        }
        List<UrlDetails> sampleResults = searchOperator.doSiteSearch(ctx, domainName, 5);
        if (!sampleResults.isEmpty()) {
            url = sampleResults.getFirst().url.withPathAndParam("/", null).toString();
        }

        FeedItems feedItems = null;
        try {
            feedItems = feedItemsFuture.get();
        } catch (Exception e) {
            logger.debug("Failed to get feed items for {}: {}", domainName, e.getMessage());
        }

        return new SiteInfoWithContext(domainName,
@@ -146,7 +163,9 @@ public class SearchSiteInfoService {
                url,
                domainInfo,
                similarSet,
                linkingDomains
                linkingDomains,
                feedItems,
                sampleResults
        );
    }

@@ -200,13 +219,18 @@ public class SearchSiteInfoService {
                                      String siteUrl,
                                      DomainInformation domainInformation,
                                      List<SimilarDomain> similar,
                                      List<SimilarDomain> linking) {
                                      List<SimilarDomain> linking,
                                      FeedItems feed,
                                      List<UrlDetails> samples
    ) {
        public SiteInfoWithContext(String domain,
                                   long domainId,
                                   String siteUrl,
                                   DomainInformation domainInformation,
                                   List<SimilarDomain> similar,
                                   List<SimilarDomain> linking
                                   List<SimilarDomain> linking,
                                   FeedItems feedInfo,
                                   List<UrlDetails> samples
        )
        {
            this(Map.of("info", true),
@@ -216,7 +240,9 @@ public class SearchSiteInfoService {
                    siteUrl,
                    domainInformation,
                    similar,
                    linking);
                    linking,
                    feedInfo,
                    samples);
        }

        public String getLayout() {
@@ -224,6 +250,12 @@ public class SearchSiteInfoService {
            if (similar.size() < 25) {
                return "lopsided";
            }
            else if (!feed.items().isEmpty()) {
                return "lopsided";
            }
            else if (!samples.isEmpty()) {
                return "lopsided";
            }
            else {
                return "balanced";
            }
@@ -0,0 +1,17 @@
<?xml version="1.0"?>
<!-- CC0 -->
<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
<svg version="1.1" id="Capa_1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink"
     viewBox="0 0 455.731 455.731" xml:space="preserve">
  <g>
    <rect x="0" y="0" style="fill:#F78422;" width="455.731" height="455.731"/>
    <g>
      <path style="fill:#FFFFFF;" d="M296.208,159.16C234.445,97.397,152.266,63.382,64.81,63.382v64.348
        c70.268,0,136.288,27.321,185.898,76.931c49.609,49.61,76.931,115.63,76.931,185.898h64.348
        C391.986,303.103,357.971,220.923,296.208,159.16z"/>
      <path style="fill:#FFFFFF;" d="M64.143,172.273v64.348c84.881,0,153.938,69.056,153.938,153.939h64.348
        C282.429,270.196,184.507,172.273,64.143,172.273z"/>
      <circle style="fill:#FFFFFF;" cx="109.833" cy="346.26" r="46.088"/>
    </g>
  </g>
</svg>
@@ -0,0 +1,20 @@
{{#with feed}}
<h2><a title="Atom/RSS feed" target="external" href="{{feedUrl}}"><img width="16" height="16" src="/rss.svg"></a> Feed (Experimental)</h2>

<dl>
    {{#each items}}
        <dt><a href="{{url}}" rel="external noopener ugc">{{title}}</a></dt>
        <dd><date>{{pubDay}}</date><br>{{{descriptionSafe}}}</dd>
    {{/each}}
</dl>
{{/with}}

{{#unless feed}}{{#if samples}}
<h2>Sample</h2>
<dl>
    {{#each samples}}
        <dt><a href="{{url}}" rel="external noopener ugc">{{title}}</a></dt>
        <dd>{{{description}}}</dd>
    {{/each}}
</dl>
{{/if}}{{/unless}}
@@ -12,11 +12,58 @@
        <img class="screenshot" width="300" height="225" src="/screenshot/{{domainId}}" alt="Screenshot of {{domain}}" />
    </a>
    {{#with domainInformation}}
        {{> search/site-info/site-info-feed}}
        {{> search/site-info/site-info-index}}
        {{> search/site-info/site-info-links}}
    {{/with}}
</div>

{{#if linking}}
    <div id="similar-links">
        <h2>Linking Domains</h2>

        <table class="similarity-table">
            <tr>
                <th colspan="3">Meta</th>
                <th>Rank</th>
                <th>Domain</th>
                <th>Similarity</th>
            </tr>
            {{#each linking}}
                <tr>
                    <td>
                        {{#if indexed}}
                            {{#if active}}
                                <span title="Indexed">👀</span>
                            {{/if}}
                            {{#unless active}}
                                <span title="Problem">🔥</span>
                            {{/unless}}
                        {{/if}}
                    </td>
                    <td>
                        {{#if screenshot}}📷{{/if}}
                    </td>
                    <td>
                        {{#if linkType.isLinked}}
                            <span title="{{linkType.description}}"><a href="/crosstalk/?domains={{domain}},{{url.domain}}">{{{linkType}}}</a></span>
                        {{/if}}
                    </td>
                    <td>
                        <span title="{{rank}}%">{{{rankSymbols}}}</span>
                    </td>
                    <td>
                        <a href="/site/{{url.domain}}?view=similar" rel="external noopener nofollow">{{url.domain}}</a></td>
                    <td>
                        <progress value="{{relatedness}}" max="100.0">{{relatedness}}</progress><br>
                    </td>
                </tr>
            {{/each}}
        </table>
    </div>
{{/if}}


{{#if similar}}
    <div id="similar-domains">
        <h2>Similar Domains</h2>
@@ -67,48 +114,4 @@
    </div>
{{/if}}

{{#if linking}}
    <div id="similar-links">
        <h2>Linking Domains</h2>

        <table class="similarity-table">
            <tr>
                <th colspan="3">Meta</th>
                <th>Rank</th>
                <th>Domain</th>
                <th>Similarity</th>
            </tr>
            {{#each linking}}
                <tr>
                    <td>
                        {{#if indexed}}
                            {{#if active}}
                                <span title="Indexed">👀</span>
                            {{/if}}
                            {{#unless active}}
                                <span title="Problem">🔥</span>
                            {{/unless}}
                        {{/if}}
                    </td>
                    <td>
                        {{#if screenshot}}📷{{/if}}
                    </td>
                    <td>
                        {{#if linkType.isLinked}}
                            <span title="{{linkType.description}}"><a href="/crosstalk/?domains={{domain}},{{url.domain}}">{{{linkType}}}</a></span>
                        {{/if}}
                    </td>
                    <td>
                        <span title="{{rank}}%">{{{rankSymbols}}}</span>
                    </td>
                    <td>
                        <a href="/site/{{url.domain}}?view=similar" rel="external noopener nofollow">{{url.domain}}</a></td>
                    <td>
                        <progress value="{{relatedness}}" max="100.0">{{relatedness}}</progress><br>
                    </td>
                </tr>
            {{/each}}
        </table>
    </div>
{{/if}}
</div>
@@ -28,6 +28,7 @@ include 'code:libraries:message-queue'

include 'code:features-search:screenshots'
include 'code:features-search:random-websites'
include 'code:features-search:feedlot-client'
include 'code:features-qs:query-parser'
include 'code:features-index:result-ranking'
