diff --git a/code/features-search/feedlot-client/build.gradle b/code/features-search/feedlot-client/build.gradle
new file mode 100644
index 00000000..808c9ca6
--- /dev/null
+++ b/code/features-search/feedlot-client/build.gradle
@@ -0,0 +1,22 @@
+plugins {
+    id 'java'
+    id 'jvm-test-suite'
+}
+
+java {
+    toolchain {
+        languageVersion.set(JavaLanguageVersion.of(21))
+    }
+}
+
+dependencies {
+    implementation libs.bundles.slf4j
+
+    implementation libs.notnull
+    implementation libs.gson
+
+    testImplementation libs.bundles.slf4j.test
+    testImplementation libs.bundles.junit
+    testImplementation libs.mockito
+
+}
\ No newline at end of file
diff --git a/code/features-search/feedlot-client/readme.md b/code/features-search/feedlot-client/readme.md
new file mode 100644
index 00000000..76fafff8
--- /dev/null
+++ b/code/features-search/feedlot-client/readme.md
@@ -0,0 +1,20 @@
+Client for [FeedlotTheFeedBot](https://github.com/MarginaliaSearch/FeedLotTheFeedBot),
+the RSS/Atom feed fetcher and cache for Marginalia Search.
+
+This service is external to the Marginalia Search codebase,
+as it is not a core part of the search engine and has other
+utilities.
+
+## Example
+
+```java
+
+import java.time.Duration;
+
+var client = new FeedlotClient("localhost", 8080,
+        gson,
+        Duration.ofMillis(100), // connect timeout
+        Duration.ofMillis(100)); // request timeout
+
+CompletableFuture<FeedItems> items = client.getFeedItems("www.marginalia.nu");
+```
\ No newline at end of file
diff --git a/code/features-search/feedlot-client/src/main/java/nu/marginalia/feedlot/FeedlotClient.java b/code/features-search/feedlot-client/src/main/java/nu/marginalia/feedlot/FeedlotClient.java
new file mode 100644
index 00000000..3392a8d2
--- /dev/null
+++ b/code/features-search/feedlot-client/src/main/java/nu/marginalia/feedlot/FeedlotClient.java
@@ -0,0 +1,58 @@
+package nu.marginalia.feedlot;
+
+import com.google.gson.Gson;
+import nu.marginalia.feedlot.model.FeedItems;
+
+import java.net.URI;
+import java.net.http.HttpClient;
+import java.net.http.HttpRequest;
+import java.net.http.HttpResponse;
+import java.time.Duration;
+import java.util.concurrent.Executors;
+import java.util.concurrent.CompletableFuture;
+
+public class FeedlotClient {
+    private final String feedlotHost;
+    private final int feedlotPort;
+    private final Gson gson;
+    private final HttpClient httpClient;
+    private final Duration requestTimeout;
+
+    public FeedlotClient(String feedlotHost,
+                         int feedlotPort,
+                         Gson gson,
+                         Duration connectTimeout,
+                         Duration requestTimeout
+                         )
+    {
+        this.feedlotHost = feedlotHost;
+        this.feedlotPort = feedlotPort;
+        this.gson = gson;
+
+        httpClient = HttpClient.newBuilder()
+                .executor(Executors.newVirtualThreadPerTaskExecutor())
+                .connectTimeout(connectTimeout)
+                .build();
+        this.requestTimeout = requestTimeout;
+    }
+
+    public CompletableFuture<FeedItems> getFeedItems(String domainName) {
+        return httpClient.sendAsync(
+                HttpRequest.newBuilder()
+                        .uri(URI.create("http://%s:%d/feed/%s".formatted(feedlotHost, feedlotPort, domainName)))
+                        .GET()
+                        .timeout(requestTimeout)
+                        .build(),
+                HttpResponse.BodyHandlers.ofString()
+        ).thenApply(HttpResponse::body)
+         .thenApply(this::parseFeedItems);
+    }
+
+    private FeedItems parseFeedItems(String s) {
+        return gson.fromJson(s, FeedItems.class);
+    }
+
+    public void stop() {
+        httpClient.close();
+    }
+}
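FeedlotClient does no mapping of its own beyond `gson.fromJson(s, FeedItems.class)`, so the wire format is simply JSON whose field names match the `FeedItems`/`FeedItem` record components introduced below. A minimal sketch of that mapping; the payload shape and values are an assumption inferred from the records, not taken from the Feedlot service itself:

```java
// Assumed payload; the field names mirror the FeedItems/FeedItem record components.
Gson gson = GsonFactory.get();   // the same Gson the search service later wires in

String body = """
        {
          "domain":  "www.marginalia.nu",
          "feedUrl": "https://www.marginalia.nu/log/index.xml",
          "updated": "2023-12-27T12:00:00Z",
          "items": [
            { "title": "Example post",
              "date": "2023-12-27T12:00:00Z",
              "description": "An <em>example</em> entry",
              "url": "https://www.marginalia.nu/log/example/" }
          ]
        }
        """;

FeedItems feed = gson.fromJson(body, FeedItems.class);
feed.items().getFirst().pubDay();            // "2023-12-27"
feed.items().getFirst().descriptionSafe();   // "An &lt;em&gt;example&lt;/em&gt; entry"
```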
diff --git a/code/features-search/feedlot-client/src/main/java/nu/marginalia/feedlot/model/FeedItem.java b/code/features-search/feedlot-client/src/main/java/nu/marginalia/feedlot/model/FeedItem.java
new file mode 100644
index 00000000..95ea8fe3
--- /dev/null
+++ b/code/features-search/feedlot-client/src/main/java/nu/marginalia/feedlot/model/FeedItem.java
@@ -0,0 +1,17 @@
+package nu.marginalia.feedlot.model;
+
+public record FeedItem(String title, String date, String description, String url) {
+
+    public String pubDay() { // Extract the date from an ISO style date string
+        if (date.length() > 10) {
+            return date.substring(0, 10);
+        }
+        return date;
+    }
+
+    public String descriptionSafe() {
+        return description
+                .replace("<", "&lt;")
+                .replace(">", "&gt;");
+    }
+}
diff --git a/code/features-search/feedlot-client/src/main/java/nu/marginalia/feedlot/model/FeedItems.java b/code/features-search/feedlot-client/src/main/java/nu/marginalia/feedlot/model/FeedItems.java
new file mode 100644
index 00000000..fcf06345
--- /dev/null
+++ b/code/features-search/feedlot-client/src/main/java/nu/marginalia/feedlot/model/FeedItems.java
@@ -0,0 +1,6 @@
+package nu.marginalia.feedlot.model;
+
+import java.util.List;
+
+public record FeedItems(String domain, String feedUrl, String updated, List<FeedItem> items) {
+}
diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerWarcResynchronizer.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerWarcResynchronizer.java
index e8050830..bfbcab14 100644
--- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerWarcResynchronizer.java
+++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerWarcResynchronizer.java
@@ -40,8 +40,8 @@ public class CrawlerWarcResynchronizer {
             for (var item : reader) {
                 accept(item);
             }
-        } catch (IOException e) {
-            logger.info(STR."Failed read full warc file \{tempFile}", e);
+        } catch (Exception e) {
+            logger.info(STR."(Expected) Failed read full warc file \{tempFile}: \{e.getClass().getSimpleName()} \{e.getMessage()}");
         }
 
         // Second pass, copy records to the new warc file
@@ -49,8 +49,8 @@
             for (var item : reader) {
                 recorder.resync(item);
             }
-        } catch (IOException e) {
-            logger.info(STR."Failed read full warc file \{tempFile}", e);
+        } catch (Exception e) {
+            logger.info(STR."(Expected) Failed read full warc file \{tempFile}: \{e.getClass().getSimpleName()} \{e.getMessage()}");
         }
     }
diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcRecorder.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcRecorder.java
index aad015d7..9bd14ab6 100644
--- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcRecorder.java
+++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcRecorder.java
@@ -37,7 +37,7 @@ public class WarcRecorder implements AutoCloseable {
     private final Path warcFile;
    private static final Logger logger = LoggerFactory.getLogger(WarcRecorder.class);
 
-    private final ThreadLocal<byte[]> bufferThreadLocal = ThreadLocal.withInitial(() -> new byte[MAX_SIZE]);
+    private final static ThreadLocal<byte[]> bufferThreadLocal = ThreadLocal.withInitial(() -> new byte[MAX_SIZE]);
 
     private boolean temporaryFile = false;
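The widened catch in CrawlerWarcResynchronizer reflects that a WARC cut short by a crash does not fail politely: the gzip and record-parsing layers can surface runtime exceptions as well as IOExceptions. A hedged sketch of the recover-what-you-can pattern the resynchronizer and the new tests below rely on; `simulateCrash` and `handle` are illustrative stand-ins, not names from the codebase:

```java
// Truncate the file the way the test's simulatePartialWrite() does, then read until parsing fails.
static void simulateCrash(Path warcFile) throws IOException {
    try (var raf = new RandomAccessFile(warcFile.toFile(), "rw")) {
        raf.setLength(raf.length() - 10);
    }
}

static void recoverWhatWeCan(Path warcFile) {
    try (var reader = new WarcReader(warcFile)) {      // org.netpreserve.jwarc
        for (var record : reader) {
            handle(record);    // e.g. re-add the URL to the crawl frontier, copy to a fresh WARC
        }
    } catch (Exception e) {    // a truncated tail is the expected way this loop ends
        System.out.println("(Expected) stopped reading " + warcFile + ": " + e.getMessage());
    }
}
```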
diff --git a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java
index c2dc70d4..d5cf0c9c 100644
--- a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java
+++ b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java
@@ -4,23 +4,24 @@ import lombok.SneakyThrows;
 import nu.marginalia.UserAgent;
 import nu.marginalia.WmsaHome;
 import nu.marginalia.atags.model.DomainLinks;
-import nu.marginalia.crawl.retreival.CrawlDataReference;
-import nu.marginalia.crawl.retreival.CrawlerRetreiver;
-import nu.marginalia.crawl.retreival.DomainProber;
+import nu.marginalia.crawl.retreival.*;
 import nu.marginalia.crawl.retreival.fetcher.HttpFetcher;
 import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl;
 import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
 import nu.marginalia.crawling.io.CrawledDomainReader;
+import nu.marginalia.crawling.io.SerializableCrawlDataStream;
 import nu.marginalia.crawling.model.CrawledDocument;
 import nu.marginalia.crawling.model.CrawledDomain;
 import nu.marginalia.crawling.model.SerializableCrawlData;
 import nu.marginalia.crawling.parquet.CrawledDocumentParquetRecordFileWriter;
-import nu.marginalia.io.crawlspec.CrawlSpecRecordParquetFileWriter;
+import nu.marginalia.model.EdgeDomain;
+import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.crawlspec.CrawlSpecRecord;
 import org.junit.jupiter.api.*;
 import org.netpreserve.jwarc.*;
 
 import java.io.IOException;
+import java.io.RandomAccessFile;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.util.*;
@@ -37,6 +38,7 @@ class CrawlerRetreiverTest {
     Path tempFileParquet1;
     Path tempFileWarc2;
     Path tempFileParquet2;
+    Path tempFileWarc3;
     @BeforeEach
     public void setUp() throws IOException {
         httpFetcher = new HttpFetcherImpl("search.marginalia.nu; testing a bit :D");
@@ -66,7 +68,11 @@
         if (tempFileParquet2 != null) {
             Files.deleteIfExists(tempFileParquet2);
         }
+        if (tempFileWarc3 != null) {
+            Files.deleteIfExists(tempFileWarc3);
+        }
     }
+
     @Test
     public void testWarcOutput() throws IOException {
         var specs = CrawlSpecRecord
@@ -79,11 +85,7 @@
         try {
             tempFile = Files.createTempFile("crawling-process", "warc");
 
-            try (var recorder = new WarcRecorder(tempFile)) {
-                new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder).fetch();
-            } catch (IOException ex) {
-                Assertions.fail(ex);
-            }
+            doCrawl(tempFile, specs);
 
             Set<String> requests = new HashSet<>();
             Set<String> responses = new HashSet<>();
@@ -112,6 +114,57 @@
             Files.deleteIfExists(tempFile);
         }
     }
+
+    @SneakyThrows
+    @Test
+    public void testResync() throws IOException {
+        var specs = CrawlSpecRecord
+                .builder()
+                .crawlDepth(5)
+                .domain("www.marginalia.nu")
+                .urls(List.of("https://www.marginalia.nu/misc/debian-laptop-install-log/"))
+                .build();
+        tempFileWarc1 = Files.createTempFile("crawling-process", "warc");
+        tempFileWarc2 = Files.createTempFile("crawling-process", "warc");
+
+        doCrawl(tempFileWarc1, specs);
+
+        Set<String> requests = new HashSet<>();
+        Set<String> responses = new HashSet<>();
+
+        var revisitCrawlFrontier = new DomainCrawlFrontier(
+                new EdgeDomain("www.marginalia.nu"),
+                List.of(), 100);
+        var resync = new CrawlerWarcResynchronizer(revisitCrawlFrontier,
+                new WarcRecorder(tempFileWarc2)
+        );
+
+        // truncate the size of the file to simulate a crash
+        simulatePartialWrite(tempFileWarc1);
+
+        resync.run(tempFileWarc1);
+        assertTrue(revisitCrawlFrontier.addKnown(new EdgeUrl("https://www.marginalia.nu/misc/debian-laptop-install-log/")));
+
+        try (var reader = new WarcReader(tempFileWarc2)) {
+            reader.forEach(record -> {
+                if (record instanceof WarcRequest req) {
+                    requests.add(req.target());
+                    System.out.println(req.type() + ":" + req.target());
+                }
+                else if (record instanceof WarcResponse rsp) {
+                    responses.add(rsp.target());
+                    System.out.println(rsp.type() + ":" + rsp.target());
+                }
+                else {
+                    System.out.println(record.type());
+                }
+            });
+        }
+
+        assertTrue(requests.contains("https://www.marginalia.nu/misc/debian-laptop-install-log/"));
+        assertEquals(requests, responses);
+    }
+
     @Test
     public void testWithKnownDomains() throws IOException {
         var specs = CrawlSpecRecord
@@ -125,15 +178,9 @@
 
         tempFileWarc1 = Files.createTempFile("crawling-process", ".warc");
 
-        try (var recorder = new WarcRecorder(tempFileWarc1)) {
-            new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder).fetch();
-        }
-        catch (IOException ex) {
-            Assertions.fail(ex);
-        }
+        doCrawl(tempFileWarc1, specs);
 
-        CrawledDocumentParquetRecordFileWriter.convertWarc("www.marginalia.nu",
-                new UserAgent("test"), tempFileWarc1, tempFileParquet1);
+        convertToParquet(tempFileWarc1, tempFileParquet1);
 
         try (var stream = CrawledDomainReader.createDataStream(tempFileParquet1)) {
             while (stream.hasNext()) {
@@ -177,16 +224,8 @@
 
         tempFileWarc1 = Files.createTempFile("crawling-process", ".warc");
 
-        try (var recorder = new WarcRecorder(tempFileWarc1)) {
-            new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder).fetch();
-        }
-        catch (IOException ex) {
-            Assertions.fail(ex);
-        }
-
-        CrawledDocumentParquetRecordFileWriter.convertWarc("www.marginalia.nu",
-                new UserAgent("test"), tempFileWarc1, tempFileParquet1);
-
+        doCrawl(tempFileWarc1, specs);
+        convertToParquet(tempFileWarc1, tempFileParquet1);
         try (var stream = CrawledDomainReader.createDataStream(tempFileParquet1)) {
             while (stream.hasNext()) {
@@ -232,46 +271,11 @@
         tempFileWarc1 = Files.createTempFile("crawling-process", ".warc.gz");
         tempFileWarc2 = Files.createTempFile("crawling-process", ".warc.gz");
 
-        Map<Class<? extends SerializableCrawlData>, List<SerializableCrawlData>> data = new HashMap<>();
-
-        try (var recorder = new WarcRecorder(tempFileWarc1)) {
-            new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder).fetch();
-        }
-        catch (IOException ex) {
-            Assertions.fail(ex);
-        }
-
-        CrawledDocumentParquetRecordFileWriter.convertWarc("www.marginalia.nu",
-                new UserAgent("test"), tempFileWarc1, tempFileParquet1);
-
-        try (var stream = CrawledDomainReader.createDataStream(tempFileParquet1)) {
-            while (stream.hasNext()) {
-                var doc = stream.next();
-                data.computeIfAbsent(doc.getClass(), c -> new ArrayList<>()).add(doc);
-            }
-        } catch (Exception e) {
-            throw new RuntimeException(e);
-        }
-
-
-        var stream = CrawledDomainReader.createDataStream(tempFileParquet1);
-
-        System.out.println("---");
-
-        CrawledDomain domain = (CrawledDomain) data.get(CrawledDomain.class).get(0);
-        domain.doc = data.get(CrawledDocument.class).stream().map(CrawledDocument.class::cast).collect(Collectors.toList());
-        try (var recorder = new WarcRecorder(tempFileWarc2)) {
-            new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder).fetch(new DomainLinks(),
-                    new CrawlDataReference(stream));
-        }
-        catch (IOException ex) {
-            Assertions.fail(ex);
-        }
-
-
-        CrawledDocumentParquetRecordFileWriter.convertWarc("www.marginalia.nu",
-                new UserAgent("test"), tempFileWarc2, tempFileParquet2);
-
+        doCrawl(tempFileWarc1, specs);
+        doCrawlWithReferenceStream(specs,
+                CrawledDomainReader.createDataStream(tempFileParquet1)
+        );
+        convertToParquet(tempFileWarc2, tempFileParquet2);
 
         try (var reader = new WarcReader(tempFileWarc2)) {
             WarcXResponseReference.register(reader);
@@ -304,4 +308,120 @@
             throw new RuntimeException(e);
         }
     }
+
+    private void convertToParquet(Path tempFileWarc2, Path tempFileParquet2) {
+        CrawledDocumentParquetRecordFileWriter.convertWarc("www.marginalia.nu",
+                new UserAgent("test"), tempFileWarc2, tempFileParquet2);
+    }
+
+
+    @SneakyThrows
+    @Test
+    public void testRecrawlWithResync() throws IOException {
+
+        var specs = CrawlSpecRecord
+                .builder()
+                .crawlDepth(12)
+                .domain("www.marginalia.nu")
+                .urls(List.of("https://www.marginalia.nu/some-dead-link"))
+                .build();
+
+
+        tempFileWarc1 = Files.createTempFile("crawling-process", ".warc.gz");
+        tempFileWarc2 = Files.createTempFile("crawling-process", ".warc.gz");
+        tempFileWarc3 = Files.createTempFile("crawling-process", ".warc.gz");
+
+        Map<Class<? extends SerializableCrawlData>, List<SerializableCrawlData>> data = new HashMap<>();
+
+        doCrawl(tempFileWarc1, specs);
+
+        convertToParquet(tempFileWarc1, tempFileParquet1);
+
+        try (var stream = CrawledDomainReader.createDataStream(tempFileParquet1)) {
+            while (stream.hasNext()) {
+                var doc = stream.next();
+                data.computeIfAbsent(doc.getClass(), c -> new ArrayList<>()).add(doc);
+            }
+        } catch (Exception e) {
+            throw new RuntimeException(e);
+        }
+
+        var stream = CrawledDomainReader.createDataStream(tempFileParquet1);
+
+        System.out.println("---");
+
+        doCrawlWithReferenceStream(specs, stream);
+
+        var revisitCrawlFrontier = new DomainCrawlFrontier(
+                new EdgeDomain("www.marginalia.nu"),
+                List.of(), 100);
+
+        var resync = new CrawlerWarcResynchronizer(revisitCrawlFrontier,
+                new WarcRecorder(tempFileWarc3)
+        );
+
+        // truncate the size of the file to simulate a crash
+        simulatePartialWrite(tempFileWarc2);
+
+        resync.run(tempFileWarc2);
+
+        assertTrue(revisitCrawlFrontier.addKnown(new EdgeUrl("https://www.marginalia.nu/")));
+        convertToParquet(tempFileWarc3, tempFileParquet2);
+
+
+        try (var reader = new WarcReader(tempFileWarc3)) {
+            WarcXResponseReference.register(reader);
+
+            reader.forEach(record -> {
+                if (record instanceof WarcResponse rsp) {
+                    try {
+                        System.out.println(rsp.type() + ":" + rsp.target() + "/" + rsp.http().status());
+                    } catch (IOException e) {
+                        throw new RuntimeException(e);
+                    }
+                }
+                if (record instanceof WarcMetadata rsp) {
+                    System.out.println("meta:" + rsp.target());
+                }
+            });
+        }
+
+        try (var ds = CrawledDomainReader.createDataStream(tempFileParquet2)) {
+            while (ds.hasNext()) {
+                var doc = ds.next();
+                if (doc instanceof CrawledDomain dr) {
+                    System.out.println(dr.domain + "/" + dr.crawlerStatus);
+                }
+                else if (doc instanceof CrawledDocument dc) {
+                    System.out.println(dc.url + "/" + dc.crawlerStatus + "/" + dc.httpStatus + "/" + dc.timestamp);
+                }
+            }
+        } catch (Exception e) {
+            throw new RuntimeException(e);
+        }
+    }
+
+    private void simulatePartialWrite(Path tempFileWarc2) throws IOException {
+        try (var raf = new RandomAccessFile(tempFileWarc2.toFile(), "rw")) {
+            raf.setLength(raf.length() - 10);
+        }
+    }
+
+    private void doCrawlWithReferenceStream(CrawlSpecRecord specs, SerializableCrawlDataStream stream) {
+        try (var recorder = new WarcRecorder(tempFileWarc2)) {
+            new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder).fetch(new DomainLinks(),
+                    new CrawlDataReference(stream));
+        }
+        catch (IOException ex) {
+            Assertions.fail(ex);
+        }
+    }
+
+    private void doCrawl(Path tempFileWarc1, CrawlSpecRecord specs) {
+        try (var recorder = new WarcRecorder(tempFileWarc1)) {
+            new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder).fetch();
+        } catch (IOException ex) {
+            Assertions.fail(ex);
+        }
+    }
 }
\ No newline at end of file
diff --git a/code/services-application/search-service/build.gradle b/code/services-application/search-service/build.gradle
index 805a7b34..ee504bcb 100644
--- a/code/services-application/search-service/build.gradle
+++ b/code/services-application/search-service/build.gradle
@@ -47,6 +47,7 @@
     implementation project(':code:features-search:screenshots')
     implementation project(':code:features-search:random-websites')
+    implementation project(':code:features-search:feedlot-client')
 
     implementation libs.bundles.slf4j
diff --git a/code/services-application/search-service/src/main/java/nu/marginalia/search/SearchModule.java b/code/services-application/search-service/src/main/java/nu/marginalia/search/SearchModule.java
index 090884ba..d832503c 100644
--- a/code/services-application/search-service/src/main/java/nu/marginalia/search/SearchModule.java
+++ b/code/services-application/search-service/src/main/java/nu/marginalia/search/SearchModule.java
@@ -1,10 +1,15 @@
 package nu.marginalia.search;
 
 import com.google.inject.AbstractModule;
+import com.google.inject.Provides;
 import nu.marginalia.LanguageModels;
 import nu.marginalia.WebsiteUrl;
 import nu.marginalia.WmsaHome;
+import nu.marginalia.model.gson.GsonFactory;
 import nu.marginalia.renderer.config.HandlebarsConfigurator;
+import nu.marginalia.feedlot.FeedlotClient;
+
+import java.time.Duration;
 
 public class SearchModule extends AbstractModule {
 
@@ -17,4 +22,14 @@
                 System.getProperty("website-url", "https://search.marginalia.nu/")));
     }
 
+    @Provides
+    public FeedlotClient provideFeedlotClient() {
+        return new FeedlotClient(
+                System.getProperty("ext-svc-feedlot-host", "feedlot"),
+                Integer.getInteger("ext-svc-feedlot-port", 80),
+                GsonFactory.get(),
+                Duration.ofMillis(250),
+                Duration.ofMillis(100)
+        );
+    }
 }
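The provider above reads plain system properties, so the defaults ("feedlot", port 80) can be overridden per environment without code changes. A hedged sketch, with property names taken from the diff and values illustrative only:

```java
// Equivalent to passing -Dext-svc-feedlot-host=localhost -Dext-svc-feedlot-port=8080 to the JVM;
// this has to run before the injector builds SearchModule's FeedlotClient.
System.setProperty("ext-svc-feedlot-host", "localhost");
System.setProperty("ext-svc-feedlot-port", "8080");
```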
diff --git a/code/services-application/search-service/src/main/java/nu/marginalia/search/svc/SearchSiteInfoService.java b/code/services-application/search-service/src/main/java/nu/marginalia/search/svc/SearchSiteInfoService.java
index badaaeed..290bef50 100644
--- a/code/services-application/search-service/src/main/java/nu/marginalia/search/svc/SearchSiteInfoService.java
+++ b/code/services-application/search-service/src/main/java/nu/marginalia/search/svc/SearchSiteInfoService.java
@@ -5,13 +5,17 @@
 import nu.marginalia.assistant.client.AssistantClient;
 import nu.marginalia.assistant.client.model.SimilarDomain;
 import nu.marginalia.client.Context;
 import nu.marginalia.db.DbDomainQueries;
+import nu.marginalia.feedlot.model.FeedItems;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.renderer.MustacheRenderer;
 import nu.marginalia.renderer.RendererFactory;
 import nu.marginalia.search.SearchOperator;
 import nu.marginalia.assistant.client.model.DomainInformation;
+import nu.marginalia.feedlot.FeedlotClient;
 import nu.marginalia.search.model.UrlDetails;
 import nu.marginalia.search.svc.SearchFlagSiteService.FlagSiteFormData;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 import spark.Request;
 import spark.Response;
 
@@ -21,19 +25,23 @@
 import java.util.List;
 import java.util.Map;
 
 public class SearchSiteInfoService {
+    private static final Logger logger = LoggerFactory.getLogger(SearchSiteInfoService.class);
 
     private final SearchOperator searchOperator;
     private final AssistantClient assistantClient;
     private final SearchFlagSiteService flagSiteService;
     private final DbDomainQueries domainQueries;
     private final MustacheRenderer renderer;
+    private final FeedlotClient feedlotClient;
 
     @Inject
     public SearchSiteInfoService(SearchOperator searchOperator,
                                  AssistantClient assistantClient,
                                  RendererFactory rendererFactory,
                                  SearchFlagSiteService flagSiteService,
-                                 DbDomainQueries domainQueries) throws IOException {
+                                 DbDomainQueries domainQueries,
+                                 FeedlotClient feedlotClient) throws IOException
+    {
         this.searchOperator = searchOperator;
         this.assistantClient = assistantClient;
         this.flagSiteService = flagSiteService;
@@ -41,6 +49,7 @@
 
         this.renderer = rendererFactory.renderer("search/site-info/site-info");
 
+        this.feedlotClient = feedlotClient;
     }
 
     public Object handle(Request request, Response response) throws SQLException {
@@ -121,6 +130,7 @@
         final List<SimilarDomain> linkingDomains;
         String url = "https://" + domainName + "/";;
 
+        var feedItemsFuture = feedlotClient.getFeedItems(domainName);
         if (domainId < 0 || !assistantClient.isAccepting()) {
             domainInfo = createDummySiteInfo(domainName);
             similarSet = List.of();
@@ -134,11 +144,18 @@
             linkingDomains = assistantClient
                     .linkedDomains(ctx, domainId, 100)
                     .blockingFirst();
+        }
 
-            List<UrlDetails> sampleResults = searchOperator.doSiteSearch(ctx, domainName, 1);
-            if (!sampleResults.isEmpty()) {
-                url = sampleResults.getFirst().url.withPathAndParam("/", null).toString();
-            }
+        List<UrlDetails> sampleResults = searchOperator.doSiteSearch(ctx, domainName, 5);
+        if (!sampleResults.isEmpty()) {
+            url = sampleResults.getFirst().url.withPathAndParam("/", null).toString();
+        }
+
+        FeedItems feedItems = null;
+        try {
+            feedItems = feedItemsFuture.get();
+        } catch (Exception e) {
+            logger.debug("Failed to get feed items for {}: {}", domainName, e.getMessage());
         }
 
         return new SiteInfoWithContext(domainName,
@@ -146,7 +163,9 @@
                 url,
                 domainInfo,
                 similarSet,
-                linkingDomains
+                linkingDomains,
+                feedItems,
+                sampleResults
         );
     }
 
@@ -200,13 +219,18 @@
                                    String siteUrl,
                                    DomainInformation domainInformation,
                                    List<SimilarDomain> similar,
-                                   List<SimilarDomain> linking) {
+                                   List<SimilarDomain> linking,
+                                   FeedItems feed,
+                                   List<UrlDetails> samples
+                                   ) {
         public SiteInfoWithContext(String domain,
                                    long domainId,
                                    String siteUrl,
                                    DomainInformation domainInformation,
                                    List<SimilarDomain> similar,
-                                   List<SimilarDomain> linking
+                                   List<SimilarDomain> linking,
+                                   FeedItems feedInfo,
+                                   List<UrlDetails> samples
                                    ) {
             this(Map.of("info", true),
@@ -216,7 +240,9 @@
                     siteUrl,
                     domainInformation,
                     similar,
-                    linking);
+                    linking,
+                    feedInfo,
+                    samples);
         }
 
         public String getLayout() {
@@ -224,6 +250,12 @@
             if (similar.size() < 25) {
                 return "lopsided";
             }
+            else if (!feed.items().isEmpty()) {
+                return "lopsided";
+            }
+            else if (!samples.isEmpty()) {
+                return "lopsided";
+            }
             else {
                 return "balanced";
             }
diff --git a/code/services-application/search-service/src/main/resources/static/search/rss.svg b/code/services-application/search-service/src/main/resources/static/search/rss.svg
new file mode 100644
index 00000000..2c01c8b3
--- /dev/null
+++ b/code/services-application/search-service/src/main/resources/static/search/rss.svg
@@ -0,0 +1,17 @@
+    [17 lines of SVG markup for an RSS feed icon; the markup itself did not survive extraction]
\ No newline at end of file
diff --git a/code/services-application/search-service/src/main/resources/templates/search/site-info/site-info-feed.hdb b/code/services-application/search-service/src/main/resources/templates/search/site-info/site-info-feed.hdb
new file mode 100644
index 00000000..f458e380
--- /dev/null
+++ b/code/services-application/search-service/src/main/resources/templates/search/site-info/site-info-feed.hdb
@@ -0,0 +1,20 @@
    [The HTML markup of this new 20-line template did not survive extraction; only the Handlebars structure and headings below are recoverable.]
+{{#with feed}}
+    Feed (Experimental)
+    {{#each items}}
+        {{title}}
+        {{pubDay}}
+        {{{descriptionSafe}}}
+    {{/each}}
+{{/with}}
+
+{{#unless feed}}{{#if samples}}
+    Sample
+    {{#each samples}}
+        {{title}}
+        {{{description}}}
+    {{/each}}
+{{/if}}{{/unless}}
\ No newline at end of file
diff --git a/code/services-application/search-service/src/main/resources/templates/search/site-info/site-info-summary.hdb b/code/services-application/search-service/src/main/resources/templates/search/site-info/site-info-summary.hdb
index fd1c7590..fba7adad 100644
--- a/code/services-application/search-service/src/main/resources/templates/search/site-info/site-info-summary.hdb
+++ b/code/services-application/search-service/src/main/resources/templates/search/site-info/site-info-summary.hdb
@@ -12,11 +12,58 @@ Screenshot of {{domain}}
    [The HTML markup in this hunk did not survive extraction; the recoverable Handlebars structure is shown.]
     {{#with domainInformation}}
+    {{> search/site-info/site-info-feed}}
     {{> search/site-info/site-info-index}}
     {{> search/site-info/site-info-links}}
     {{/with}}
 
+    {{#if linking}}
+        [linking-domains listing; markup lost]
+    {{/if}}
+
     {{#if similar}}
         Similar Domains
@@ -67,48 +114,4 @@
     {{/if}}
-    {{#if linking}}
-        [linking-domains listing; markup lost]
-    {{/if}}
\ No newline at end of file
diff --git a/settings.gradle b/settings.gradle
index 42ae0f47..dbc0c855 100644
--- a/settings.gradle
+++ b/settings.gradle
@@ -28,6 +28,7 @@ include 'code:libraries:message-queue'
 
 include 'code:features-search:screenshots'
 include 'code:features-search:random-websites'
+include 'code:features-search:feedlot-client'
 
 include 'code:features-qs:query-parser'
 include 'code:features-index:result-ranking'
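Taken together, the pieces above wire Feedlot into the site-info page: the Guice provider builds the client, SearchSiteInfoService fetches items asynchronously, and the new Handlebars partial renders them. A hedged end-to-end sketch of the same flow outside the service; host, port, and timeouts mirror the provider's defaults, and the loop and error handling are illustrative only:

```java
var client = new FeedlotClient("feedlot", 80, GsonFactory.get(),
        Duration.ofMillis(250),    // connect timeout
        Duration.ofMillis(100));   // request timeout
try {
    FeedItems feed = client.getFeedItems("www.marginalia.nu").get();
    for (FeedItem item : feed.items()) {
        System.out.println(item.pubDay() + "  " + item.title());
    }
} catch (Exception e) {
    // a missing or slow feed is treated as non-fatal, mirroring SearchSiteInfoService
    System.err.println("no feed: " + e.getMessage());
} finally {
    client.stop();
}
```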