diff --git a/marginalia_nu/src/e2e/resources/nginx/search.conf b/marginalia_nu/src/e2e/resources/nginx/search.conf index edbd5e40..341889c1 100644 --- a/marginalia_nu/src/e2e/resources/nginx/search.conf +++ b/marginalia_nu/src/e2e/resources/nginx/search.conf @@ -3,7 +3,7 @@ server { listen [::]:80; server_name nginx; - location /search { + location / { if ( $request_method = POST ) { return 444; } @@ -14,12 +14,7 @@ server { proxy_set_header X-Extern-Domain $scheme://$host; proxy_set_header X-User-Agent $http_user_agent; - proxy_pass http://edge-search:5023/public/search; - tcp_nodelay on; - } - - location / { - proxy_pass http://edge-search:5023/; + proxy_pass http://edge-search:5023/public/; tcp_nodelay on; } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/FeatureExtractor.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/FeatureExtractor.java index 440f21ac..9f20a65f 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/FeatureExtractor.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/FeatureExtractor.java @@ -5,6 +5,8 @@ import com.google.inject.Singleton; import nu.marginalia.wmsa.edge.converting.processor.logic.topic.AdblockSimulator; import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain; import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; import java.util.HashSet; import java.util.List; @@ -41,14 +43,17 @@ public class FeatureExtractor { } public Set getFeatures(CrawledDomain domain, Document doc) { - Set features = new HashSet<>(); + final Set features = new HashSet<>(); - var scriptTags = doc.getElementsByTag("script"); + final Elements scriptTags = doc.getElementsByTag("script"); - if (scriptTags.size() > 0) { - features.add(HtmlFeature.JS); + for (var scriptTag : scriptTags) { + if (isJavascriptTag(scriptTag)) { + 
features.add(HtmlFeature.JS); + } } - else if(adblockSimulator.hasAds(doc.clone())) { // Only look for ads if there is javascript + + if (features.contains(HtmlFeature.JS) && adblockSimulator.hasAds(doc.clone())) { features.add(HtmlFeature.ADVERTISEMENT); } @@ -58,20 +63,22 @@ public class FeatureExtractor { features.add(HtmlFeature.MEDIA); } - if (scriptTags.stream() - .anyMatch(tag -> trackers.stream().anyMatch(tracker -> tag.attr("src").contains(tracker)))) { - features.add(HtmlFeature.TRACKING); + for (var scriptTag : scriptTags) { + if (hasTrackingScript(scriptTag)) { + features.add(HtmlFeature.TRACKING); + break; + } } if (scriptTags.html().contains("google-analytics.com")) { features.add(HtmlFeature.TRACKING); } - if (doc.getElementsByTag("a").stream().map(e -> e.attr("href")) - .map(String::toLowerCase) - .anyMatch(href -> - href.contains("amzn.to/") || (href.contains("amazon.com/") & href.contains("tag=")))) { - features.add(HtmlFeature.AFFILIATE_LINK); + for (var aTag : doc.getElementsByTag("a")) { + if (isAmazonAffiliateLink(aTag)) { + features.add(HtmlFeature.AFFILIATE_LINK); + break; + } } if (!domain.cookies.isEmpty()) { @@ -80,4 +87,34 @@ public class FeatureExtractor { return features; } + + private boolean hasTrackingScript(Element scriptTag) { + for (var tracker : trackers) { + if (scriptTag.attr("src").contains(tracker)) { + return true; + } + } + return false; + } + + private boolean isJavascriptTag(Element scriptTag) { + final String type = scriptTag.attr("type"); + + if ("application/ld+json".equalsIgnoreCase(type)) { + return false; + } + + return true; + } + + boolean isAmazonAffiliateLink(Element aTag) { + final String href = aTag.attr("href").toLowerCase(); + + if (href.contains("amzn.to/")) + return true; + if (href.contains("amazon.com/") && href.contains("tag=")) + return true; + + return false; + } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlerMain.java 
b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlerMain.java index f6c9a5b6..dbb2dcfc 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlerMain.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlerMain.java @@ -8,6 +8,7 @@ import nu.marginalia.wmsa.edge.crawling.model.CrawlingSpecification; import nu.marginalia.wmsa.edge.crawling.retreival.CrawlerRetreiver; import nu.marginalia.wmsa.edge.crawling.retreival.HttpFetcher; import nu.marginalia.wmsa.edge.model.EdgeCrawlPlan; +import okhttp3.ConnectionPool; import okhttp3.Dispatcher; import okhttp3.internal.Util; import org.slf4j.Logger; @@ -25,12 +26,14 @@ public class CrawlerMain implements AutoCloseable { private final WorkLog workLog; + private final ConnectionPool connectionPool = new ConnectionPool(5, 10, TimeUnit.SECONDS); + private final Dispatcher dispatcher = new Dispatcher(new ThreadPoolExecutor(0, Integer.MAX_VALUE, 5, TimeUnit.SECONDS, new SynchronousQueue<>(), Util.threadFactory("OkHttp Dispatcher", true))); private final UserAgent userAgent; private final ThreadPoolExecutor pool; - final int poolSize = 256; + final int poolSize = 512; final int poolQueueSize = 32; public CrawlerMain(EdgeCrawlPlan plan) throws Exception { @@ -67,9 +70,10 @@ public class CrawlerMain implements AutoCloseable { if (workLog.isJobFinished(specification.id)) return; - var fetcher = new HttpFetcher(userAgent.uaString(), dispatcher); - try (var writer = new CrawledDomainWriter(crawlDataDir, specification.domain, specification.id)) { + HttpFetcher fetcher = new HttpFetcher(userAgent.uaString(), dispatcher, connectionPool); + try (CrawledDomainWriter writer = new CrawledDomainWriter(crawlDataDir, specification.domain, specification.id)) + { var retreiver = new CrawlerRetreiver(fetcher, specification, writer); int size = retreiver.fetch(); @@ -92,7 +96,6 @@ public class CrawlerMain implements AutoCloseable { AbortMonitor abortMonitor = AbortMonitor.getInstance(); 
- Semaphore taskSem = new Semaphore(poolSize); plan.forEachCrawlingSpecification(spec -> { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/CrawlerRetreiver.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/CrawlerRetreiver.java index 786df800..14716fbf 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/CrawlerRetreiver.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/CrawlerRetreiver.java @@ -17,7 +17,6 @@ import org.slf4j.LoggerFactory; import java.io.IOException; import java.net.InetAddress; -import java.net.URISyntaxException; import java.net.UnknownHostException; import java.time.LocalDateTime; import java.util.ArrayList; @@ -25,12 +24,19 @@ import java.util.HashSet; import java.util.LinkedList; import java.util.Optional; +import static java.lang.Math.max; +import static java.lang.Math.min; + public class CrawlerRetreiver { - private static final long DEFAULT_CRAWL_DELAY_MS = Long.getLong("defaultCrawlDelay", 1000); + private static final long DEFAULT_CRAWL_DELAY_MIN_MS = Long.getLong("defaultCrawlDelay", 250); + private static final long DEFAULT_CRAWL_DELAY_MAX_MS = Long.getLong("defaultCrawlDelaySlow", 2500); + private final LinkedList queue = new LinkedList<>(); private final HttpFetcher fetcher; - private final HashSet visited; - private final HashSet known; + + private final HashSet visited; + private final HashSet known; + private boolean slowDown = false; private final int depth; private final String id; @@ -64,15 +70,13 @@ public class CrawlerRetreiver { crawledDomainWriter = writer; for (String urlStr : specs.urls) { - EdgeUrl.parse(urlStr) - .filter(known::add) - .ifPresent(queue::addLast); + EdgeUrl.parse(urlStr).ifPresent(this::addToQueue); } if (queue.peek() != null) { var fst = queue.peek(); var root = fst.domain.toRootUrl(); - if (known.add(root)) + if (known.add(root.toString())) queue.addFirst(root); } } @@ -147,7 
+151,7 @@ public class CrawlerRetreiver { continue; if (top.toString().length() > 255) continue; - if (!visited.add(top)) + if (!visited.add(top.toString())) continue; if (fetchDocument(top, crawlDelay)) { @@ -172,9 +176,7 @@ public class CrawlerRetreiver { crawledDomainWriter.accept(d); if (d.url != null) { - try { - visited.add(new EdgeUrl(d.url)); - } catch (URISyntaxException ex) {} + EdgeUrl.parse(d.url).map(EdgeUrl::toString).ifPresent(visited::add); } } @@ -192,8 +194,7 @@ public class CrawlerRetreiver { private Optional fetchUrl(EdgeUrl top) { try { - - var doc = fetcher.fetchContent(top); + var doc = fetchContent(top); if (doc.documentBody != null) { @@ -217,6 +218,24 @@ public class CrawlerRetreiver { } + @SneakyThrows + private CrawledDocument fetchContent(EdgeUrl top) { + for (int i = 0; i < 2; i++) { + try { + return fetcher.fetchContent(top); + } + catch (RateLimitException ex) { + slowDown = true; + int delay = ex.retryAfter(); + if (delay > 0 && delay < 5000) { + Thread.sleep(delay); + } + } + } + + return createRetryError(top); + } + private String createHash(String documentBodyHash) { return hashMethod.hashUnencodedChars(documentBodyHash).toString(); } @@ -235,28 +254,29 @@ public class CrawlerRetreiver { baseUrl = linkParser.getBaseLink(parsed, baseUrl); for (var link : parsed.getElementsByTag("a")) { - linkParser.parseLink(baseUrl, link) - .filter(this::isSameDomain) - .filter(u -> !urlBlocklist.isUrlBlocked(u)) - .filter(u -> !urlBlocklist.isMailingListLink(u)) - .filter(known::add) - .ifPresent(queue::addLast); + linkParser.parseLink(baseUrl, link).ifPresent(this::addToQueue); } for (var link : parsed.getElementsByTag("frame")) { - linkParser.parseFrame(baseUrl, link) - .filter(this::isSameDomain) - .filter(u -> !urlBlocklist.isUrlBlocked(u)) - .filter(u -> !urlBlocklist.isMailingListLink(u)) - .filter(known::add) - .ifPresent(queue::addLast); + linkParser.parseFrame(baseUrl, link).ifPresent(this::addToQueue); } for (var link : 
parsed.getElementsByTag("iframe")) { - linkParser.parseFrame(baseUrl, link) - .filter(this::isSameDomain) - .filter(u -> !urlBlocklist.isUrlBlocked(u)) - .filter(u -> !urlBlocklist.isMailingListLink(u)) - .filter(known::add) - .ifPresent(queue::addLast); + linkParser.parseFrame(baseUrl, link).ifPresent(this::addToQueue); + } + } + + private void addToQueue(EdgeUrl url) { + if (!isSameDomain(url)) + return; + if (urlBlocklist.isUrlBlocked(url)) + return; + if (urlBlocklist.isMailingListLink(url)) + return; + // reduce memory usage by not growing queue huge when crawling large sites + if (queue.size() + visited.size() >= depth + 100) + return; + + if (known.add(url.toString())) { + queue.addLast(url); } } @@ -284,13 +304,24 @@ public class CrawlerRetreiver { if (spentTime > sleepTime) return; - Thread.sleep(Math.min(sleepTime-spentTime, 5000)); + Thread.sleep(min(sleepTime-spentTime, 5000)); + } + else if (slowDown) { + Thread.sleep( 1000); } else { - if (spentTime > DEFAULT_CRAWL_DELAY_MS) + // When no crawl delay is specified, lean toward twice the fetch+process time, + // within sane limits. This means slower servers get slower crawling, and faster + // servers get faster crawling. 
+ + sleepTime = spentTime * 2; + sleepTime = min(sleepTime, DEFAULT_CRAWL_DELAY_MAX_MS); + sleepTime = max(sleepTime, DEFAULT_CRAWL_DELAY_MIN_MS); + + if (spentTime > sleepTime) return; - Thread.sleep(DEFAULT_CRAWL_DELAY_MS - spentTime); + Thread.sleep(sleepTime-spentTime); } } @@ -302,7 +333,14 @@ public class CrawlerRetreiver { .crawlerStatus(CrawlerDocumentStatus.ROBOTS_TXT.name()) .build(); } - + private CrawledDocument createRetryError(EdgeUrl url) { + return CrawledDocument.builder() + .url(url.toString()) + .timestamp(LocalDateTime.now().toString()) + .httpStatus(429) + .crawlerStatus(CrawlerDocumentStatus.ERROR.name()) + .build(); + } private CrawledDomain createErrorPostFromStatus(HttpFetcher.FetchResult ret) { String ip = findIp(domain); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/FastTerminatingSocketFactory.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/FastTerminatingSocketFactory.java new file mode 100644 index 00000000..f9a889b1 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/FastTerminatingSocketFactory.java @@ -0,0 +1,51 @@ +package nu.marginalia.wmsa.edge.crawling.retreival; + +import javax.net.SocketFactory; +import java.io.IOException; +import java.net.InetAddress; +import java.net.Socket; + +public class FastTerminatingSocketFactory extends SocketFactory { + private static final SocketFactory delegate = SocketFactory.getDefault(); + + private void configure(Socket sock) throws IOException { + // Setting SO_LINGER to enabled but low reduces TIME_WAIT + // which can get pretty... 
bad when you're crawling + // and opening thousands of connections + sock.setSoLinger(true, 3); + } + + public Socket createSocket() throws IOException { + var sock = delegate.createSocket(); + configure(sock); + return sock; + } + + @Override + public Socket createSocket(String host, int port) throws IOException { + var sock = delegate.createSocket(host, port); + configure(sock); + return sock; + } + + @Override + public Socket createSocket(String host, int port, InetAddress localHost, int localPort) throws IOException { + var sock = delegate.createSocket(host, port, localHost, localPort); + configure(sock); + return sock; + } + + @Override + public Socket createSocket(InetAddress host, int port) throws IOException { + var sock = delegate.createSocket(host, port); + configure(sock); + return sock; + } + + @Override + public Socket createSocket(InetAddress address, int port, InetAddress localAddress, int localPort) throws IOException { + var sock = delegate.createSocket(address, port, localAddress, localPort); + configure(sock); + return sock; + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/HttpFetcher.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/HttpFetcher.java index cb198e21..e7ec01b2 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/HttpFetcher.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/HttpFetcher.java @@ -13,10 +13,7 @@ import nu.marginalia.wmsa.edge.crawling.retreival.logic.ContentTypeLogic; import nu.marginalia.wmsa.edge.crawling.retreival.logic.ContentTypeParser; import nu.marginalia.wmsa.edge.model.EdgeDomain; import nu.marginalia.wmsa.edge.model.EdgeUrl; -import okhttp3.Dispatcher; -import okhttp3.OkHttpClient; -import okhttp3.Request; -import okhttp3.Response; +import okhttp3.*; import org.apache.commons.io.input.BOMInputStream; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -66,15 +63,18 @@ 
public class HttpFetcher { } } + private static final FastTerminatingSocketFactory ftSocketFactory = new FastTerminatingSocketFactory(); @SneakyThrows - private OkHttpClient createClient(Dispatcher dispatcher) { + private OkHttpClient createClient(Dispatcher dispatcher, ConnectionPool pool) { var builder = new OkHttpClient.Builder(); if (dispatcher != null) { builder.dispatcher(dispatcher); } return builder.sslSocketFactory(NoSecuritySSL.buildSocketFactory(), (X509TrustManager) NoSecuritySSL.trustAllCerts[0]) + .socketFactory(ftSocketFactory) .hostnameVerifier(NoSecuritySSL.buildHostnameVerifyer()) + .connectionPool(pool) .cookieJar(cookies.getJar()) .followRedirects(true) .followSslRedirects(true) @@ -82,6 +82,7 @@ public class HttpFetcher { .readTimeout(10, TimeUnit.SECONDS) .writeTimeout(10, TimeUnit.SECONDS) .build(); + } public List getCookies() { @@ -93,13 +94,13 @@ public class HttpFetcher { } @Inject - public HttpFetcher(@Named("user-agent") String userAgent, Dispatcher dispatcher) { - this.client = createClient(dispatcher); + public HttpFetcher(@Named("user-agent") String userAgent, Dispatcher dispatcher, ConnectionPool connectionPool) { + this.client = createClient(dispatcher, connectionPool); this.userAgent = userAgent; } public HttpFetcher(@Named("user-agent") String userAgent) { - this.client = createClient(null); + this.client = createClient(null, new ConnectionPool()); this.userAgent = userAgent; } @@ -141,7 +142,7 @@ public class HttpFetcher { } @SneakyThrows - public CrawledDocument fetchContent(EdgeUrl url) { + public CrawledDocument fetchContent(EdgeUrl url) throws RateLimitException { if (contentTypeLogic.isUrlLikeBinary(url)) { logger.debug("Probing suspected binary {}", url); @@ -192,13 +193,17 @@ public class HttpFetcher { .build(); } - private CrawledDocument extractBody(EdgeUrl url, Response rsp) throws IOException, URISyntaxException { + private CrawledDocument extractBody(EdgeUrl url, Response rsp) throws IOException, URISyntaxException, 
RateLimitException { var responseUrl = new EdgeUrl(rsp.request().url().toString()); if (!Objects.equals(responseUrl.domain, url.domain)) { return createRedirectResponse(url, rsp, responseUrl); } + if (rsp.code() == 429) { + throw new RateLimitException(rsp.header("Retry-After", "1")); /* Retry-After is in seconds; assume 1 s when the header is absent */ + } + var body = rsp.body(); if (null == body) { return createErrorResponse(url, rsp, CrawlerDocumentStatus.ERROR, "No body"); @@ -258,8 +263,6 @@ public class HttpFetcher { } - - public SimpleRobotRules fetchRobotRules(EdgeDomain domain) { return fetchRobotsForProto("https", domain) .or(() -> fetchRobotsForProto("http", domain)) @@ -282,4 +285,5 @@ public class HttpFetcher { doc.contentType, userAgent); } + } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/RateLimitException.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/RateLimitException.java new file mode 100644 index 00000000..ac28dca9 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/RateLimitException.java @@ -0,0 +1,41 @@ +package nu.marginalia.wmsa.edge.crawling.retreival; + +/** + * Signals that a server answered with HTTP 429 and carries the raw Retry-After header value. + */ +public class RateLimitException extends Exception { + private static final int DEFAULT_RETRY_AFTER_MS = 1000; + + private final String retryAfter; + + /** + * @param retryAfter raw Retry-After header value; may be null or non-numeric + */ + public RateLimitException(String retryAfter) { + /* Stack trace capture is disabled: getStackTrace() stays empty and no trace is ever filled in. */ + super(null, null, true, false); + this.retryAfter = retryAfter; + } + + /** + * Returns the suggested wait in milliseconds. + * Retry-After carries whole seconds (RFC 9110), so a numeric value is scaled by 1000 + * and capped to fit an int. A missing, negative or non-numeric value (such as the + * HTTP-date form) yields the 1000 ms default. + */ + public int retryAfter() { + if (retryAfter == null) { + return DEFAULT_RETRY_AFTER_MS; + } + try { + final long seconds = Long.parseLong(retryAfter.trim()); + if (seconds < 0) { + return DEFAULT_RETRY_AFTER_MS; + } + return (int) Math.min(seconds, Integer.MAX_VALUE / 1000) * 1000; + } + catch (NumberFormatException ex) { + return DEFAULT_RETRY_AFTER_MS; + } + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/IndexCommand.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/IndexCommand.java new file mode 100644 index 00000000..91fe9c7e --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/IndexCommand.java @@ -0,0 +1,40 @@ +package nu.marginalia.wmsa.edge.search.command; + +import com.google.inject.Inject;
+import com.google.inject.Singleton; +import nu.marginalia.wmsa.edge.data.dao.EdgeDataStoreDao; +import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklist; +import nu.marginalia.wmsa.edge.search.model.BrowseResultSet; +import nu.marginalia.wmsa.edge.search.results.BrowseResultCleaner; +import nu.marginalia.wmsa.renderer.mustache.MustacheRenderer; +import nu.marginalia.wmsa.renderer.mustache.RendererFactory; +import spark.Request; +import spark.Response; + +import java.io.IOException; + +@Singleton +public class IndexCommand { + + private final EdgeDataStoreDao dataStoreDao; + private final BrowseResultCleaner browseResultCleaner; + private final MustacheRenderer template; + private final EdgeDomainBlacklist blacklist; + @Inject + public IndexCommand(EdgeDataStoreDao dataStoreDao, RendererFactory rendererFactory, BrowseResultCleaner browseResultCleaner, EdgeDomainBlacklist blacklist) throws IOException { + this.dataStoreDao = dataStoreDao; + this.browseResultCleaner = browseResultCleaner; + + template = rendererFactory.renderer("edge/index"); + this.blacklist = blacklist; + } + + public String render(Request request, Response response) { + response.header("Cache-control", "public,max-age=3600"); + + var results = dataStoreDao.getRandomDomains(5, blacklist, 0); + results.removeIf(browseResultCleaner.shouldRemoveResultPredicate()); + + return template.render(new BrowseResultSet(results.stream().limit(1).toList())); + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/resource_store/ResourceStoreService.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/resource_store/ResourceStoreService.java index aa1f0aa7..e71c3c54 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/resource_store/ResourceStoreService.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/resource_store/ResourceStoreService.java @@ -22,7 +22,6 @@ import spark.Spark; import spark.resource.ClassPathResource; import spark.staticfiles.MimeType; -import 
java.io.FileNotFoundException; import java.net.URLEncoder; import java.time.LocalDateTime; import java.time.ZoneOffset; @@ -35,6 +34,7 @@ public class ResourceStoreService extends Service { private final AuthClient authClient; private final ResourceEntityStore resourceStore; + private StaticResources staticResources; @Inject public ResourceStoreService(@Named("service-host") String ip, @@ -42,11 +42,13 @@ public class ResourceStoreService extends Service { AuthClient authClient, ResourceEntityStore resourceStore, Initialization initialization, - MetricsServer metricsServer + MetricsServer metricsServer, + StaticResources staticResources ) { super(ip, port, initialization, metricsServer); this.authClient = authClient; this.resourceStore = resourceStore; + this.staticResources = staticResources; Schedulers.io().schedulePeriodicallyDirect(resourceStore::reapStaleResources, 5, 5, TimeUnit.MINUTES); @@ -109,12 +111,9 @@ public class ResourceStoreService extends Service { return serveDynamic(data, request, response); } - else if (serveStatic(domain + "/" + resource, request, response)) { - logger.info("getResource({}/{}, static)", domain, resource); - } else { - logger.info("Could not serve {}/{}", domain, resource); - Spark.halt(404, "Not Found"); + logger.info("getResource({}/{}, static)", domain, resource); + staticResources.serveStatic(domain, resource, request, response); } return ""; } @@ -138,19 +137,7 @@ public class ResourceStoreService extends Service { return data.data; } - @SneakyThrows - private boolean serveStatic(String path, Request req, Response rsp) { - try { - ClassPathResource resource = new ClassPathResource("static/" + path); - handleEtagStatic(resource, req, rsp); - resource.getInputStream().transferTo(rsp.raw().getOutputStream()); - } - catch (IllegalArgumentException|FileNotFoundException ex) { - return false; - } - return true; - } @SneakyThrows private void handleEtag(RenderedResource page, Request req, Response rsp) { diff --git 
a/marginalia_nu/src/main/java/nu/marginalia/wmsa/resource_store/StaticResources.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/resource_store/StaticResources.java new file mode 100644 index 00000000..a3c2f756 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/resource_store/StaticResources.java @@ -0,0 +1,62 @@ +package nu.marginalia.wmsa.resource_store; + +import lombok.SneakyThrows; +import spark.Request; +import spark.Response; +import spark.Spark; +import spark.resource.ClassPathResource; +import spark.staticfiles.MimeType; + +import java.io.FileNotFoundException; +import java.time.LocalDateTime; +import java.time.ZoneOffset; + +/** + * Serves classpath resources located under static/ and answers conditional requests via ETags. + */ +public class StaticResources { + /* Instance creation time in epoch seconds; it is part of every ETag this instance hands out. */ + private final long startTime = LocalDateTime.now().toEpochSecond(ZoneOffset.UTC); + + /** + * Copies the classpath resource static/{domain}/{path} to the response, halting with 404 + * when an IllegalArgumentException or FileNotFoundException is raised while resolving or opening it. + */ + @SneakyThrows + public void serveStatic(String domain, String path, Request req, Response rsp) { + try { + ClassPathResource resource = new ClassPathResource("static/" + domain + "/" + path); + /* Open the stream before touching headers, and close it even when a halt or I/O error interrupts the copy. */ + try (var in = resource.getInputStream()) { + handleEtagStatic(resource, req, rsp); + in.transferTo(rsp.raw().getOutputStream()); + } + } + catch (IllegalArgumentException | FileNotFoundException ex) { + Spark.halt(404); + } + } + + /** + * Sets the Cache-Control, content type and ETag headers, halting with 304 + * when the request's If-None-Match header equals the current ETag. + */ + @SneakyThrows + private void handleEtagStatic(ClassPathResource resource, Request req, Response rsp) { + rsp.header("Cache-Control", "public,max-age=3600"); + rsp.type(MimeType.fromResource(resource)); + + final String etag = staticResourceEtag(resource.getFilename()); + + if (etag.equals(req.headers("If-None-Match"))) { + Spark.halt(304); + } + + rsp.header("ETag", etag); + } + + /** Builds a quoted ETag from the hash code of the given resource name and the instance start time. */ + private String staticResourceEtag(String resource) { + return "\"" + resource.hashCode() + "-" + startTime + "\""; + } +} diff --git a/marginalia_nu/src/main/resources/static/edge/style-new.css b/marginalia_nu/src/main/resources/static/edge/style-new.css index fbe7cfb5..2d7f9acd 100644 --- a/marginalia_nu/src/main/resources/static/edge/style-new.css +++ b/marginalia_nu/src/main/resources/static/edge/style-new.css @@ -7,6 +7,12
@@ body { background-color: #f8f8ee; } +.rightbox { + float: right; + display: block; + max-width: 40ch; + clear: both; +} .sticker { ruby-position: under; @@ -70,6 +76,9 @@ ul.semantic-results a { article > section > p { display: none; } +.cards.big .card { flex-grow: 1 } +.cards.big { padding-right: 1ch; } + .w3m-helper { display: none; } @@ -296,6 +305,7 @@ select { } footer { + clear: both; padding: 2ch; margin: 16ch 0px 0px 0px; background-color: #acae89; @@ -337,7 +347,7 @@ a.underline { } @media only screen and (max-device-width: 1024px) { - + .rightbox { width: 30ch !important; } .card { margin-right: 2ch; } @@ -355,6 +365,7 @@ a.underline { } @media only screen and (max-device-width: 800px) { + .rightbox { display: none; } .search-box { flex-direction: column; } diff --git a/marginalia_nu/src/main/resources/templates/edge/browse-result-rb.hdb b/marginalia_nu/src/main/resources/templates/edge/browse-result-rb.hdb new file mode 100644 index 00000000..0a0beb8f --- /dev/null +++ b/marginalia_nu/src/main/resources/templates/edge/browse-result-rb.hdb @@ -0,0 +1,12 @@ +
+

{{url.domain}}

+ + + + + + +
\ No newline at end of file diff --git a/marginalia_nu/src/main/resources/templates/edge/index.hdb b/marginalia_nu/src/main/resources/templates/edge/index.hdb new file mode 100644 index 00000000..c725054d --- /dev/null +++ b/marginalia_nu/src/main/resources/templates/edge/index.hdb @@ -0,0 +1,130 @@ + + + + + Marginalia Search + + + + + + + + + + + + + + + + + +{{>edge/parts/search-header}} + +
+{{>edge/parts/search-form}} + +
+

Publicity, Discussion and Events

+
+
+
Kritik an Googles Suche - Platzhirsch auf dem Nebenschauplatz
+
Deutschlandfunk Kultur 🇩🇪, 2022-08-18
+
Marginalia Goes Open Source
+
Hacker News, 2022-05-28
+
You Should Check Out the Indie Web 🎞️
+
YouTube, You've Got Kat, 2022-03-15
+
+ What Google Search Isn't Showing You +
+
The New Yorker 🎩, 2022-03-10
+
+ Marginalia Search - Serendipity Engineering +
+
MetaFilter, 2022-03-09
+
+ 🎂 First anniversary! 🎊 +
+
+ 2022-02-26 +
+
+ A Search Engine Designed To Surprise You +
+
Clive Thompson OneZero, 2021-09-16
+
+ A search engine that favors text-heavy sites and punishes modern web design +
+
+ Hacker News, 2021-09-16 +
+
+
+
+ +
+
+

About

+
+

This is an independent DIY search engine that focuses on non-commercial content, and attempts to + show you sites you perhaps weren't aware of in favor of the sort of sites you probably already knew + existed.

+

+ The software for this search engine is all custom-built, and all crawling and indexing is + done in-house. The project is open source. Feel free to poke about in the source code or contribute + to the development! +

+

Consider supporting the + project!

+
+
+ Read More +
+
+ +
+

Tips

+
+

+ This search engine isn't particularly well equipped to answer queries + posed as questions; instead, try to imagine some text that might appear + in the website you are looking for, and search for that.

+

+ Where this search engine really shines is finding small, old and obscure websites about some + given topic, perhaps + old video games, + a mystery, + theology, + the occult, + knitting, + computer science, + or art. +

+ +
+ +
+ + +
+

Updates

+
+

☛ A recipe filter has been added to the algorithm selector.

+

☛ The Random Mode has been overhauled, and is + quite entertaining. I encourage you to give it a spin.

+

☛ A simple public API is now available.

+
+ +
+ +
+
+ +{{>edge/parts/search-footer}} + diff --git a/marginalia_nu/src/main/resources/templates/edge/search-results.hdb b/marginalia_nu/src/main/resources/templates/edge/search-results.hdb index 19688a48..0572c779 100644 --- a/marginalia_nu/src/main/resources/templates/edge/search-results.hdb +++ b/marginalia_nu/src/main/resources/templates/edge/search-results.hdb @@ -18,7 +18,9 @@
{{>edge/parts/search-form}} -
+ +{{#each domainResults}}{{>edge/browse-result-rb}}{{/each}} +
{{#if maintenanceMessage}}

Maintenance

{{maintenanceMessage}}

{{/if}} {{#if evalResult}}

Evaluation

{{query}} = {{evalResult}}


{{/if}} @@ -37,7 +39,6 @@
{{/if}} - {{#each domainResults}}{{>edge/browse-result}}{{/each}} {{#each results}}{{>edge/search-result}}{{/each}} {{#unless evalResult}}{{#if problems}}

Suggestions

    {{#each problems}}
  • {{{.}}}
  • {{/each}}
{{/if}}{{/unless}} diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/HttpFetcherTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/HttpFetcherTest.java index 1fba5fe3..a59726d6 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/HttpFetcherTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/HttpFetcherTest.java @@ -3,6 +3,7 @@ package nu.marginalia.wmsa.edge.crawling; import lombok.SneakyThrows; import nu.marginalia.wmsa.edge.crawling.retreival.HttpFetcher; import nu.marginalia.wmsa.edge.crawling.retreival.HttpRedirectResolver; +import nu.marginalia.wmsa.edge.crawling.retreival.RateLimitException; import nu.marginalia.wmsa.edge.crawling.retreival.logic.ContentTypeLogic; import nu.marginalia.wmsa.edge.model.EdgeUrl; import org.junit.jupiter.api.Assertions; @@ -27,14 +28,14 @@ class HttpFetcherTest { } @Test - void fetchUTF8() throws URISyntaxException { + void fetchUTF8() throws URISyntaxException, RateLimitException { var fetcher = new HttpFetcher("nu.marginalia.edge-crawler"); var str = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu")); System.out.println(str.contentType); } @Test - void fetchText() throws URISyntaxException { + void fetchText() throws URISyntaxException, RateLimitException { var fetcher = new HttpFetcher("nu.marginalia.edge-crawler"); var str = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu/robots.txt")); System.out.println(str); diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/resource_store/ResourceStoreServiceTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/resource_store/ResourceStoreServiceTest.java index 78ae2215..4325cff6 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/resource_store/ResourceStoreServiceTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/resource_store/ResourceStoreServiceTest.java @@ -14,7 +14,6 @@ import org.slf4j.LoggerFactory; import spark.Spark; import 
java.io.File; -import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; import java.time.LocalDateTime; @@ -41,7 +40,7 @@ class ResourceStoreServiceTest { tempDir = Files.createTempDirectory("ResourceStoreServiceTest"); resourceStore = new ResourceEntityStore(tempDir); service = new ResourceStoreService("127.0.0.1", testPort, null, - resourceStore, new Initialization(), null); + resourceStore, new Initialization(), null, new StaticResources()); Spark.awaitInitialization(); }