Serve assets from search service instead of resource-store,

dynamically render index for future goodies,
css tweaks.
This commit is contained in:
vlofgren 2022-08-24 00:35:22 +02:00
parent db4cf70784
commit ee0580273e
10 changed files with 255 additions and 33 deletions

View File

@ -3,7 +3,7 @@ server {
listen [::]:80;
server_name nginx;
location /search {
location / {
if ( $request_method = POST ) {
return 444;
}
@ -14,12 +14,7 @@ server {
proxy_set_header X-Extern-Domain $scheme://$host;
proxy_set_header X-User-Agent $http_user_agent;
proxy_pass http://edge-search:5023/public/search;
tcp_nodelay on;
}
location / {
proxy_pass http://edge-search:5023/;
proxy_pass http://edge-search:5023/public/;
tcp_nodelay on;
}
}

View File

@ -0,0 +1,40 @@
package nu.marginalia.wmsa.edge.search.command;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.wmsa.edge.data.dao.EdgeDataStoreDao;
import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklist;
import nu.marginalia.wmsa.edge.search.model.BrowseResultSet;
import nu.marginalia.wmsa.edge.search.results.BrowseResultCleaner;
import nu.marginalia.wmsa.renderer.mustache.MustacheRenderer;
import nu.marginalia.wmsa.renderer.mustache.RendererFactory;
import spark.Request;
import spark.Response;
import java.io.IOException;
/**
 * Renders the dynamically generated search index page from the
 * {@code edge/index} template.
 *
 * <p>Each render asks the data store for a handful of random domains,
 * filters them through the {@link BrowseResultCleaner} predicate, and hands
 * at most one surviving entry to the template.</p>
 */
@Singleton
public class IndexCommand {
    private final EdgeDataStoreDao dataStoreDao;
    private final BrowseResultCleaner browseResultCleaner;
    private final MustacheRenderer<BrowseResultSet> template;
    private final EdgeDomainBlacklist blacklist;

    /**
     * @param dataStoreDao        source of the random domains
     * @param rendererFactory     used once to load the {@code edge/index} template
     * @param browseResultCleaner supplies the predicate for discarding results
     * @param blacklist           passed through to the random-domain lookup
     * @throws IOException propagated from the renderer factory when the template is loaded
     */
    @Inject
    public IndexCommand(EdgeDataStoreDao dataStoreDao,
                        RendererFactory rendererFactory,
                        BrowseResultCleaner browseResultCleaner,
                        EdgeDomainBlacklist blacklist) throws IOException {
        this.dataStoreDao = dataStoreDao;
        this.blacklist = blacklist;
        this.browseResultCleaner = browseResultCleaner;
        this.template = rendererFactory.renderer("edge/index");
    }

    /**
     * Produces the index page markup and marks the response as publicly
     * cacheable for one hour.
     *
     * @param request  the incoming request (not inspected here)
     * @param response the response that receives the cache header
     * @return the rendered page
     */
    public String render(Request request, Response response) {
        response.header("Cache-control", "public,max-age=3600");

        // Request several candidates so that at least one is likely to
        // survive the cleaner; only the first survivor is shown.
        // NOTE(review): the meaning of the trailing 0 argument is not visible here — confirm against EdgeDataStoreDao.
        var candidates = dataStoreDao.getRandomDomains(5, blacklist, 0);
        candidates.removeIf(browseResultCleaner.shouldRemoveResultPredicate());

        var featured = candidates.stream().limit(1).toList();
        return template.render(new BrowseResultSet(featured));
    }
}

View File

@ -22,7 +22,6 @@ import spark.Spark;
import spark.resource.ClassPathResource;
import spark.staticfiles.MimeType;
import java.io.FileNotFoundException;
import java.net.URLEncoder;
import java.time.LocalDateTime;
import java.time.ZoneOffset;
@ -35,6 +34,7 @@ public class ResourceStoreService extends Service {
private final AuthClient authClient;
private final ResourceEntityStore resourceStore;
private StaticResources staticResources;
@Inject
public ResourceStoreService(@Named("service-host") String ip,
@ -42,11 +42,13 @@ public class ResourceStoreService extends Service {
AuthClient authClient,
ResourceEntityStore resourceStore,
Initialization initialization,
MetricsServer metricsServer
MetricsServer metricsServer,
StaticResources staticResources
) {
super(ip, port, initialization, metricsServer);
this.authClient = authClient;
this.resourceStore = resourceStore;
this.staticResources = staticResources;
Schedulers.io().schedulePeriodicallyDirect(resourceStore::reapStaleResources,
5, 5, TimeUnit.MINUTES);
@ -109,12 +111,9 @@ public class ResourceStoreService extends Service {
return serveDynamic(data, request, response);
}
else if (serveStatic(domain + "/" + resource, request, response)) {
logger.info("getResource({}/{}, static)", domain, resource);
}
else {
logger.info("Could not serve {}/{}", domain, resource);
Spark.halt(404, "Not Found");
logger.info("getResource({}/{}, static)", domain, resource);
staticResources.serveStatic(domain, resource, request, response);
}
return "";
}
@ -138,19 +137,7 @@ public class ResourceStoreService extends Service {
return data.data;
}
@SneakyThrows
private boolean serveStatic(String path, Request req, Response rsp) {
try {
ClassPathResource resource = new ClassPathResource("static/" + path);
handleEtagStatic(resource, req, rsp);
resource.getInputStream().transferTo(rsp.raw().getOutputStream());
}
catch (IllegalArgumentException|FileNotFoundException ex) {
return false;
}
return true;
}
@SneakyThrows
private void handleEtag(RenderedResource page, Request req, Response rsp) {

View File

@ -0,0 +1,46 @@
package nu.marginalia.wmsa.resource_store;
import lombok.SneakyThrows;
import spark.Request;
import spark.Response;
import spark.Spark;
import spark.resource.ClassPathResource;
import spark.staticfiles.MimeType;
import java.io.FileNotFoundException;
import java.time.LocalDateTime;
import java.time.ZoneOffset;
/**
 * Serves files bundled on the classpath under {@code static/<domain>/<path>},
 * with a one hour public cache lifetime and ETag-based revalidation.
 */
public class StaticResources {
    /**
     * Epoch second captured when this instance was created. It is mixed into
     * every ETag, so tags issued by a previous instance no longer match.
     */
    private final long startTime = LocalDateTime.now().toEpochSecond(ZoneOffset.UTC);

    /**
     * Writes the requested classpath resource to the response body.
     *
     * <p>Halts with 304 when the client's {@code If-None-Match} header equals
     * the current ETag, and with 404 when the resource cannot be opened or
     * the path is rejected.</p>
     *
     * @param domain first path segment below {@code static/}
     * @param path   resource path below the domain directory
     * @param req    request, consulted for {@code If-None-Match}
     * @param rsp    response that receives headers and the resource bytes
     */
    @SneakyThrows
    public void serveStatic(String domain, String path, Request req, Response rsp) {
        try {
            ClassPathResource resource = new ClassPathResource("static/" + domain + "/" + path);
            handleEtagStatic(resource, req, rsp);

            // transferTo() does not close its source, so close the resource
            // stream explicitly to avoid leaking a handle per request.
            try (var in = resource.getInputStream()) {
                in.transferTo(rsp.raw().getOutputStream());
            }
        }
        catch (IllegalArgumentException | FileNotFoundException ex) {
            // NOTE(review): IllegalArgumentException presumably comes from ClassPathResource
            // rejecting an invalid path — confirm against the Spark version in use.
            Spark.halt(404);
        }
    }

    /**
     * Sets cache, content-type and ETag headers, short-circuiting with a 304
     * when the client already holds the current ETag.
     */
    @SneakyThrows
    private void handleEtagStatic(ClassPathResource resource, Request req, Response rsp) {
        rsp.header("Cache-Control", "public,max-age=3600");
        rsp.type(MimeType.fromResource(resource));

        final String etag = staticResourceEtag(resource.getFilename());

        if (etag.equals(req.headers("If-None-Match"))) {
            Spark.halt(304);
        }

        rsp.header("ETag", etag);
    }

    /**
     * Builds a quoted ETag from the hash code of the given name and the
     * instance start time.
     */
    private String staticResourceEtag(String resource) {
        return "\"" + resource.hashCode() + "-" + startTime + "\"";
    }
}

View File

@ -7,6 +7,12 @@ body {
background-color: #f8f8ee;
}
.rightbox {
float: right;
display: block;
max-width: 40ch;
clear: both;
}
.sticker {
ruby-position: under;
@ -70,6 +76,9 @@ ul.semantic-results a {
article > section > p { display: none; }
.cards.big .card { flex-grow: 1 }
.cards.big { padding-right: 1ch; }
.w3m-helper {
display: none;
}
@ -296,6 +305,7 @@ select {
}
footer {
clear: both;
padding: 2ch;
margin: 16ch 0px 0px 0px;
background-color: #acae89;
@ -337,7 +347,7 @@ a.underline {
}
@media only screen and (max-device-width: 1024px) {
.rightbox { width: 30ch !important; }
.card {
margin-right: 2ch;
}
@ -355,6 +365,7 @@ a.underline {
}
@media only screen and (max-device-width: 800px) {
.rightbox { display: none; }
.search-box {
flex-direction: column;
}

View File

@ -0,0 +1,12 @@
<section class="card browse-result rightbox">
<h2>{{url.domain}}</h2>
<a href="{{url.proto}}://{{url.domain}}/">
<img src="/screenshot/{{domainId}}" title="{{description}}" loading="lazy"/>
</a>
<div class="utils">
<a href="/site/{{url.domain}}">Info</a>
<a href="/explore/{{url.domain}}">Similar Domains</a>
</div>
</section>

View File

@ -0,0 +1,130 @@
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<title>Marginalia Search</title>
<link rel="stylesheet" href="/style-new.css" />
<link rel="search" type="application/opensearchdescription+xml" href="/opensearch.xml" title="Marginalia">
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<meta name="robots" content="noindex" />
<meta property="og:description" content="search.marginalia.nu is a small independent do-it-yourself search engine for surprising but content-rich websites that never ask you to accept cookies or subscribe to newsletters. The goal is to bring you the sort of grass fed, free range HTML your grandma used to write. " />
<meta property="og:locale" content="en_US" />
<meta property="og:site_name" content="search.marginalia.nu" />
<meta property="og:type" content="website" />
<meta property="og:url" content="https://search.marginalia.nu/" />
</head>
<body>
<!-- Hi there, fellow human being :-) -->
{{>edge/parts/search-header}}
<article>
{{>edge/parts/search-form}}
<section class="card rightbox">
<h2>Publicity, Discussion and Events</h2>
<div class="info">
<dl>
<dt><a href="https://www.deutschlandfunkkultur.de/google-suche-100.html">Kritik an Googles Suche - Platzhirsch auf dem Nebenschauplatz</a></dt>
<dd>Deutschlandfunk Kultur &#x1f1e9;&#x1f1ea;, 2022-08-18</dd>
<dt><a href="https://news.ycombinator.com/item?id=31536626" rel="nofollow">Marginalia Goes Open Source</a></dt>
<dd>Hacker News, 2022-05-28</dd>
<dt><a href="https://www.youtube.com/watch?v=rTSEr0cRJY8" rel="nofollow">You Should Check Out the Indie Web</a> &#x1F39E;&#xFE0F;</dt>
<dd>YouTube, You've Got Kat, 2022-03-15 </dd>
<dt>
<a href="https://www.newyorker.com/culture/infinite-scroll/what-google-search-isnt-showing-you" rel="nofollow">What Google Search Isn't Showing You</a>
</dt>
<dd>The New Yorker &#127913;, 2022-03-10</dd>
<dt>
<a href="https://www.metafilter.com/194653/Marginalia-Search-Serendipity-Engineering" rel="nofollow">Marginalia Search - Serendipity Engineering</a>
</dt>
<dd>MetaFilter, 2022-03-09</dd>
<dt>
&#127874; <a href="https://memex.marginalia.nu/log/49-marginalia-1-year.gmi">First anniversary</a>! &#127882;
</dt>
<dd>
2022-02-26
</dd>
<dt>
<a href="https://onezero.medium.com/a-search-engine-designed-to-surprise-you-b81944ed5c06" rel="nofollow">A Search Engine Designed To Surprise You</a>
</dt>
<dd>Clive Thompson OneZero, 2021-09-16</dd>
<dt>
<a href="https://news.ycombinator.com/item?id=28550764" rel="nofollow"> A search engine that favors text-heavy sites and punishes modern web design</a>
</dt>
<dd>
Hacker News, 2021-09-16
</dd>
</dl>
</div>
</section>
<div class="cards big">
<section class="card">
<h2>About</h2>
<div class="info">
<p>This is an independent DIY search engine that focuses on non-commercial content, and attempts to
show you sites you perhaps weren't aware of in favor of the sort of sites you probably already knew
existed. </p>
<p>
The software for this search engine is all custom-built, and all crawling and indexing is
done in-house. The project is open source. Feel free to poke about in the <a
href="https://git.marginalia.nu/marginalia/marginalia.nu">source code</a> or contribute
to the development!
</p>
<p>Consider <a href="https://memex.marginalia.nu/projects/edge/supporting.gmi">supporting the
project</a>!</p>
</div>
<div class="utils">
<a href="https://memex.marginalia.nu/projects/edge/about.gmi">Read More</a>
</div>
</section>
<section class="card">
<h2>Tips</h2>
<div class="info">
<p>
This search engine isn't particularly well equipped to answer queries
posed like questions; instead, try to imagine some text that might appear
in the website you are looking for, and search for that.</p>
<p>
Where this search engine really shines is finding small, old and obscure websites about some
given topic, perhaps
<a href="https://search.marginalia.nu/search?query=commander+keen&profile=yolo&js=default">old video games</a>,
<a href="https://search.marginalia.nu/search?query=voynich+&profile=yolo&js=default">a mystery</a>,
<a href="https://search.marginalia.nu/search?query=augustine+confessions&profile=yolo&js=default">theology</a>,
<a href="https://search.marginalia.nu/search?query=Hermes+Trismegistus&profile=yolo&js=default">the occult</a>,
<a href="https://search.marginalia.nu/search?query=knitting&profile=yolo&js=default">knitting</a>,
<a href="https://search.marginalia.nu/search?query=scc+graph+algorithm&profile=yolo&js=default">computer science</a>,
or <a href="https://search.marginalia.nu/search?query=salvador+dali&profile=yolo&js=default">art</a>.
</p>
</div>
<div class="utils">
<a href="https://memex.marginalia.nu/projects/edge/search-tips.gmi">Additional Tips</a>
</div>
</section>
<section class="card">
<h2>Updates</h2>
<div class="info">
<p>☛ A recipe filter has been added to the algorithm selector.</p>
<p>☛ The <a href="https://search.marginalia.nu/explore/random">Random Mode</a> has been overhauled, and is
quite entertaining. I encourage you to give it a spin. </p>
<p>☛ A simple <a href="https://api.marginalia.nu/">public API</a> is now available.</p>
</div>
<div class="utils">
<a href="https://memex.marginalia.nu/projects/edge/changelog.gmi">Change Log</a>
</div>
</section>
</div>
</article>
{{>edge/parts/search-footer}}
</body>

View File

@ -18,7 +18,9 @@
<article>
{{>edge/parts/search-form}}
<hr class="w3m-helper" />
{{#each domainResults}}{{>edge/browse-result-rb}}{{/each}}
<section class="cards">
{{#if maintenanceMessage}}<section class="card problems onlyscreen"><h2>Maintenance</h2><p class="description">{{maintenanceMessage}}</p></section>{{/if}}
{{#if evalResult}}<section class="card semantic onlyscreen"><h2>Evaluation</h2><p class="description">{{query}} = {{evalResult}}</p><hr class="w3m-helper" /></section>{{/if}}
@ -37,7 +39,6 @@
</section>
{{/if}}
{{#each domainResults}}{{>edge/browse-result}}{{/each}}
{{#each results}}{{>edge/search-result}}{{/each}}
{{#unless evalResult}}{{#if problems}}<section class="card problems onlyscreen"><h2>Suggestions</h2><ul class="onlyscreen search-problems">{{#each problems}}<li>{{{.}}}</li>{{/each}}</ul></section> {{/if}}{{/unless}}

View File

@ -3,6 +3,7 @@ package nu.marginalia.wmsa.edge.crawling;
import lombok.SneakyThrows;
import nu.marginalia.wmsa.edge.crawling.retreival.HttpFetcher;
import nu.marginalia.wmsa.edge.crawling.retreival.HttpRedirectResolver;
import nu.marginalia.wmsa.edge.crawling.retreival.RateLimitException;
import nu.marginalia.wmsa.edge.crawling.retreival.logic.ContentTypeLogic;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import org.junit.jupiter.api.Assertions;
@ -27,14 +28,14 @@ class HttpFetcherTest {
}
@Test
void fetchUTF8() throws URISyntaxException {
void fetchUTF8() throws URISyntaxException, RateLimitException {
var fetcher = new HttpFetcher("nu.marginalia.edge-crawler");
var str = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu"));
System.out.println(str.contentType);
}
@Test
void fetchText() throws URISyntaxException {
void fetchText() throws URISyntaxException, RateLimitException {
var fetcher = new HttpFetcher("nu.marginalia.edge-crawler");
var str = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu/robots.txt"));
System.out.println(str);

View File

@ -14,7 +14,6 @@ import org.slf4j.LoggerFactory;
import spark.Spark;
import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.time.LocalDateTime;
@ -41,7 +40,7 @@ class ResourceStoreServiceTest {
tempDir = Files.createTempDirectory("ResourceStoreServiceTest");
resourceStore = new ResourceEntityStore(tempDir);
service = new ResourceStoreService("127.0.0.1", testPort, null,
resourceStore, new Initialization(), null);
resourceStore, new Initialization(), null, new StaticResources());
Spark.awaitInitialization();
}