Merge pull request 'master' (#99) from master into release

Reviewed-on: https://git.marginalia.nu/marginalia/marginalia.nu/pulls/99
This commit is contained in:
Viktor Lofgren 2022-08-24 00:39:10 +02:00
commit 2f8cab7f0e
16 changed files with 474 additions and 98 deletions

View File

@ -3,7 +3,7 @@ server {
listen [::]:80;
server_name nginx;
location /search {
location / {
if ( $request_method = POST ) {
return 444;
}
@ -14,12 +14,7 @@ server {
proxy_set_header X-Extern-Domain $scheme://$host;
proxy_set_header X-User-Agent $http_user_agent;
proxy_pass http://edge-search:5023/public/search;
tcp_nodelay on;
}
location / {
proxy_pass http://edge-search:5023/;
proxy_pass http://edge-search:5023/public/;
tcp_nodelay on;
}
}

View File

@ -5,6 +5,8 @@ import com.google.inject.Singleton;
import nu.marginalia.wmsa.edge.converting.processor.logic.topic.AdblockSimulator;
import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.util.HashSet;
import java.util.List;
@ -41,14 +43,17 @@ public class FeatureExtractor {
}
public Set<HtmlFeature> getFeatures(CrawledDomain domain, Document doc) {
Set<HtmlFeature> features = new HashSet<>();
final Set<HtmlFeature> features = new HashSet<>();
var scriptTags = doc.getElementsByTag("script");
final Elements scriptTags = doc.getElementsByTag("script");
if (scriptTags.size() > 0) {
features.add(HtmlFeature.JS);
for (var scriptTag : scriptTags) {
if (isJavascriptTag(scriptTag)) {
features.add(HtmlFeature.JS);
}
}
else if(adblockSimulator.hasAds(doc.clone())) { // Only look for ads if there is javascript
if (features.contains(HtmlFeature.JS) && adblockSimulator.hasAds(doc.clone())) {
features.add(HtmlFeature.ADVERTISEMENT);
}
@ -58,20 +63,22 @@ public class FeatureExtractor {
features.add(HtmlFeature.MEDIA);
}
if (scriptTags.stream()
.anyMatch(tag -> trackers.stream().anyMatch(tracker -> tag.attr("src").contains(tracker)))) {
features.add(HtmlFeature.TRACKING);
for (var scriptTag : scriptTags) {
if (hasTrackingScript(scriptTag)) {
features.add(HtmlFeature.TRACKING);
break;
}
}
if (scriptTags.html().contains("google-analytics.com")) {
features.add(HtmlFeature.TRACKING);
}
if (doc.getElementsByTag("a").stream().map(e -> e.attr("href"))
.map(String::toLowerCase)
.anyMatch(href ->
href.contains("amzn.to/") || (href.contains("amazon.com/") & href.contains("tag=")))) {
features.add(HtmlFeature.AFFILIATE_LINK);
for (var aTag : doc.getElementsByTag("a")) {
if (isAmazonAffiliateLink(aTag)) {
features.add(HtmlFeature.AFFILIATE_LINK);
break;
}
}
if (!domain.cookies.isEmpty()) {
@ -80,4 +87,34 @@ public class FeatureExtractor {
return features;
}
/** True if the script tag's src attribute references any known tracker domain. */
private boolean hasTrackingScript(Element scriptTag) {
    final String src = scriptTag.attr("src");
    return trackers.stream().anyMatch(src::contains);
}
/** True unless the script tag is an ld+json metadata blob, which is data rather
 *  than executable javascript. */
private boolean isJavascriptTag(Element scriptTag) {
    return !"application/ld+json".equalsIgnoreCase(scriptTag.attr("type"));
}
/** True if the anchor's href looks like an Amazon affiliate link, either via
 *  the amzn.to shortener or a long-form amazon.com link carrying a tag= parameter. */
boolean isAmazonAffiliateLink(Element aTag) {
    // Locale.ROOT gives locale-independent lowercasing; the default locale could
    // mangle ASCII matching (e.g. Turkish dotless-i).
    final String href = aTag.attr("href").toLowerCase(java.util.Locale.ROOT);

    // affiliate link shortener
    if (href.contains("amzn.to/"))
        return true;

    // long-form affiliate link
    if (href.contains("amazon.com/") && href.contains("tag="))
        return true;

    return false;
}
}

View File

@ -8,6 +8,7 @@ import nu.marginalia.wmsa.edge.crawling.model.CrawlingSpecification;
import nu.marginalia.wmsa.edge.crawling.retreival.CrawlerRetreiver;
import nu.marginalia.wmsa.edge.crawling.retreival.HttpFetcher;
import nu.marginalia.wmsa.edge.model.EdgeCrawlPlan;
import okhttp3.ConnectionPool;
import okhttp3.Dispatcher;
import okhttp3.internal.Util;
import org.slf4j.Logger;
@ -25,12 +26,14 @@ public class CrawlerMain implements AutoCloseable {
private final WorkLog workLog;
private final ConnectionPool connectionPool = new ConnectionPool(5, 10, TimeUnit.SECONDS);
private final Dispatcher dispatcher = new Dispatcher(new ThreadPoolExecutor(0, Integer.MAX_VALUE, 5, TimeUnit.SECONDS,
new SynchronousQueue<>(), Util.threadFactory("OkHttp Dispatcher", true)));
private final UserAgent userAgent;
private final ThreadPoolExecutor pool;
final int poolSize = 256;
final int poolSize = 512;
final int poolQueueSize = 32;
public CrawlerMain(EdgeCrawlPlan plan) throws Exception {
@ -67,9 +70,10 @@ public class CrawlerMain implements AutoCloseable {
if (workLog.isJobFinished(specification.id))
return;
var fetcher = new HttpFetcher(userAgent.uaString(), dispatcher);
try (var writer = new CrawledDomainWriter(crawlDataDir, specification.domain, specification.id)) {
HttpFetcher fetcher = new HttpFetcher(userAgent.uaString(), dispatcher, connectionPool);
try (CrawledDomainWriter writer = new CrawledDomainWriter(crawlDataDir, specification.domain, specification.id))
{
var retreiver = new CrawlerRetreiver(fetcher, specification, writer);
int size = retreiver.fetch();
@ -92,7 +96,6 @@ public class CrawlerMain implements AutoCloseable {
AbortMonitor abortMonitor = AbortMonitor.getInstance();
Semaphore taskSem = new Semaphore(poolSize);
plan.forEachCrawlingSpecification(spec -> {

View File

@ -17,7 +17,6 @@ import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.net.InetAddress;
import java.net.URISyntaxException;
import java.net.UnknownHostException;
import java.time.LocalDateTime;
import java.util.ArrayList;
@ -25,12 +24,19 @@ import java.util.HashSet;
import java.util.LinkedList;
import java.util.Optional;
import static java.lang.Math.max;
import static java.lang.Math.min;
public class CrawlerRetreiver {
private static final long DEFAULT_CRAWL_DELAY_MS = Long.getLong("defaultCrawlDelay", 1000);
private static final long DEFAULT_CRAWL_DELAY_MIN_MS = Long.getLong("defaultCrawlDelay", 250);
private static final long DEFAULT_CRAWL_DELAY_MAX_MS = Long.getLong("defaultCrawlDelaySlow", 2500);
private final LinkedList<EdgeUrl> queue = new LinkedList<>();
private final HttpFetcher fetcher;
private final HashSet<EdgeUrl> visited;
private final HashSet<EdgeUrl> known;
private final HashSet<String> visited;
private final HashSet<String> known;
private boolean slowDown = false;
private final int depth;
private final String id;
@ -64,15 +70,13 @@ public class CrawlerRetreiver {
crawledDomainWriter = writer;
for (String urlStr : specs.urls) {
EdgeUrl.parse(urlStr)
.filter(known::add)
.ifPresent(queue::addLast);
EdgeUrl.parse(urlStr).ifPresent(this::addToQueue);
}
if (queue.peek() != null) {
var fst = queue.peek();
var root = fst.domain.toRootUrl();
if (known.add(root))
if (known.add(root.toString()))
queue.addFirst(root);
}
}
@ -147,7 +151,7 @@ public class CrawlerRetreiver {
continue;
if (top.toString().length() > 255)
continue;
if (!visited.add(top))
if (!visited.add(top.toString()))
continue;
if (fetchDocument(top, crawlDelay)) {
@ -172,9 +176,7 @@ public class CrawlerRetreiver {
crawledDomainWriter.accept(d);
if (d.url != null) {
try {
visited.add(new EdgeUrl(d.url));
} catch (URISyntaxException ex) {}
EdgeUrl.parse(d.url).map(EdgeUrl::toString).ifPresent(visited::add);
}
}
@ -192,8 +194,7 @@ public class CrawlerRetreiver {
private Optional<CrawledDocument> fetchUrl(EdgeUrl top) {
try {
var doc = fetcher.fetchContent(top);
var doc = fetchContent(top);
if (doc.documentBody != null) {
@ -217,6 +218,24 @@ public class CrawlerRetreiver {
}
@SneakyThrows
private CrawledDocument fetchContent(EdgeUrl top) {
    // Give the server two chances before recording a retry error. A 429
    // flips the crawler into slow mode for the rest of the domain.
    int attemptsLeft = 2;
    while (attemptsLeft-- > 0) {
        try {
            return fetcher.fetchContent(top);
        }
        catch (RateLimitException ex) {
            slowDown = true;

            final int retryDelay = ex.retryAfter();
            if (retryDelay > 0 && retryDelay < 5000) {
                Thread.sleep(retryDelay);
            }
        }
    }
    return createRetryError(top);
}
/** Hash the document body for duplicate detection. */
private String createHash(String documentBodyHash) {
    var hashCode = hashMethod.hashUnencodedChars(documentBodyHash);
    return hashCode.toString();
}
@ -235,28 +254,29 @@ public class CrawlerRetreiver {
baseUrl = linkParser.getBaseLink(parsed, baseUrl);
for (var link : parsed.getElementsByTag("a")) {
linkParser.parseLink(baseUrl, link)
.filter(this::isSameDomain)
.filter(u -> !urlBlocklist.isUrlBlocked(u))
.filter(u -> !urlBlocklist.isMailingListLink(u))
.filter(known::add)
.ifPresent(queue::addLast);
linkParser.parseLink(baseUrl, link).ifPresent(this::addToQueue);
}
for (var link : parsed.getElementsByTag("frame")) {
linkParser.parseFrame(baseUrl, link)
.filter(this::isSameDomain)
.filter(u -> !urlBlocklist.isUrlBlocked(u))
.filter(u -> !urlBlocklist.isMailingListLink(u))
.filter(known::add)
.ifPresent(queue::addLast);
linkParser.parseFrame(baseUrl, link).ifPresent(this::addToQueue);
}
for (var link : parsed.getElementsByTag("iframe")) {
linkParser.parseFrame(baseUrl, link)
.filter(this::isSameDomain)
.filter(u -> !urlBlocklist.isUrlBlocked(u))
.filter(u -> !urlBlocklist.isMailingListLink(u))
.filter(known::add)
.ifPresent(queue::addLast);
linkParser.parseFrame(baseUrl, link).ifPresent(this::addToQueue);
}
}
/** Enqueue a discovered URL if it is on-domain, not blocked, not yet known,
 *  and the crawl frontier hasn't grown past its cap. */
private void addToQueue(EdgeUrl url) {
    if (!isSameDomain(url)
            || urlBlocklist.isUrlBlocked(url)
            || urlBlocklist.isMailingListLink(url))
        return;

    // reduce memory usage by not growing queue huge when crawling large sites
    if (queue.size() + visited.size() >= depth + 100)
        return;

    if (known.add(url.toString())) {
        queue.addLast(url);
    }
}
@ -284,13 +304,24 @@ public class CrawlerRetreiver {
if (spentTime > sleepTime)
return;
Thread.sleep(Math.min(sleepTime-spentTime, 5000));
Thread.sleep(min(sleepTime-spentTime, 5000));
}
else if (slowDown) {
Thread.sleep( 1000);
}
else {
if (spentTime > DEFAULT_CRAWL_DELAY_MS)
// When no crawl delay is specified, lean toward twice the fetch+process time,
// within sane limits. This means slower servers get slower crawling, and faster
// servers get faster crawling.
sleepTime = spentTime * 2;
sleepTime = min(sleepTime, DEFAULT_CRAWL_DELAY_MAX_MS);
sleepTime = max(sleepTime, DEFAULT_CRAWL_DELAY_MIN_MS);
if (spentTime > sleepTime)
return;
Thread.sleep(DEFAULT_CRAWL_DELAY_MS - spentTime);
Thread.sleep(sleepTime-spentTime);
}
}
@ -302,7 +333,14 @@ public class CrawlerRetreiver {
.crawlerStatus(CrawlerDocumentStatus.ROBOTS_TXT.name())
.build();
}
/** Synthesize a 429 error document for a URL that stayed rate-limited
 *  after retrying. */
private CrawledDocument createRetryError(EdgeUrl url) {
    var doc = CrawledDocument.builder();
    doc.url(url.toString());
    doc.timestamp(LocalDateTime.now().toString());
    doc.httpStatus(429);
    doc.crawlerStatus(CrawlerDocumentStatus.ERROR.name());
    return doc.build();
}
private CrawledDomain createErrorPostFromStatus(HttpFetcher.FetchResult ret) {
String ip = findIp(domain);

View File

@ -0,0 +1,51 @@
package nu.marginalia.wmsa.edge.crawling.retreival;
import javax.net.SocketFactory;
import java.io.IOException;
import java.net.InetAddress;
import java.net.Socket;
/** A SocketFactory that delegates to the platform default but enables a short
 *  SO_LINGER on every socket it hands out. */
public class FastTerminatingSocketFactory extends SocketFactory {
    private static final SocketFactory delegate = SocketFactory.getDefault();

    private void configure(Socket sock) throws IOException {
        // Setting SO_LINGER to enabled but low reduces TIME_WAIT
        // which can get pretty... bad when you're crawling
        // and opening thousands of connections
        sock.setSoLinger(true, 3);
    }

    @Override
    public Socket createSocket() throws IOException {
        var sock = delegate.createSocket();
        configure(sock);
        return sock;
    }

    @Override
    public Socket createSocket(String host, int port) throws IOException {
        var sock = delegate.createSocket(host, port);
        configure(sock);
        return sock;
    }

    @Override
    public Socket createSocket(String host, int port, InetAddress localHost, int localPort) throws IOException {
        var sock = delegate.createSocket(host, port, localHost, localPort);
        configure(sock);
        return sock;
    }

    @Override
    public Socket createSocket(InetAddress host, int port) throws IOException {
        var sock = delegate.createSocket(host, port);
        configure(sock);
        return sock;
    }

    @Override
    public Socket createSocket(InetAddress address, int port, InetAddress localAddress, int localPort) throws IOException {
        var sock = delegate.createSocket(address, port, localAddress, localPort);
        configure(sock);
        return sock;
    }
}

View File

@ -13,10 +13,7 @@ import nu.marginalia.wmsa.edge.crawling.retreival.logic.ContentTypeLogic;
import nu.marginalia.wmsa.edge.crawling.retreival.logic.ContentTypeParser;
import nu.marginalia.wmsa.edge.model.EdgeDomain;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import okhttp3.Dispatcher;
import okhttp3.OkHttpClient;
import okhttp3.Request;
import okhttp3.Response;
import okhttp3.*;
import org.apache.commons.io.input.BOMInputStream;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -66,15 +63,18 @@ public class HttpFetcher {
}
}
private static final FastTerminatingSocketFactory ftSocketFactory = new FastTerminatingSocketFactory();
@SneakyThrows
private OkHttpClient createClient(Dispatcher dispatcher) {
private OkHttpClient createClient(Dispatcher dispatcher, ConnectionPool pool) {
var builder = new OkHttpClient.Builder();
if (dispatcher != null) {
builder.dispatcher(dispatcher);
}
return builder.sslSocketFactory(NoSecuritySSL.buildSocketFactory(), (X509TrustManager) NoSecuritySSL.trustAllCerts[0])
.socketFactory(ftSocketFactory)
.hostnameVerifier(NoSecuritySSL.buildHostnameVerifyer())
.connectionPool(pool)
.cookieJar(cookies.getJar())
.followRedirects(true)
.followSslRedirects(true)
@ -82,6 +82,7 @@ public class HttpFetcher {
.readTimeout(10, TimeUnit.SECONDS)
.writeTimeout(10, TimeUnit.SECONDS)
.build();
}
public List<String> getCookies() {
@ -93,13 +94,13 @@ public class HttpFetcher {
}
@Inject
public HttpFetcher(@Named("user-agent") String userAgent, Dispatcher dispatcher) {
this.client = createClient(dispatcher);
public HttpFetcher(@Named("user-agent") String userAgent, Dispatcher dispatcher, ConnectionPool connectionPool) {
this.client = createClient(dispatcher, connectionPool);
this.userAgent = userAgent;
}
public HttpFetcher(@Named("user-agent") String userAgent) {
this.client = createClient(null);
this.client = createClient(null, new ConnectionPool());
this.userAgent = userAgent;
}
@ -141,7 +142,7 @@ public class HttpFetcher {
}
@SneakyThrows
public CrawledDocument fetchContent(EdgeUrl url) {
public CrawledDocument fetchContent(EdgeUrl url) throws RateLimitException {
if (contentTypeLogic.isUrlLikeBinary(url)) {
logger.debug("Probing suspected binary {}", url);
@ -192,13 +193,17 @@ public class HttpFetcher {
.build();
}
private CrawledDocument extractBody(EdgeUrl url, Response rsp) throws IOException, URISyntaxException {
private CrawledDocument extractBody(EdgeUrl url, Response rsp) throws IOException, URISyntaxException, RateLimitException {
var responseUrl = new EdgeUrl(rsp.request().url().toString());
if (!Objects.equals(responseUrl.domain, url.domain)) {
return createRedirectResponse(url, rsp, responseUrl);
}
if (rsp.code() == 429) {
throw new RateLimitException(rsp.header("Retry-After", "1000"));
}
var body = rsp.body();
if (null == body) {
return createErrorResponse(url, rsp, CrawlerDocumentStatus.ERROR, "No body");
@ -258,8 +263,6 @@ public class HttpFetcher {
}
public SimpleRobotRules fetchRobotRules(EdgeDomain domain) {
return fetchRobotsForProto("https", domain)
.or(() -> fetchRobotsForProto("http", domain))
@ -282,4 +285,5 @@ public class HttpFetcher {
doc.contentType,
userAgent);
}
}

View File

@ -0,0 +1,21 @@
package nu.marginalia.wmsa.edge.crawling.retreival;
/** Thrown when a server answers HTTP 429; carries the raw Retry-After header value. */
public class RateLimitException extends Exception {
    private final String retryAfter;

    public RateLimitException(String retryAfter) {
        this.retryAfter = retryAfter;
    }

    // This exception is thrown in bulk while crawling; skip stack trace capture
    // entirely so constructing it is cheap, rather than capturing and then hiding it.
    @Override
    public synchronized Throwable fillInStackTrace() { return this; }

    @Override
    public StackTraceElement[] getStackTrace() { return new StackTraceElement[0]; }

    /** The Retry-After value parsed as an integer, or 1000 when it isn't a plain
     *  number (e.g. an HTTP-date form of the header).
     *  NOTE(review): HTTP Retry-After is specified in seconds, but callers treat
     *  this value as milliseconds for Thread.sleep — confirm intended units. */
    public int retryAfter() {
        try {
            return Integer.parseInt(retryAfter);
        }
        catch (NumberFormatException ex) {
            return 1000;
        }
    }
}

View File

@ -0,0 +1,40 @@
package nu.marginalia.wmsa.edge.search.command;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.wmsa.edge.data.dao.EdgeDataStoreDao;
import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklist;
import nu.marginalia.wmsa.edge.search.model.BrowseResultSet;
import nu.marginalia.wmsa.edge.search.results.BrowseResultCleaner;
import nu.marginalia.wmsa.renderer.mustache.MustacheRenderer;
import nu.marginalia.wmsa.renderer.mustache.RendererFactory;
import spark.Request;
import spark.Response;
import java.io.IOException;
@Singleton
public class IndexCommand {
    private final EdgeDataStoreDao dataStoreDao;
    private final BrowseResultCleaner browseResultCleaner;
    private final EdgeDomainBlacklist blacklist;
    private final MustacheRenderer<BrowseResultSet> template;

    @Inject
    public IndexCommand(EdgeDataStoreDao dataStoreDao,
                        RendererFactory rendererFactory,
                        BrowseResultCleaner browseResultCleaner,
                        EdgeDomainBlacklist blacklist) throws IOException {
        this.dataStoreDao = dataStoreDao;
        this.browseResultCleaner = browseResultCleaner;
        template = rendererFactory.renderer("edge/index");
        this.blacklist = blacklist;
    }

    /** Render the index page, decorated with one random non-blacklisted browse result. */
    public String render(Request request, Response response) {
        response.header("Cache-control", "public,max-age=3600");

        // Draw five candidates so the cleaner can reject some and still leave one to show
        var candidates = dataStoreDao.getRandomDomains(5, blacklist, 0);
        candidates.removeIf(browseResultCleaner.shouldRemoveResultPredicate());

        var shown = candidates.stream().limit(1).toList();
        return template.render(new BrowseResultSet(shown));
    }
}

View File

@ -22,7 +22,6 @@ import spark.Spark;
import spark.resource.ClassPathResource;
import spark.staticfiles.MimeType;
import java.io.FileNotFoundException;
import java.net.URLEncoder;
import java.time.LocalDateTime;
import java.time.ZoneOffset;
@ -35,6 +34,7 @@ public class ResourceStoreService extends Service {
private final AuthClient authClient;
private final ResourceEntityStore resourceStore;
private StaticResources staticResources;
@Inject
public ResourceStoreService(@Named("service-host") String ip,
@ -42,11 +42,13 @@ public class ResourceStoreService extends Service {
AuthClient authClient,
ResourceEntityStore resourceStore,
Initialization initialization,
MetricsServer metricsServer
MetricsServer metricsServer,
StaticResources staticResources
) {
super(ip, port, initialization, metricsServer);
this.authClient = authClient;
this.resourceStore = resourceStore;
this.staticResources = staticResources;
Schedulers.io().schedulePeriodicallyDirect(resourceStore::reapStaleResources,
5, 5, TimeUnit.MINUTES);
@ -109,12 +111,9 @@ public class ResourceStoreService extends Service {
return serveDynamic(data, request, response);
}
else if (serveStatic(domain + "/" + resource, request, response)) {
logger.info("getResource({}/{}, static)", domain, resource);
}
else {
logger.info("Could not serve {}/{}", domain, resource);
Spark.halt(404, "Not Found");
logger.info("getResource({}/{}, static)", domain, resource);
staticResources.serveStatic(domain, resource, request, response);
}
return "";
}
@ -138,19 +137,7 @@ public class ResourceStoreService extends Service {
return data.data;
}
@SneakyThrows
private boolean serveStatic(String path, Request req, Response rsp) {
try {
ClassPathResource resource = new ClassPathResource("static/" + path);
handleEtagStatic(resource, req, rsp);
resource.getInputStream().transferTo(rsp.raw().getOutputStream());
}
catch (IllegalArgumentException|FileNotFoundException ex) {
return false;
}
return true;
}
@SneakyThrows
private void handleEtag(RenderedResource page, Request req, Response rsp) {

View File

@ -0,0 +1,46 @@
package nu.marginalia.wmsa.resource_store;
import lombok.SneakyThrows;
import spark.Request;
import spark.Response;
import spark.Spark;
import spark.resource.ClassPathResource;
import spark.staticfiles.MimeType;
import java.io.FileNotFoundException;
import java.time.LocalDateTime;
import java.time.ZoneOffset;
/** Serves static classpath resources with etag-based caching. */
public class StaticResources {
    // Baked into every etag so a server restart invalidates cached copies
    private final long startTime = LocalDateTime.now().toEpochSecond(ZoneOffset.UTC);

    /** Stream the classpath resource at static/{domain}/{path} to the response,
     *  halting with 404 when no such resource exists. */
    @SneakyThrows
    public void serveStatic(String domain, String path, Request req, Response rsp) {
        final String resourcePath = "static/" + domain + "/" + path;
        try {
            var res = new ClassPathResource(resourcePath);
            handleEtagStatic(res, req, rsp);
            res.getInputStream().transferTo(rsp.raw().getOutputStream());
        }
        catch (IllegalArgumentException | FileNotFoundException ex) {
            Spark.halt(404);
        }
    }

    /** Set caching headers; short-circuit with 304 when the client's
     *  If-None-Match already matches the current etag. */
    @SneakyThrows
    private void handleEtagStatic(ClassPathResource resource, Request req, Response rsp) {
        rsp.header("Cache-Control", "public,max-age=3600");
        rsp.type(MimeType.fromResource(resource));

        final String etag = staticResourceEtag(resource.getFilename());
        if (etag.equals(req.headers("If-None-Match"))) {
            Spark.halt(304);
        }

        rsp.header("ETag", etag);
    }

    private String staticResourceEtag(String resource) {
        return "\"" + resource.hashCode() + "-" + startTime + "\"";
    }
}

View File

@ -7,6 +7,12 @@ body {
background-color: #f8f8ee;
}
.rightbox {
float: right;
display: block;
max-width: 40ch;
clear: both;
}
.sticker {
ruby-position: under;
@ -70,6 +76,9 @@ ul.semantic-results a {
article > section > p { display: none; }
.cards.big .card { flex-grow: 1 }
.cards.big { padding-right: 1ch; }
.w3m-helper {
display: none;
}
@ -296,6 +305,7 @@ select {
}
footer {
clear: both;
padding: 2ch;
margin: 16ch 0px 0px 0px;
background-color: #acae89;
@ -337,7 +347,7 @@ a.underline {
}
@media only screen and (max-device-width: 1024px) {
.rightbox { width: 30ch !important; }
.card {
margin-right: 2ch;
}
@ -355,6 +365,7 @@ a.underline {
}
@media only screen and (max-device-width: 800px) {
.rightbox { display: none; }
.search-box {
flex-direction: column;
}

View File

@ -0,0 +1,12 @@
<section class="card browse-result rightbox">
<h2>{{url.domain}}</h2>
<a href="{{url.proto}}://{{url.domain}}/">
<img src="/screenshot/{{domainId}}" title="{{description}}" loading="lazy"/>
</a>
<div class="utils">
<a href="/site/{{url.domain}}">Info</a>
<a href="/explore/{{url.domain}}">Similar Domains</a>
</div>
</section>

View File

@ -0,0 +1,130 @@
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<title>Marginalia Search</title>
<link rel="stylesheet" href="/style-new.css" />
<link rel="search" type="application/opensearchdescription+xml" href="/opensearch.xml" title="Marginalia">
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<meta name="robots" content="noindex" />
<meta property="og:description" content="search.marginalia.nu is a small independent do-it-yourself search engine for surprising but content-rich websites that never ask you to accept cookies or subscribe to newsletters. The goal is to bring you the sort of grass fed, free range HTML your grandma used to write. " />
<meta property="og:locale" content="en_US" />
<meta property="og:site_name" content="search.marginalia.nu" />
<meta property="og:type" content="website" />
<meta property="og:url" content="https://search.marginalia.nu/" />
</head>
<body>
<!-- Hi there, fellow human being :-) -->
{{>edge/parts/search-header}}
<article>
{{>edge/parts/search-form}}
<section class="card rightbox">
<h2>Publicity, Discussion and Events</h2>
<div class="info">
<dl>
<dt><a href="https://www.deutschlandfunkkultur.de/google-suche-100.html">Kritik an Googles Suche - Platzhirsch auf dem Nebenschauplatz</a></dt>
<dd>Deutschlandfunk Kultur &#x1f1e9;&#x1f1ea;, 2022-08-18</dd>
<dt><a href="https://news.ycombinator.com/item?id=31536626" rel="nofollow">Marginalia Goes Open Source</a></dt>
<dd>Hacker News, 2022-05-28</dd>
<dt><a href="https://www.youtube.com/watch?v=rTSEr0cRJY8" rel="nofollow">You Should Check Out the Indie Web</a> &#x1F39E;&#xFE0F;</dt>
<dd>YouTube, You've Got Kat, 2022-03-15 </dd>
<dt>
<a href="https://www.newyorker.com/culture/infinite-scroll/what-google-search-isnt-showing-you" rel="nofollow">What Google Search Isn't Showing You</a>
</dt>
<dd>The New Yorker &#127913;, 2022-03-10</dd>
<dt>
<a href="https://www.metafilter.com/194653/Marginalia-Search-Serendipity-Engineering" rel="nofollow">Marginalia Search - Serendipity Engineering</a>
</dt>
<dd>MetaFilter, 2022-03-09</dd>
<dt>
&#127874; <a href="https://memex.marginalia.nu/log/49-marginalia-1-year.gmi">First anniversary</a>! &#127882;
</dt>
<dd>
2022-02-26
</dd>
<dt>
<a href="https://onezero.medium.com/a-search-engine-designed-to-surprise-you-b81944ed5c06" rel="nofollow">A Search Engine Designed To Surprise You</a>
</dt>
<dd>Clive Thompson OneZero, 2021-09-16</dd>
<dt>
<a href="https://news.ycombinator.com/item?id=28550764" rel="nofollow"> A search engine that favors text-heavy sites and punishes modern web design</a>
</dt>
<dd>
Hacker News, 2021-09-16
</dd>
</dl>
</div>
</section>
<div class="cards big">
<section class="card">
<h2>About</h2>
<div class="info">
<p>This is an independent DIY search engine that focuses on non-commercial content, and attempts to
show you sites you perhaps weren't aware of in favor of the sort of sites you probably already knew
existed. </p>
<p>
The software for this search engine is all custom-built, and all crawling and indexing is
done in-house. The project is open source. Feel free to poke about in the <a
href="https://git.marginalia.nu/marginalia/marginalia.nu">source code</a> or contribute
to the development!
</p>
<p>Consider <a href="https://memex.marginalia.nu/projects/edge/supporting.gmi">supporting the
project</a>!</p>
</div>
<div class="utils">
<a href="https://memex.marginalia.nu/projects/edge/about.gmi">Read More</a>
</div>
</section>
<section class="card">
<h2>Tips</h2>
<div class="info">
<p>
This search engine isn't particularly well equipped to answering queries
posed like questions, instead try to imagine some text that might appear
in the website you are looking for, and search for that.</p>
<p>
Where this search engine really shines is finding small, old and obscure websites about some
given topic, perhaps
<a href="https://search.marginalia.nu/search?query=commander+keen&profile=yolo&js=default">old video games</a>,
<a href="https://search.marginalia.nu/search?query=voynich+&profile=yolo&js=default">a mystery</a>,
<a href="https://search.marginalia.nu/search?query=augustine+confessions&profile=yolo&js=default">theology</a>,
<a href="https://search.marginalia.nu/search?query=Hermes+Trismegistus&profile=yolo&js=default">the occult</a>,
<a href="https://search.marginalia.nu/search?query=knitting&profile=yolo&js=default">knitting</a>,
<a href="https://search.marginalia.nu/search?query=scc+graph+algorithm&profile=yolo&js=default">computer science</a>,
or <a href="https://search.marginalia.nu/search?query=salvador+dali&profile=yolo&js=default">art</a>.
</p>
</div>
<div class="utils">
<a href="https://memex.marginalia.nu/projects/edge/search-tips.gmi">Additional Tips</a>
</div>
</section>
<section class="card">
<h2>Updates</h2>
<div class="info">
<p>☛ A recipe filter has been added to the algorithm selector.</p>
<p>☛ The <a href="https://search.marginalia.nu/explore/random">Random Mode</a> has been overhauled, and is
quite entertaining. I encourage you to give it a spin. </p>
<p>☛ A simple <a href="https://api.marginalia.nu/">public API</a> is now available.</p>
</div>
<div class="utils">
<a href="https://memex.marginalia.nu/projects/edge/changelog.gmi">Change Log</a>
</div>
</section>
</div>
</article>
{{>edge/parts/search-footer}}
</body>

View File

@ -18,7 +18,9 @@
<article>
{{>edge/parts/search-form}}
<hr class="w3m-helper" />
{{#each domainResults}}{{>edge/browse-result-rb}}{{/each}}
<section class="cards">
{{#if maintenanceMessage}}<section class="card problems onlyscreen"><h2>Maintenance</h2><p class="description">{{maintenanceMessage}}</p></section>{{/if}}
{{#if evalResult}}<section class="card semantic onlyscreen"><h2>Evaluation</h2><p class="description">{{query}} = {{evalResult}}</p><hr class="w3m-helper" /></section>{{/if}}
@ -37,7 +39,6 @@
</section>
{{/if}}
{{#each domainResults}}{{>edge/browse-result}}{{/each}}
{{#each results}}{{>edge/search-result}}{{/each}}
{{#unless evalResult}}{{#if problems}}<section class="card problems onlyscreen"><h2>Suggestions</h2><ul class="onlyscreen search-problems">{{#each problems}}<li>{{{.}}}</li>{{/each}}</ul></section> {{/if}}{{/unless}}

View File

@ -3,6 +3,7 @@ package nu.marginalia.wmsa.edge.crawling;
import lombok.SneakyThrows;
import nu.marginalia.wmsa.edge.crawling.retreival.HttpFetcher;
import nu.marginalia.wmsa.edge.crawling.retreival.HttpRedirectResolver;
import nu.marginalia.wmsa.edge.crawling.retreival.RateLimitException;
import nu.marginalia.wmsa.edge.crawling.retreival.logic.ContentTypeLogic;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import org.junit.jupiter.api.Assertions;
@ -27,14 +28,14 @@ class HttpFetcherTest {
}
@Test
void fetchUTF8() throws URISyntaxException {
void fetchUTF8() throws URISyntaxException, RateLimitException {
var fetcher = new HttpFetcher("nu.marginalia.edge-crawler");
var str = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu"));
System.out.println(str.contentType);
}
@Test
void fetchText() throws URISyntaxException {
void fetchText() throws URISyntaxException, RateLimitException {
var fetcher = new HttpFetcher("nu.marginalia.edge-crawler");
var str = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu/robots.txt"));
System.out.println(str);

View File

@ -14,7 +14,6 @@ import org.slf4j.LoggerFactory;
import spark.Spark;
import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.time.LocalDateTime;
@ -41,7 +40,7 @@ class ResourceStoreServiceTest {
tempDir = Files.createTempDirectory("ResourceStoreServiceTest");
resourceStore = new ResourceEntityStore(tempDir);
service = new ResourceStoreService("127.0.0.1", testPort, null,
resourceStore, new Initialization(), null);
resourceStore, new Initialization(), null, new StaticResources());
Spark.awaitInitialization();
}