Merge pull request 'master' (#99) from master into release

Reviewed-on: https://git.marginalia.nu/marginalia/marginalia.nu/pulls/99

Commit: 2f8cab7f0e
@@ -3,7 +3,7 @@ server {
     listen [::]:80;
     server_name nginx;
 
-    location /search {
+    location / {
         if ( $request_method = POST ) {
             return 444;
         }
@@ -14,12 +14,7 @@ server {
         proxy_set_header X-Extern-Domain $scheme://$host;
         proxy_set_header X-User-Agent $http_user_agent;
 
-        proxy_pass http://edge-search:5023/public/search;
-        tcp_nodelay on;
-    }
-
-    location / {
-        proxy_pass http://edge-search:5023/;
+        proxy_pass http://edge-search:5023/public/;
         tcp_nodelay on;
     }
 }
@@ -5,6 +5,8 @@ import com.google.inject.Singleton;
 import nu.marginalia.wmsa.edge.converting.processor.logic.topic.AdblockSimulator;
 import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain;
 import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
 
 import java.util.HashSet;
 import java.util.List;
@@ -41,14 +43,17 @@ public class FeatureExtractor {
     }
 
     public Set<HtmlFeature> getFeatures(CrawledDomain domain, Document doc) {
-        Set<HtmlFeature> features = new HashSet<>();
+        final Set<HtmlFeature> features = new HashSet<>();
 
-        var scriptTags = doc.getElementsByTag("script");
+        final Elements scriptTags = doc.getElementsByTag("script");
 
-        if (scriptTags.size() > 0) {
-            features.add(HtmlFeature.JS);
-        }
-        else if(adblockSimulator.hasAds(doc.clone())) { // Only look for ads if there is javascript
+        for (var scriptTag : scriptTags) {
+            if (isJavascriptTag(scriptTag)) {
+                features.add(HtmlFeature.JS);
+            }
+        }
+
+        if (features.contains(HtmlFeature.JS) && adblockSimulator.hasAds(doc.clone())) {
             features.add(HtmlFeature.ADVERTISEMENT);
         }
 
@@ -58,20 +63,22 @@ public class FeatureExtractor {
             features.add(HtmlFeature.MEDIA);
         }
 
-        if (scriptTags.stream()
-                .anyMatch(tag -> trackers.stream().anyMatch(tracker -> tag.attr("src").contains(tracker)))) {
-            features.add(HtmlFeature.TRACKING);
+        for (var scriptTag : scriptTags) {
+            if (hasTrackingScript(scriptTag)) {
+                features.add(HtmlFeature.TRACKING);
+                break;
+            }
         }
 
         if (scriptTags.html().contains("google-analytics.com")) {
             features.add(HtmlFeature.TRACKING);
         }
 
-        if (doc.getElementsByTag("a").stream().map(e -> e.attr("href"))
-                .map(String::toLowerCase)
-                .anyMatch(href ->
-                        href.contains("amzn.to/") || (href.contains("amazon.com/") & href.contains("tag=")))) {
-            features.add(HtmlFeature.AFFILIATE_LINK);
+        for (var aTag : doc.getElementsByTag("a")) {
+            if (isAmazonAffiliateLink(aTag)) {
+                features.add(HtmlFeature.AFFILIATE_LINK);
+                break;
+            }
         }
 
         if (!domain.cookies.isEmpty()) {
@@ -80,4 +87,34 @@ public class FeatureExtractor {
 
         return features;
     }
+
+    private boolean hasTrackingScript(Element scriptTag) {
+        for (var tracker : trackers) {
+            if (scriptTag.attr("src").contains(tracker)) {
+                return true;
+            }
+        }
+        return false;
+    }
+
+    private boolean isJavascriptTag(Element scriptTag) {
+        final String type = scriptTag.attr("type");
+
+        if ("application/ld+json".equalsIgnoreCase(type)) {
+            return false;
+        }
+
+        return true;
+    }
+
+    boolean isAmazonAffiliateLink(Element aTag) {
+        final String href = aTag.attr("href").toLowerCase();
+
+        if (href.contains("amzn.to/"))
+            return true;
+        if (href.contains("amazon.com/") && href.contains("tag="))
+            return true;
+
+        return false;
+    }
 }
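The affiliate-link heuristic above is easy to exercise on its own. A minimal sketch using jsoup (the demo class and sample URL are invented for illustration; only the two-clause check mirrors the diff):

    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Element;

    public class AffiliateLinkCheckDemo {
        public static void main(String[] args) {
            // Parse a snippet and pull out the anchor, as getFeatures() does per tag
            Element aTag = Jsoup.parse("<a href=\"https://www.amazon.com/dp/B000?tag=example-20\">buy</a>")
                    .getElementsByTag("a").first();

            String href = aTag.attr("href").toLowerCase();

            // Same two-clause check as isAmazonAffiliateLink(): a shortened
            // amzn.to link, or an amazon.com link carrying an affiliate tag=
            boolean affiliate = href.contains("amzn.to/")
                    || (href.contains("amazon.com/") && href.contains("tag="));

            System.out.println(affiliate); // prints: true
        }
    }

Note that the old stream-based version used a single `&` between the two amazon.com clauses; the rewritten helper also fixes that to a short-circuiting `&&`.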
@@ -8,6 +8,7 @@ import nu.marginalia.wmsa.edge.crawling.model.CrawlingSpecification;
 import nu.marginalia.wmsa.edge.crawling.retreival.CrawlerRetreiver;
 import nu.marginalia.wmsa.edge.crawling.retreival.HttpFetcher;
 import nu.marginalia.wmsa.edge.model.EdgeCrawlPlan;
+import okhttp3.ConnectionPool;
 import okhttp3.Dispatcher;
 import okhttp3.internal.Util;
 import org.slf4j.Logger;
@@ -25,12 +26,14 @@ public class CrawlerMain implements AutoCloseable {
 
     private final WorkLog workLog;
 
+    private final ConnectionPool connectionPool = new ConnectionPool(5, 10, TimeUnit.SECONDS);
+
     private final Dispatcher dispatcher = new Dispatcher(new ThreadPoolExecutor(0, Integer.MAX_VALUE, 5, TimeUnit.SECONDS,
             new SynchronousQueue<>(), Util.threadFactory("OkHttp Dispatcher", true)));
 
     private final UserAgent userAgent;
     private final ThreadPoolExecutor pool;
-    final int poolSize = 256;
+    final int poolSize = 512;
     final int poolQueueSize = 32;
 
     public CrawlerMain(EdgeCrawlPlan plan) throws Exception {
@@ -67,9 +70,10 @@ public class CrawlerMain implements AutoCloseable {
         if (workLog.isJobFinished(specification.id))
             return;
 
-        var fetcher = new HttpFetcher(userAgent.uaString(), dispatcher);
+        HttpFetcher fetcher = new HttpFetcher(userAgent.uaString(), dispatcher, connectionPool);
 
-        try (var writer = new CrawledDomainWriter(crawlDataDir, specification.domain, specification.id)) {
+        try (CrawledDomainWriter writer = new CrawledDomainWriter(crawlDataDir, specification.domain, specification.id))
+        {
             var retreiver = new CrawlerRetreiver(fetcher, specification, writer);
 
             int size = retreiver.fetch();
@@ -92,7 +96,6 @@ public class CrawlerMain implements AutoCloseable {
 
         AbortMonitor abortMonitor = AbortMonitor.getInstance();
 
-
         Semaphore taskSem = new Semaphore(poolSize);
 
         plan.forEachCrawlingSpecification(spec -> {
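The Semaphore above caps how many crawl tasks are in flight at once. A minimal sketch of the acquire-submit-release pattern it implies (the task loop and names here are invented; the real iteration lives in plan.forEachCrawlingSpecification):

    import java.util.concurrent.ExecutorService;
    import java.util.concurrent.Executors;
    import java.util.concurrent.Semaphore;

    public class BoundedSubmitDemo {
        public static void main(String[] args) throws InterruptedException {
            final int poolSize = 4; // stands in for CrawlerMain's poolSize
            Semaphore taskSem = new Semaphore(poolSize);
            ExecutorService pool = Executors.newCachedThreadPool();

            for (int i = 0; i < 16; i++) {
                final int id = i;
                taskSem.acquire();          // blocks once poolSize tasks are running
                pool.submit(() -> {
                    try {
                        System.out.println("crawling spec " + id);
                    }
                    finally {
                        taskSem.release();  // always free the permit, even on failure
                    }
                });
            }
            pool.shutdown();
        }
    }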
@@ -17,7 +17,6 @@ import org.slf4j.LoggerFactory;
 
 import java.io.IOException;
 import java.net.InetAddress;
-import java.net.URISyntaxException;
 import java.net.UnknownHostException;
 import java.time.LocalDateTime;
 import java.util.ArrayList;
@@ -25,12 +24,19 @@ import java.util.HashSet;
 import java.util.LinkedList;
 import java.util.Optional;
 
+import static java.lang.Math.max;
+import static java.lang.Math.min;
+
 public class CrawlerRetreiver {
-    private static final long DEFAULT_CRAWL_DELAY_MS = Long.getLong("defaultCrawlDelay", 1000);
+    private static final long DEFAULT_CRAWL_DELAY_MIN_MS = Long.getLong("defaultCrawlDelay", 250);
+    private static final long DEFAULT_CRAWL_DELAY_MAX_MS = Long.getLong("defaultCrawlDelaySlow", 2500);
 
     private final LinkedList<EdgeUrl> queue = new LinkedList<>();
     private final HttpFetcher fetcher;
-    private final HashSet<EdgeUrl> visited;
-    private final HashSet<EdgeUrl> known;
+
+    private final HashSet<String> visited;
+    private final HashSet<String> known;
+    private boolean slowDown = false;
 
     private final int depth;
     private final String id;
@@ -64,15 +70,13 @@ public class CrawlerRetreiver {
         crawledDomainWriter = writer;
 
         for (String urlStr : specs.urls) {
-            EdgeUrl.parse(urlStr)
-                    .filter(known::add)
-                    .ifPresent(queue::addLast);
+            EdgeUrl.parse(urlStr).ifPresent(this::addToQueue);
         }
 
         if (queue.peek() != null) {
             var fst = queue.peek();
             var root = fst.domain.toRootUrl();
-            if (known.add(root))
+            if (known.add(root.toString()))
                 queue.addFirst(root);
         }
     }
@@ -147,7 +151,7 @@ public class CrawlerRetreiver {
                 continue;
             if (top.toString().length() > 255)
                 continue;
-            if (!visited.add(top))
+            if (!visited.add(top.toString()))
                 continue;
 
             if (fetchDocument(top, crawlDelay)) {
@@ -172,9 +176,7 @@ public class CrawlerRetreiver {
             crawledDomainWriter.accept(d);
 
             if (d.url != null) {
-                try {
-                    visited.add(new EdgeUrl(d.url));
-                } catch (URISyntaxException ex) {}
+                EdgeUrl.parse(d.url).map(EdgeUrl::toString).ifPresent(visited::add);
             }
 
         }
@@ -192,8 +194,7 @@ public class CrawlerRetreiver {
 
     private Optional<CrawledDocument> fetchUrl(EdgeUrl top) {
         try {
-
-            var doc = fetcher.fetchContent(top);
+            var doc = fetchContent(top);
 
             if (doc.documentBody != null) {
 
@@ -217,6 +218,24 @@ public class CrawlerRetreiver {
 
     }
 
+    @SneakyThrows
+    private CrawledDocument fetchContent(EdgeUrl top) {
+        for (int i = 0; i < 2; i++) {
+            try {
+                return fetcher.fetchContent(top);
+            }
+            catch (RateLimitException ex) {
+                slowDown = true;
+                int delay = ex.retryAfter();
+                if (delay > 0 && delay < 5000) {
+                    Thread.sleep(delay);
+                }
+            }
+        }
+
+        return createRetryError(top);
+    }
+
     private String createHash(String documentBodyHash) {
         return hashMethod.hashUnencodedChars(documentBodyHash).toString();
     }
@@ -235,28 +254,29 @@ public class CrawlerRetreiver {
         baseUrl = linkParser.getBaseLink(parsed, baseUrl);
 
         for (var link : parsed.getElementsByTag("a")) {
-            linkParser.parseLink(baseUrl, link)
-                    .filter(this::isSameDomain)
-                    .filter(u -> !urlBlocklist.isUrlBlocked(u))
-                    .filter(u -> !urlBlocklist.isMailingListLink(u))
-                    .filter(known::add)
-                    .ifPresent(queue::addLast);
+            linkParser.parseLink(baseUrl, link).ifPresent(this::addToQueue);
         }
         for (var link : parsed.getElementsByTag("frame")) {
-            linkParser.parseFrame(baseUrl, link)
-                    .filter(this::isSameDomain)
-                    .filter(u -> !urlBlocklist.isUrlBlocked(u))
-                    .filter(u -> !urlBlocklist.isMailingListLink(u))
-                    .filter(known::add)
-                    .ifPresent(queue::addLast);
+            linkParser.parseFrame(baseUrl, link).ifPresent(this::addToQueue);
         }
         for (var link : parsed.getElementsByTag("iframe")) {
-            linkParser.parseFrame(baseUrl, link)
-                    .filter(this::isSameDomain)
-                    .filter(u -> !urlBlocklist.isUrlBlocked(u))
-                    .filter(u -> !urlBlocklist.isMailingListLink(u))
-                    .filter(known::add)
-                    .ifPresent(queue::addLast);
+            linkParser.parseFrame(baseUrl, link).ifPresent(this::addToQueue);
         }
     }
 
+    private void addToQueue(EdgeUrl url) {
+        if (!isSameDomain(url))
+            return;
+        if (urlBlocklist.isUrlBlocked(url))
+            return;
+        if (urlBlocklist.isMailingListLink(url))
+            return;
+        // reduce memory usage by not growing queue huge when crawling large sites
+        if (queue.size() + visited.size() >= depth + 100)
+            return;
+
+        if (known.add(url.toString())) {
+            queue.addLast(url);
+        }
+    }
+
@@ -284,13 +304,24 @@ public class CrawlerRetreiver {
             if (spentTime > sleepTime)
                 return;
 
-            Thread.sleep(Math.min(sleepTime-spentTime, 5000));
+            Thread.sleep(min(sleepTime-spentTime, 5000));
         }
+        else if (slowDown) {
+            Thread.sleep( 1000);
+        }
         else {
-            if (spentTime > DEFAULT_CRAWL_DELAY_MS)
+            // When no crawl delay is specified, lean toward twice the fetch+process time,
+            // within sane limits. This means slower servers get slower crawling, and faster
+            // servers get faster crawling.
+
+            sleepTime = spentTime * 2;
+            sleepTime = min(sleepTime, DEFAULT_CRAWL_DELAY_MAX_MS);
+            sleepTime = max(sleepTime, DEFAULT_CRAWL_DELAY_MIN_MS);
+
+            if (spentTime > sleepTime)
                 return;
 
-            Thread.sleep(DEFAULT_CRAWL_DELAY_MS - spentTime);
+            Thread.sleep(sleepTime-spentTime);
         }
     }
@@ -302,7 +333,14 @@ public class CrawlerRetreiver {
                 .crawlerStatus(CrawlerDocumentStatus.ROBOTS_TXT.name())
                 .build();
     }
 
+    private CrawledDocument createRetryError(EdgeUrl url) {
+        return CrawledDocument.builder()
+                .url(url.toString())
+                .timestamp(LocalDateTime.now().toString())
+                .httpStatus(429)
+                .crawlerStatus(CrawlerDocumentStatus.ERROR.name())
+                .build();
+    }
+
     private CrawledDomain createErrorPostFromStatus(HttpFetcher.FetchResult ret) {
         String ip = findIp(domain);
 
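The new delay policy is worth seeing as plain arithmetic: with no robots.txt crawl delay, the sleep target is twice the time spent fetching and processing, clamped to [250, 2500] ms. A standalone sketch of that computation (constants copied from the diff; the demo class is invented):

    import static java.lang.Math.max;
    import static java.lang.Math.min;

    public class CrawlDelayDemo {
        static final long DELAY_MIN_MS = 250;
        static final long DELAY_MAX_MS = 2500;

        // Returns how long to sleep after a fetch+process that took spentTime ms
        static long delayFor(long spentTime) {
            long sleepTime = spentTime * 2;
            sleepTime = min(sleepTime, DELAY_MAX_MS);
            sleepTime = max(sleepTime, DELAY_MIN_MS);

            return max(0, sleepTime - spentTime);
        }

        public static void main(String[] args) {
            System.out.println(delayFor(50));   // fast server: clamped up to 250ms target, sleep 200
            System.out.println(delayFor(800));  // slow server: 1600ms target, sleep 800
            System.out.println(delayFor(5000)); // very slow: clamped to 2500, target already passed, sleep 0
        }
    }

So slower servers get crawled more gently and fast servers are revisited sooner, without ever dropping below the quarter-second floor.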
@@ -0,0 +1,51 @@
+package nu.marginalia.wmsa.edge.crawling.retreival;
+
+import javax.net.SocketFactory;
+import java.io.IOException;
+import java.net.InetAddress;
+import java.net.Socket;
+
+public class FastTerminatingSocketFactory extends SocketFactory {
+    private static final SocketFactory delegate = SocketFactory.getDefault();
+
+    private void configure(Socket sock) throws IOException {
+        // Setting SO_LINGER to enabled but low reduces TIME_WAIT
+        // which can get pretty... bad when you're crawling
+        // and opening thousands of connections
+        sock.setSoLinger(true, 3);
+    }
+
+    public Socket createSocket() throws IOException {
+        var sock = delegate.createSocket();
+        configure(sock);
+        return sock;
+    }
+
+    @Override
+    public Socket createSocket(String host, int port) throws IOException {
+        var sock = delegate.createSocket(host, port);
+        configure(sock);
+        return sock;
+    }
+
+    @Override
+    public Socket createSocket(String host, int port, InetAddress localHost, int localPort) throws IOException {
+        var sock = delegate.createSocket(host, port, localHost, localPort);
+        configure(sock);
+        return sock;
+    }
+
+    @Override
+    public Socket createSocket(InetAddress host, int port) throws IOException {
+        var sock = delegate.createSocket(host, port);
+        configure(sock);
+        return sock;
+    }
+
+    @Override
+    public Socket createSocket(InetAddress address, int port, InetAddress localAddress, int localPort) throws IOException {
+        var sock = delegate.createSocket(address, port, localAddress, localPort);
+        configure(sock);
+        return sock;
+    }
+}
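For context on the SO_LINGER comment: the commit's rationale is that enabling linger with a low timeout keeps thousands of crawler connections from piling up in TIME_WAIT after close. A toy sketch of setting and reading back the option (the demo class is invented; the setting mirrors configure() above):

    import java.net.Socket;

    public class LingerDemo {
        public static void main(String[] args) throws Exception {
            try (Socket sock = new Socket()) {
                // Mirror FastTerminatingSocketFactory.configure(): linger on, 3 seconds
                sock.setSoLinger(true, 3);

                // getSoLinger() returns -1 when linger is disabled, else the timeout
                System.out.println("SO_LINGER = " + sock.getSoLinger()); // prints 3
            }
        }
    }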
@@ -13,10 +13,7 @@ import nu.marginalia.wmsa.edge.crawling.retreival.logic.ContentTypeLogic;
 import nu.marginalia.wmsa.edge.crawling.retreival.logic.ContentTypeParser;
 import nu.marginalia.wmsa.edge.model.EdgeDomain;
 import nu.marginalia.wmsa.edge.model.EdgeUrl;
-import okhttp3.Dispatcher;
-import okhttp3.OkHttpClient;
-import okhttp3.Request;
-import okhttp3.Response;
+import okhttp3.*;
 import org.apache.commons.io.input.BOMInputStream;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -66,15 +63,18 @@ public class HttpFetcher {
         }
     }
 
+    private static final FastTerminatingSocketFactory ftSocketFactory = new FastTerminatingSocketFactory();
     @SneakyThrows
-    private OkHttpClient createClient(Dispatcher dispatcher) {
+    private OkHttpClient createClient(Dispatcher dispatcher, ConnectionPool pool) {
         var builder = new OkHttpClient.Builder();
         if (dispatcher != null) {
             builder.dispatcher(dispatcher);
         }
 
         return builder.sslSocketFactory(NoSecuritySSL.buildSocketFactory(), (X509TrustManager) NoSecuritySSL.trustAllCerts[0])
+                .socketFactory(ftSocketFactory)
                 .hostnameVerifier(NoSecuritySSL.buildHostnameVerifyer())
+                .connectionPool(pool)
                 .cookieJar(cookies.getJar())
                 .followRedirects(true)
                 .followSslRedirects(true)
@@ -82,6 +82,7 @@ public class HttpFetcher {
                 .readTimeout(10, TimeUnit.SECONDS)
                 .writeTimeout(10, TimeUnit.SECONDS)
                 .build();
+
     }
 
     public List<String> getCookies() {
@@ -93,13 +94,13 @@ public class HttpFetcher {
     }
 
     @Inject
-    public HttpFetcher(@Named("user-agent") String userAgent, Dispatcher dispatcher) {
-        this.client = createClient(dispatcher);
+    public HttpFetcher(@Named("user-agent") String userAgent, Dispatcher dispatcher, ConnectionPool connectionPool) {
+        this.client = createClient(dispatcher, connectionPool);
         this.userAgent = userAgent;
     }
 
     public HttpFetcher(@Named("user-agent") String userAgent) {
-        this.client = createClient(null);
+        this.client = createClient(null, new ConnectionPool());
         this.userAgent = userAgent;
     }
 
@@ -141,7 +142,7 @@ public class HttpFetcher {
     }
 
     @SneakyThrows
-    public CrawledDocument fetchContent(EdgeUrl url) {
+    public CrawledDocument fetchContent(EdgeUrl url) throws RateLimitException {
 
         if (contentTypeLogic.isUrlLikeBinary(url)) {
             logger.debug("Probing suspected binary {}", url);
@@ -192,13 +193,17 @@ public class HttpFetcher {
                 .build();
     }
 
-    private CrawledDocument extractBody(EdgeUrl url, Response rsp) throws IOException, URISyntaxException {
+    private CrawledDocument extractBody(EdgeUrl url, Response rsp) throws IOException, URISyntaxException, RateLimitException {
 
         var responseUrl = new EdgeUrl(rsp.request().url().toString());
         if (!Objects.equals(responseUrl.domain, url.domain)) {
             return createRedirectResponse(url, rsp, responseUrl);
         }
 
+        if (rsp.code() == 429) {
+            throw new RateLimitException(rsp.header("Retry-After", "1000"));
+        }
+
        var body = rsp.body();
        if (null == body) {
            return createErrorResponse(url, rsp, CrawlerDocumentStatus.ERROR, "No body");
@@ -258,8 +263,6 @@ public class HttpFetcher {
 
     }
 
-
-
     public SimpleRobotRules fetchRobotRules(EdgeDomain domain) {
         return fetchRobotsForProto("https", domain)
                 .or(() -> fetchRobotsForProto("http", domain))
@@ -282,4 +285,5 @@ public class HttpFetcher {
                 doc.contentType,
                 userAgent);
     }
+
 }
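The constructor change means every fetcher spawned by CrawlerMain now shares one Dispatcher and one ConnectionPool instead of each building its own. A minimal sketch of why that matters, in plain OkHttp (the demo class is invented; the pool parameters are copied from CrawlerMain):

    import okhttp3.ConnectionPool;
    import okhttp3.Dispatcher;
    import okhttp3.OkHttpClient;

    import java.util.concurrent.TimeUnit;

    public class SharedClientDemo {
        public static void main(String[] args) {
            // One pool + dispatcher shared by every client, as CrawlerMain now does
            ConnectionPool pool = new ConnectionPool(5, 10, TimeUnit.SECONDS);
            Dispatcher dispatcher = new Dispatcher();

            OkHttpClient a = new OkHttpClient.Builder()
                    .dispatcher(dispatcher)
                    .connectionPool(pool)
                    .build();

            // Clients built from a shared configuration reuse sockets and
            // scheduling limits, so hundreds of fetchers don't each hold
            // their own set of idle connections
            OkHttpClient b = a.newBuilder().build();

            System.out.println(a.connectionPool() == b.connectionPool()); // true
        }
    }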
@@ -0,0 +1,21 @@
+package nu.marginalia.wmsa.edge.crawling.retreival;
+
+public class RateLimitException extends Exception {
+    private final String retryAfter;
+
+    public RateLimitException(String retryAfter) {
+        this.retryAfter = retryAfter;
+    }
+
+    @Override
+    public StackTraceElement[] getStackTrace() { return new StackTraceElement[0]; }
+
+    public int retryAfter() {
+        try {
+            return Integer.parseInt(retryAfter);
+        }
+        catch (NumberFormatException ex) {
+            return 1000;
+        }
+    }
+}
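One caveat on retryAfter(): HTTP allows Retry-After to carry either a delay in seconds or an HTTP-date, and the parser above treats any non-numeric value as the 1000 fallback (which the crawler then sleeps directly, as milliseconds). A hedged sketch of handling both header forms, not what this commit does, just the general technique:

    import java.time.Duration;
    import java.time.ZonedDateTime;
    import java.time.format.DateTimeFormatter;

    public class RetryAfterDemo {
        // Returns a delay in milliseconds for either form of Retry-After
        static long parseRetryAfter(String value, long fallbackMs) {
            try {
                // Numeric form: seconds to wait
                return Long.parseLong(value.trim()) * 1000;
            }
            catch (NumberFormatException ex) {
                try {
                    // Date form: an absolute time, e.g. "Wed, 21 Oct 2015 07:28:00 GMT"
                    ZonedDateTime when = ZonedDateTime.parse(value, DateTimeFormatter.RFC_1123_DATE_TIME);
                    return Math.max(0, Duration.between(ZonedDateTime.now(), when).toMillis());
                }
                catch (Exception ex2) {
                    return fallbackMs;
                }
            }
        }

        public static void main(String[] args) {
            System.out.println(parseRetryAfter("120", 1000));     // 120000
            System.out.println(parseRetryAfter("garbage", 1000)); // 1000
        }
    }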
@@ -0,0 +1,40 @@
+package nu.marginalia.wmsa.edge.search.command;
+
+import com.google.inject.Inject;
+import com.google.inject.Singleton;
+import nu.marginalia.wmsa.edge.data.dao.EdgeDataStoreDao;
+import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklist;
+import nu.marginalia.wmsa.edge.search.model.BrowseResultSet;
+import nu.marginalia.wmsa.edge.search.results.BrowseResultCleaner;
+import nu.marginalia.wmsa.renderer.mustache.MustacheRenderer;
+import nu.marginalia.wmsa.renderer.mustache.RendererFactory;
+import spark.Request;
+import spark.Response;
+
+import java.io.IOException;
+
+@Singleton
+public class IndexCommand {
+
+    private final EdgeDataStoreDao dataStoreDao;
+    private final BrowseResultCleaner browseResultCleaner;
+    private final MustacheRenderer<BrowseResultSet> template;
+    private final EdgeDomainBlacklist blacklist;
+    @Inject
+    public IndexCommand(EdgeDataStoreDao dataStoreDao, RendererFactory rendererFactory, BrowseResultCleaner browseResultCleaner, EdgeDomainBlacklist blacklist) throws IOException {
+        this.dataStoreDao = dataStoreDao;
+        this.browseResultCleaner = browseResultCleaner;
+
+        template = rendererFactory.renderer("edge/index");
+        this.blacklist = blacklist;
+    }
+
+    public String render(Request request, Response response) {
+        response.header("Cache-control", "public,max-age=3600");
+
+        var results = dataStoreDao.getRandomDomains(5, blacklist, 0);
+        results.removeIf(browseResultCleaner.shouldRemoveResultPredicate());
+
+        return template.render(new BrowseResultSet(results.stream().limit(1).toList()));
+    }
+}
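Note the over-fetch in render(): it asks for five random domains, lets the cleaner reject some, then shows only one. A tiny sketch of that pattern in isolation (the stand-in list and predicate are invented):

    import java.util.ArrayList;
    import java.util.List;

    public class OverfetchDemo {
        public static void main(String[] args) {
            // Stand-in for dataStoreDao.getRandomDomains(5, blacklist, 0)
            List<String> candidates = new ArrayList<>(
                    List.of("a.example", "spam.example", "b.example", "c.example", "d.example"));

            // Stand-in for browseResultCleaner.shouldRemoveResultPredicate()
            candidates.removeIf(domain -> domain.startsWith("spam."));

            // Over-fetching five and keeping one means a rejected candidate
            // still leaves something to show on the front page
            System.out.println(candidates.stream().limit(1).toList());
        }
    }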
@@ -22,7 +22,6 @@ import spark.Spark;
 import spark.resource.ClassPathResource;
 import spark.staticfiles.MimeType;
 
-import java.io.FileNotFoundException;
 import java.net.URLEncoder;
 import java.time.LocalDateTime;
 import java.time.ZoneOffset;
@@ -35,6 +34,7 @@ public class ResourceStoreService extends Service {
 
     private final AuthClient authClient;
     private final ResourceEntityStore resourceStore;
+    private StaticResources staticResources;
 
     @Inject
     public ResourceStoreService(@Named("service-host") String ip,
@@ -42,11 +42,13 @@ public class ResourceStoreService extends Service {
                                 AuthClient authClient,
                                 ResourceEntityStore resourceStore,
                                 Initialization initialization,
-                                MetricsServer metricsServer
+                                MetricsServer metricsServer,
+                                StaticResources staticResources
                                 ) {
         super(ip, port, initialization, metricsServer);
         this.authClient = authClient;
         this.resourceStore = resourceStore;
+        this.staticResources = staticResources;
 
         Schedulers.io().schedulePeriodicallyDirect(resourceStore::reapStaleResources,
                 5, 5, TimeUnit.MINUTES);
@@ -109,12 +111,9 @@ public class ResourceStoreService extends Service {
 
             return serveDynamic(data, request, response);
         }
-        else if (serveStatic(domain + "/" + resource, request, response)) {
-            logger.info("getResource({}/{}, static)", domain, resource);
-        }
         else {
-            logger.info("Could not serve {}/{}", domain, resource);
-            Spark.halt(404, "Not Found");
+            logger.info("getResource({}/{}, static)", domain, resource);
+            staticResources.serveStatic(domain, resource, request, response);
         }
         return "";
     }
@@ -138,19 +137,7 @@ public class ResourceStoreService extends Service {
         return data.data;
     }
 
-    @SneakyThrows
-    private boolean serveStatic(String path, Request req, Response rsp) {
-        try {
-            ClassPathResource resource = new ClassPathResource("static/" + path);
-            handleEtagStatic(resource, req, rsp);
-            resource.getInputStream().transferTo(rsp.raw().getOutputStream());
-        }
-        catch (IllegalArgumentException|FileNotFoundException ex) {
-            return false;
-        }
-
-        return true;
-    }
-
     @SneakyThrows
     private void handleEtag(RenderedResource page, Request req, Response rsp) {
 
@@ -0,0 +1,46 @@
+package nu.marginalia.wmsa.resource_store;
+
+import lombok.SneakyThrows;
+import spark.Request;
+import spark.Response;
+import spark.Spark;
+import spark.resource.ClassPathResource;
+import spark.staticfiles.MimeType;
+
+import java.io.FileNotFoundException;
+import java.time.LocalDateTime;
+import java.time.ZoneOffset;
+
+public class StaticResources {
+    private final long startTime = LocalDateTime.now().toEpochSecond(ZoneOffset.UTC);
+
+    @SneakyThrows
+    public void serveStatic(String domain, String path, Request req, Response rsp) {
+        try {
+            ClassPathResource resource = new ClassPathResource("static/" + domain + "/" + path);
+            handleEtagStatic(resource, req, rsp);
+            resource.getInputStream().transferTo(rsp.raw().getOutputStream());
+        }
+        catch (IllegalArgumentException | FileNotFoundException ex) {
+            Spark.halt(404);
+        }
+    }
+
+    @SneakyThrows
+    private void handleEtagStatic(ClassPathResource resource, Request req, Response rsp) {
+        rsp.header("Cache-Control", "public,max-age=3600");
+        rsp.type(MimeType.fromResource(resource));
+
+        final String etag = staticResourceEtag(resource.getFilename());
+
+        if (etag.equals(req.headers("If-None-Match"))) {
+            Spark.halt(304);
+        }
+
+        rsp.header("ETag", etag);
+    }
+
+    private String staticResourceEtag(String resource) {
+        return "\"" + resource.hashCode() + "-" + startTime + "\"";
+    }
+}
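The ETag here is derived from the resource name plus the service start time, so clients revalidate cheaply and every cached entry rolls over on redeploy. A client-side sketch of the revalidation round trip using java.net.http (the URL is a placeholder):

    import java.net.URI;
    import java.net.http.HttpClient;
    import java.net.http.HttpRequest;
    import java.net.http.HttpResponse;

    public class EtagRevalidationDemo {
        public static void main(String[] args) throws Exception {
            HttpClient client = HttpClient.newHttpClient();
            URI uri = URI.create("https://example.com/style-new.css"); // placeholder

            // First fetch: remember the ETag the server handed back
            HttpResponse<String> first = client.send(
                    HttpRequest.newBuilder(uri).build(),
                    HttpResponse.BodyHandlers.ofString());
            String etag = first.headers().firstValue("ETag").orElse("");

            // Revalidate: an unchanged resource comes back as 304 with no body
            HttpResponse<String> second = client.send(
                    HttpRequest.newBuilder(uri).header("If-None-Match", etag).build(),
                    HttpResponse.BodyHandlers.ofString());

            System.out.println(second.statusCode()); // 304 while the ETag still matches
        }
    }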
@@ -7,6 +7,12 @@ body {
     background-color: #f8f8ee;
 }
 
+.rightbox {
+    float: right;
+    display: block;
+    max-width: 40ch;
+    clear: both;
+}
+
 .sticker {
     ruby-position: under;
@@ -70,6 +76,9 @@ ul.semantic-results a {
 
 article > section > p { display: none; }
 
+.cards.big .card { flex-grow: 1 }
+.cards.big { padding-right: 1ch; }
+
 .w3m-helper {
     display: none;
 }
@@ -296,6 +305,7 @@ select {
 }
 
 footer {
+    clear: both;
     padding: 2ch;
     margin: 16ch 0px 0px 0px;
     background-color: #acae89;
@@ -337,7 +347,7 @@ a.underline {
 }
 
 @media only screen and (max-device-width: 1024px) {
-
+    .rightbox { width: 30ch !important; }
     .card {
         margin-right: 2ch;
     }
@@ -355,6 +365,7 @@ a.underline {
 }
 
 @media only screen and (max-device-width: 800px) {
+    .rightbox { display: none; }
     .search-box {
         flex-direction: column;
     }
@@ -0,0 +1,12 @@
+<section class="card browse-result rightbox">
+    <h2>{{url.domain}}</h2>
+
+    <a href="{{url.proto}}://{{url.domain}}/">
+        <img src="/screenshot/{{domainId}}" title="{{description}}" loading="lazy"/>
+    </a>
+
+    <div class="utils">
+        <a href="/site/{{url.domain}}">Info</a>
+        <a href="/explore/{{url.domain}}">Similar Domains</a>
+    </div>
+</section>
marginalia_nu/src/main/resources/templates/edge/index.hdb (new file, 130 lines)

@@ -0,0 +1,130 @@
+<!DOCTYPE html>
+<html>
+<head>
+    <meta charset="UTF-8">
+    <title>Marginalia Search</title>
+
+    <link rel="stylesheet" href="/style-new.css" />
+    <link rel="search" type="application/opensearchdescription+xml" href="/opensearch.xml" title="Marginalia">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
+    <meta name="robots" content="noindex" />
+
+    <meta property="og:description" content="search.marginalia.nu is a small independent do-it-yourself search engine for surprising but content-rich websites that never ask you to accept cookies or subscribe to newsletters. The goal is to bring you the sort of grass fed, free range HTML your grandma used to write." />
+    <meta property="og:locale" content="en_US" />
+    <meta property="og:site_name" content="search.marginalia.nu" />
+    <meta property="og:type" content="website" />
+    <meta property="og:url" content="https://search.marginalia.nu/" />
+</head>
+
+<body>
+
+<!-- Hi there, fellow human being :-) -->
+
+{{>edge/parts/search-header}}
+
+<article>
+    {{>edge/parts/search-form}}
+
+    <section class="card rightbox">
+        <h2>Publicity, Discussion and Events</h2>
+        <div class="info">
+            <dl>
+                <dt><a href="https://www.deutschlandfunkkultur.de/google-suche-100.html">Kritik an Googles Suche - Platzhirsch auf dem Nebenschauplatz</a></dt>
+                <dd>Deutschlandfunk Kultur 🇩🇪, 2022-08-18</dd>
+                <dt><a href="https://news.ycombinator.com/item?id=31536626" rel="nofollow">Marginalia Goes Open Source</a></dt>
+                <dd>Hacker News, 2022-05-28</dd>
+                <dt><a href="https://www.youtube.com/watch?v=rTSEr0cRJY8" rel="nofollow">You Should Check Out the Indie Web</a> 🎞️</dt>
+                <dd>YouTube, You've Got Kat, 2022-03-15</dd>
+                <dt>
+                    <a href="https://www.newyorker.com/culture/infinite-scroll/what-google-search-isnt-showing-you" rel="nofollow">What Google Search Isn't Showing You</a>
+                </dt>
+                <dd>The New Yorker 🎩, 2022-03-10</dd>
+                <dt>
+                    <a href="https://www.metafilter.com/194653/Marginalia-Search-Serendipity-Engineering" rel="nofollow">Marginalia Search - Serendipity Engineering</a>
+                </dt>
+                <dd>MetaFilter, 2022-03-09</dd>
+                <dt>
+                    🎂 <a href="https://memex.marginalia.nu/log/49-marginalia-1-year.gmi">First anniversary</a>! 🎊
+                </dt>
+                <dd>
+                    2022-02-26
+                </dd>
+                <dt>
+                    <a href="https://onezero.medium.com/a-search-engine-designed-to-surprise-you-b81944ed5c06" rel="nofollow">A Search Engine Designed To Surprise You</a>
+                </dt>
+                <dd>Clive Thompson OneZero, 2021-09-16</dd>
+                <dt>
+                    <a href="https://news.ycombinator.com/item?id=28550764" rel="nofollow">A search engine that favors text-heavy sites and punishes modern web design</a>
+                </dt>
+                <dd>
+                    Hacker News, 2021-09-16
+                </dd>
+            </dl>
+        </div>
+    </section>
+
+    <div class="cards big">
+        <section class="card">
+            <h2>About</h2>
+            <div class="info">
+                <p>This is an independent DIY search engine that focuses on non-commercial content, and attempts to
+                    show you sites you perhaps weren't aware of in favor of the sort of sites you probably already knew
+                    existed.</p>
+                <p>
+                    The software for this search engine is all custom-built, and all crawling and indexing is
+                    done in-house. The project is open source. Feel free to poke about in the <a
+                    href="https://git.marginalia.nu/marginalia/marginalia.nu">source code</a> or contribute
+                    to the development!
+                </p>
+                <p>Consider <a href="https://memex.marginalia.nu/projects/edge/supporting.gmi">supporting the
+                    project</a>!</p>
+            </div>
+            <div class="utils">
+                <a href="https://memex.marginalia.nu/projects/edge/about.gmi">Read More</a>
+            </div>
+        </section>
+
+        <section class="card">
+            <h2>Tips</h2>
+            <div class="info">
+                <p>
+                    This search engine isn't particularly well equipped to answer queries
+                    posed like questions; instead, try to imagine some text that might appear
+                    in the website you are looking for, and search for that.</p>
+                <p>
+                    Where this search engine really shines is finding small, old and obscure websites about some
+                    given topic, perhaps
+                    <a href="https://search.marginalia.nu/search?query=commander+keen&profile=yolo&js=default">old video games</a>,
+                    <a href="https://search.marginalia.nu/search?query=voynich+&profile=yolo&js=default">a mystery</a>,
+                    <a href="https://search.marginalia.nu/search?query=augustine+confessions&profile=yolo&js=default">theology</a>,
+                    <a href="https://search.marginalia.nu/search?query=Hermes+Trismegistus&profile=yolo&js=default">the occult</a>,
+                    <a href="https://search.marginalia.nu/search?query=knitting&profile=yolo&js=default">knitting</a>,
+                    <a href="https://search.marginalia.nu/search?query=scc+graph+algorithm&profile=yolo&js=default">computer science</a>,
+                    or <a href="https://search.marginalia.nu/search?query=salvador+dali&profile=yolo&js=default">art</a>.
+                </p>
+            </div>
+            <div class="utils">
+                <a href="https://memex.marginalia.nu/projects/edge/search-tips.gmi">Additional Tips</a>
+            </div>
+        </section>
+
+        <section class="card">
+            <h2>Updates</h2>
+            <div class="info">
+                <p>☛ A recipe filter has been added to the algorithm selector.</p>
+                <p>☛ The <a href="https://search.marginalia.nu/explore/random">Random Mode</a> has been overhauled, and is
+                    quite entertaining. I encourage you to give it a spin.</p>
+                <p>☛ A simple <a href="https://api.marginalia.nu/">public API</a> is now available.</p>
+            </div>
+            <div class="utils">
+                <a href="https://memex.marginalia.nu/projects/edge/changelog.gmi">Change Log</a>
+            </div>
+        </section>
+
+    </div>
+</article>
+
+{{>edge/parts/search-footer}}
+</body>
@@ -18,7 +18,9 @@
 
 <article>
     {{>edge/parts/search-form}}
     <hr class="w3m-helper" />
 
+    {{#each domainResults}}{{>edge/browse-result-rb}}{{/each}}
+
     <section class="cards">
         {{#if maintenanceMessage}}<section class="card problems onlyscreen"><h2>Maintenance</h2><p class="description">{{maintenanceMessage}}</p></section>{{/if}}
         {{#if evalResult}}<section class="card semantic onlyscreen"><h2>Evaluation</h2><p class="description">{{query}} = {{evalResult}}</p><hr class="w3m-helper" /></section>{{/if}}
@@ -37,7 +39,6 @@
         </section>
         {{/if}}
 
-        {{#each domainResults}}{{>edge/browse-result}}{{/each}}
         {{#each results}}{{>edge/search-result}}{{/each}}
 
         {{#unless evalResult}}{{#if problems}}<section class="card problems onlyscreen"><h2>Suggestions</h2><ul class="onlyscreen search-problems">{{#each problems}}<li>{{{.}}}</li>{{/each}}</ul></section> {{/if}}{{/unless}}
@@ -3,6 +3,7 @@ package nu.marginalia.wmsa.edge.crawling;
 import lombok.SneakyThrows;
 import nu.marginalia.wmsa.edge.crawling.retreival.HttpFetcher;
 import nu.marginalia.wmsa.edge.crawling.retreival.HttpRedirectResolver;
+import nu.marginalia.wmsa.edge.crawling.retreival.RateLimitException;
 import nu.marginalia.wmsa.edge.crawling.retreival.logic.ContentTypeLogic;
 import nu.marginalia.wmsa.edge.model.EdgeUrl;
 import org.junit.jupiter.api.Assertions;
@@ -27,14 +28,14 @@ class HttpFetcherTest {
     }
 
     @Test
-    void fetchUTF8() throws URISyntaxException {
+    void fetchUTF8() throws URISyntaxException, RateLimitException {
         var fetcher = new HttpFetcher("nu.marginalia.edge-crawler");
         var str = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu"));
         System.out.println(str.contentType);
     }
 
     @Test
-    void fetchText() throws URISyntaxException {
+    void fetchText() throws URISyntaxException, RateLimitException {
         var fetcher = new HttpFetcher("nu.marginalia.edge-crawler");
         var str = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu/robots.txt"));
         System.out.println(str);
@@ -14,7 +14,6 @@ import org.slf4j.LoggerFactory;
 import spark.Spark;
 
 import java.io.File;
-import java.io.IOException;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.time.LocalDateTime;
@@ -41,7 +40,7 @@ class ResourceStoreServiceTest {
         tempDir = Files.createTempDirectory("ResourceStoreServiceTest");
         resourceStore = new ResourceEntityStore(tempDir);
         service = new ResourceStoreService("127.0.0.1", testPort, null,
-                resourceStore, new Initialization(), null);
+                resourceStore, new Initialization(), null, new StaticResources());
 
         Spark.awaitInitialization();
     }