Merge pull request 'Crawler fixes, better stylesheet for search' (#104) from master into release
Reviewed-on: https://git.marginalia.nu/marginalia/marginalia.nu/pulls/104
This commit is contained in:
commit
be56852c19
@ -33,7 +33,7 @@ public class CrawlerMain implements AutoCloseable {
|
||||
|
||||
private final UserAgent userAgent;
|
||||
private final ThreadPoolExecutor pool;
|
||||
final int poolSize = 512;
|
||||
final int poolSize = Integer.getInteger("crawler.pool-size", 512);
|
||||
final int poolQueueSize = 32;
|
||||
|
||||
public CrawlerMain(EdgeCrawlPlan plan) throws Exception {
|
||||
@ -72,8 +72,7 @@ public class CrawlerMain implements AutoCloseable {
|
||||
|
||||
|
||||
HttpFetcher fetcher = new HttpFetcher(userAgent.uaString(), dispatcher, connectionPool);
|
||||
try (CrawledDomainWriter writer = new CrawledDomainWriter(crawlDataDir, specification.domain, specification.id))
|
||||
{
|
||||
try (CrawledDomainWriter writer = new CrawledDomainWriter(crawlDataDir, specification.domain, specification.id)) {
|
||||
var retreiver = new CrawlerRetreiver(fetcher, specification, writer);
|
||||
|
||||
int size = retreiver.fetch();
|
||||
|
@ -13,6 +13,7 @@ import nu.marginalia.wmsa.edge.crawling.retreival.logic.ContentTypeLogic;
|
||||
import nu.marginalia.wmsa.edge.crawling.retreival.logic.ContentTypeParser;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeDomain;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeUrl;
|
||||
import nu.marginalia.wmsa.edge.model.crawl.EdgeContentType;
|
||||
import okhttp3.*;
|
||||
import org.apache.commons.io.input.BOMInputStream;
|
||||
import org.slf4j.Logger;
|
||||
@ -20,9 +21,12 @@ import org.slf4j.LoggerFactory;
|
||||
|
||||
import javax.net.ssl.X509TrustManager;
|
||||
import java.io.IOException;
|
||||
import java.net.SocketTimeoutException;
|
||||
import java.net.URISyntaxException;
|
||||
import java.nio.charset.Charset;
|
||||
import java.nio.charset.IllegalCharsetNameException;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.nio.charset.UnsupportedCharsetException;
|
||||
import java.time.LocalDateTime;
|
||||
import java.util.List;
|
||||
import java.util.Objects;
|
||||
@ -122,6 +126,7 @@ public class HttpFetcher {
|
||||
return new FetchResult(FetchResultState.OK, requestDomain);
|
||||
}
|
||||
catch (Exception ex) {
|
||||
logger.debug("Error during fetching {}[{}]", ex.getClass().getSimpleName(), ex.getMessage());
|
||||
return new FetchResult(FetchResultState.ERROR, url.domain);
|
||||
}
|
||||
}
|
||||
@ -156,7 +161,11 @@ public class HttpFetcher {
|
||||
return createErrorResponse(url, rsp, CrawlerDocumentStatus.BAD_CONTENT_TYPE, "Early probe failed");
|
||||
}
|
||||
}
|
||||
catch (SocketTimeoutException ex) {
|
||||
return createTimeoutErrorRsp(url, ex);
|
||||
}
|
||||
catch (Exception ex) {
|
||||
logger.error("Error during fetching {}[{}]", ex.getClass().getSimpleName(), ex.getMessage());
|
||||
return createHardErrorRsp(url, ex);
|
||||
}
|
||||
}
|
||||
@ -167,7 +176,17 @@ public class HttpFetcher {
|
||||
try (var rsp = call.execute()) {
|
||||
return extractBody(url, rsp);
|
||||
}
|
||||
catch (RateLimitException rle) {
|
||||
throw rle;
|
||||
}
|
||||
catch (SocketTimeoutException ex) {
|
||||
return createTimeoutErrorRsp(url, ex);
|
||||
}
|
||||
catch (IllegalCharsetNameException ex) {
|
||||
return createHardErrorRsp(url, ex);
|
||||
}
|
||||
catch (Exception ex) {
|
||||
logger.error("Error during fetching {}[{}]", ex.getClass().getSimpleName(), ex.getMessage());
|
||||
return createHardErrorRsp(url, ex);
|
||||
}
|
||||
}
|
||||
@ -180,7 +199,14 @@ public class HttpFetcher {
|
||||
.url(url.toString())
|
||||
.build();
|
||||
}
|
||||
|
||||
private CrawledDocument createTimeoutErrorRsp(EdgeUrl url, Exception why) {
|
||||
return CrawledDocument.builder()
|
||||
.crawlerStatus("Timeout")
|
||||
.crawlerStatusDesc(why.getMessage())
|
||||
.timestamp(LocalDateTime.now().toString())
|
||||
.url(url.toString())
|
||||
.build();
|
||||
}
|
||||
private CrawledDocument createErrorResponse(EdgeUrl url, Response rsp, CrawlerDocumentStatus status, String why) {
|
||||
return CrawledDocument.builder()
|
||||
.crawlerStatus(status.toString())
|
||||
@ -234,7 +260,7 @@ public class HttpFetcher {
|
||||
return createErrorResponse(url, rsp, CrawlerDocumentStatus.BAD_CHARSET, "");
|
||||
}
|
||||
|
||||
var strData = new String(data, Charset.forName(contentType.charset));
|
||||
var strData = getStringData(data, contentType);
|
||||
var canonical = rsp.header("rel=canonical", "");
|
||||
|
||||
return CrawledDocument.builder()
|
||||
@ -249,6 +275,24 @@ public class HttpFetcher {
|
||||
.build();
|
||||
}
|
||||
|
||||
private String getStringData(byte[] data, EdgeContentType contentType) {
|
||||
Charset charset;
|
||||
try {
|
||||
charset = Charset.forName(contentType.charset);
|
||||
}
|
||||
catch (IllegalCharsetNameException ex) {
|
||||
charset = StandardCharsets.UTF_8;
|
||||
}
|
||||
catch (UnsupportedCharsetException ex) {
|
||||
// This is usually like Macintosh Latin
|
||||
// (https://en.wikipedia.org/wiki/Macintosh_Latin_encoding)
|
||||
//
|
||||
// It's close enough to 8859-1 to serve
|
||||
charset = StandardCharsets.ISO_8859_1;
|
||||
}
|
||||
return new String(data, charset);
|
||||
}
|
||||
|
||||
private CrawledDocument createRedirectResponse(EdgeUrl url, Response rsp, EdgeUrl responseUrl) {
|
||||
|
||||
return CrawledDocument.builder()
|
||||
|
@ -8,7 +8,6 @@ import lombok.SneakyThrows;
|
||||
import nu.marginalia.wmsa.client.exception.NetworkException;
|
||||
import nu.marginalia.wmsa.edge.converting.processor.logic.LinkParser;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeUrl;
|
||||
import okhttp3.Call;
|
||||
import okhttp3.OkHttpClient;
|
||||
import okhttp3.Request;
|
||||
import okhttp3.Response;
|
||||
@ -16,6 +15,7 @@ import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import javax.net.ssl.X509TrustManager;
|
||||
import java.io.IOException;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
// TODO: Is this used?
|
||||
@ -64,10 +64,12 @@ public class HttpRedirectResolver {
|
||||
.addHeader("Accept-Encoding", "gzip")
|
||||
.build();
|
||||
|
||||
return Observable.just(client.newCall(head))
|
||||
.map(Call::execute)
|
||||
.flatMap(data -> resolveRedirects(depth, url, data))
|
||||
.timeout(10, TimeUnit.SECONDS);
|
||||
var call = client.newCall(head);
|
||||
try (var rsp = call.execute()) {
|
||||
return resolveRedirects(depth, url, rsp);
|
||||
} catch (IOException e) {
|
||||
return Observable.error(e);
|
||||
}
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
|
@ -1,4 +1,18 @@
|
||||
/* If you need to borrow something from below, that's fine */
|
||||
/* Button-like links inside .extra: gradient fill, rounded corners, raised
   (outset) border, and never wrapped across lines. */
.extra a {
    background: #ccc linear-gradient(45deg, rgba(255,220,220,1) 0%, rgba(219,255,196,1) 50%, rgba(212,216,255,1) 100%);
    color: #000;
    padding: 0.5ch;
    border-radius: 0.5ch;
    text-decoration: none;
    border: 3px outset #000;
    /* "none" is not a valid word-break value, so browsers discarded the
       declaration; "normal" is the valid keyword for default breaking. */
    word-break: normal;
    white-space: nowrap;
}
|
||||
|
||||
/* While pressed, the border switches from outset to inset. */
.extra a:active { border: 3px inset #000; }
|
||||
|
||||
body {
|
||||
margin: 0px;
|
||||
@ -395,6 +409,14 @@ a.underline {
|
||||
|
||||
/* https://www.youtube.com/watch?v=v0nmHymgM7Y */
|
||||
@media (prefers-color-scheme: dark) {
|
||||
/* Dark colour scheme: darker gradient with bold white text. */
.extra a {
    color: #fff;
    font-weight: bold;
    border: 3px outset #000;
    background: #000 linear-gradient(45deg, rgba(135,93,93,1) 0%, rgba(106,135,87,1) 50%, rgba(76,83,118,1) 100%);
}
|
||||
|
||||
|
||||
/* Link colour used under the dark colour scheme. */
a { color: #acf; }
|
||||
|
Loading…
Reference in New Issue
Block a user