Merge pull request 'Crawler fixes, better stylesheet for search' (#104) from master into release

Reviewed-on: https://git.marginalia.nu/marginalia/marginalia.nu/pulls/104
This commit is contained in:
Viktor Lofgren 2022-08-24 19:30:50 +02:00
commit be56852c19
4 changed files with 77 additions and 10 deletions

View File

@ -33,7 +33,7 @@ public class CrawlerMain implements AutoCloseable {
private final UserAgent userAgent;
private final ThreadPoolExecutor pool;
final int poolSize = 512;
final int poolSize = Integer.getInteger("crawler.pool-size", 512);
final int poolQueueSize = 32;
public CrawlerMain(EdgeCrawlPlan plan) throws Exception {
@ -72,8 +72,7 @@ public class CrawlerMain implements AutoCloseable {
HttpFetcher fetcher = new HttpFetcher(userAgent.uaString(), dispatcher, connectionPool);
try (CrawledDomainWriter writer = new CrawledDomainWriter(crawlDataDir, specification.domain, specification.id))
{
try (CrawledDomainWriter writer = new CrawledDomainWriter(crawlDataDir, specification.domain, specification.id)) {
var retreiver = new CrawlerRetreiver(fetcher, specification, writer);
int size = retreiver.fetch();

View File

@ -13,6 +13,7 @@ import nu.marginalia.wmsa.edge.crawling.retreival.logic.ContentTypeLogic;
import nu.marginalia.wmsa.edge.crawling.retreival.logic.ContentTypeParser;
import nu.marginalia.wmsa.edge.model.EdgeDomain;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import nu.marginalia.wmsa.edge.model.crawl.EdgeContentType;
import okhttp3.*;
import org.apache.commons.io.input.BOMInputStream;
import org.slf4j.Logger;
@ -20,9 +21,12 @@ import org.slf4j.LoggerFactory;
import javax.net.ssl.X509TrustManager;
import java.io.IOException;
import java.net.SocketTimeoutException;
import java.net.URISyntaxException;
import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.StandardCharsets;
import java.nio.charset.UnsupportedCharsetException;
import java.time.LocalDateTime;
import java.util.List;
import java.util.Objects;
@ -122,6 +126,7 @@ public class HttpFetcher {
return new FetchResult(FetchResultState.OK, requestDomain);
}
catch (Exception ex) {
logger.debug("Error during fetching {}[{}]", ex.getClass().getSimpleName(), ex.getMessage());
return new FetchResult(FetchResultState.ERROR, url.domain);
}
}
@ -156,7 +161,11 @@ public class HttpFetcher {
return createErrorResponse(url, rsp, CrawlerDocumentStatus.BAD_CONTENT_TYPE, "Early probe failed");
}
}
catch (SocketTimeoutException ex) {
return createTimeoutErrorRsp(url, ex);
}
catch (Exception ex) {
logger.error("Error during fetching {}[{}]", ex.getClass().getSimpleName(), ex.getMessage());
return createHardErrorRsp(url, ex);
}
}
@ -167,7 +176,17 @@ public class HttpFetcher {
try (var rsp = call.execute()) {
return extractBody(url, rsp);
}
catch (RateLimitException rle) {
throw rle;
}
catch (SocketTimeoutException ex) {
return createTimeoutErrorRsp(url, ex);
}
catch (IllegalCharsetNameException ex) {
return createHardErrorRsp(url, ex);
}
catch (Exception ex) {
logger.error("Error during fetching {}[{}]", ex.getClass().getSimpleName(), ex.getMessage());
return createHardErrorRsp(url, ex);
}
}
@ -180,7 +199,14 @@ public class HttpFetcher {
.url(url.toString())
.build();
}
/**
 * Builds the document recorded for a fetch that ended in a timeout.
 *
 * @param url the URL that was being fetched
 * @param why the exception raised by the timeout; its message is stored as the status description
 * @return a document with crawler status {@code "Timeout"}, stamped with the current local time
 */
private CrawledDocument createTimeoutErrorRsp(EdgeUrl url, Exception why) {
    final String fetchedAt = LocalDateTime.now().toString();
    final String reason = why.getMessage();

    return CrawledDocument.builder()
            .url(url.toString())
            .timestamp(fetchedAt)
            .crawlerStatusDesc(reason)
            .crawlerStatus("Timeout")
            .build();
}
private CrawledDocument createErrorResponse(EdgeUrl url, Response rsp, CrawlerDocumentStatus status, String why) {
return CrawledDocument.builder()
.crawlerStatus(status.toString())
@ -234,7 +260,7 @@ public class HttpFetcher {
return createErrorResponse(url, rsp, CrawlerDocumentStatus.BAD_CHARSET, "");
}
var strData = new String(data, Charset.forName(contentType.charset));
var strData = getStringData(data, contentType);
var canonical = rsp.header("rel=canonical", "");
return CrawledDocument.builder()
@ -249,6 +275,24 @@ public class HttpFetcher {
.build();
}
/**
 * Decodes a raw response body using the charset named by the content type,
 * falling back to a default charset when that name cannot be used.
 *
 * @param data        the raw bytes of the response body
 * @param contentType the content type whose {@code charset} field names the encoding
 * @return the body decoded as a String
 */
private String getStringData(byte[] data, EdgeContentType contentType) {
    Charset charset;
    try {
        charset = Charset.forName(contentType.charset);
    }
    catch (UnsupportedCharsetException ex) {
        // The name is well-formed but this JVM has no such charset. This is
        // usually something like Macintosh Latin
        // (https://en.wikipedia.org/wiki/Macintosh_Latin_encoding),
        // which is close enough to ISO-8859-1 to serve.
        charset = StandardCharsets.ISO_8859_1;
    }
    catch (IllegalArgumentException ex) {
        // Covers IllegalCharsetNameException (malformed name) as well as the plain
        // IllegalArgumentException that Charset.forName throws for a null name.
        // This clause must stay after UnsupportedCharsetException, which is a subclass.
        charset = StandardCharsets.UTF_8;
    }
    return new String(data, charset);
}
private CrawledDocument createRedirectResponse(EdgeUrl url, Response rsp, EdgeUrl responseUrl) {
return CrawledDocument.builder()

View File

@ -8,7 +8,6 @@ import lombok.SneakyThrows;
import nu.marginalia.wmsa.client.exception.NetworkException;
import nu.marginalia.wmsa.edge.converting.processor.logic.LinkParser;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import okhttp3.Call;
import okhttp3.OkHttpClient;
import okhttp3.Request;
import okhttp3.Response;
@ -16,6 +15,7 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.net.ssl.X509TrustManager;
import java.io.IOException;
import java.util.concurrent.TimeUnit;
// TODO: Is this used?
@ -64,10 +64,12 @@ public class HttpRedirectResolver {
.addHeader("Accept-Encoding", "gzip")
.build();
return Observable.just(client.newCall(head))
.map(Call::execute)
.flatMap(data -> resolveRedirects(depth, url, data))
.timeout(10, TimeUnit.SECONDS);
var call = client.newCall(head);
try (var rsp = call.execute()) {
return resolveRedirects(depth, url, rsp);
} catch (IOException e) {
return Observable.error(e);
}
}
@SneakyThrows

View File

@ -1,4 +1,18 @@
/* If you need to borrow something from below, that's fine */
/* Links inside elements with class "extra": gradient fill, rounded beveled border, kept on a single line. */
.extra a {
    background: #ccc linear-gradient(45deg, rgba(255,220,220,1) 0%, rgba(219,255,196,1) 50%, rgba(212,216,255,1) 100%);
    color: #000;
    padding: 0.5ch;
    border-radius: 0.5ch;
    text-decoration: none;
    border: 3px outset #000;
    /* "none" is not a valid word-break value, so browsers discarded the declaration; "normal" is the valid keyword. */
    word-break: normal;
    white-space: nowrap;
}
/* While the link is being activated, use an inset border in place of the outset one set on ".extra a". */
.extra a:active {
border: 3px inset #000;
}
body {
margin: 0px;
@ -395,6 +409,14 @@ a.underline {
/* https://www.youtube.com/watch?v=v0nmHymgM7Y */
@media (prefers-color-scheme: dark) {
/* Dark color scheme override for ".extra a": darker gradient on black, bold white text. */
.extra a {
background: #000 linear-gradient(45deg, rgba(135,93,93,1) 0%, rgba(106,135,87,1) 50%, rgba(76,83,118,1) 100%);
font-weight: bold;
color: #fff;
border: 3px outset #000;
}
a {
color: #acf;
}