Reduce resource consumption during crawling,
reduce TIME_WAIT sockets with a custom socket factory.
This commit is contained in:
parent
6fc72b3eb8
commit
db4cf70784
@ -8,6 +8,7 @@ import nu.marginalia.wmsa.edge.crawling.model.CrawlingSpecification;
|
||||
import nu.marginalia.wmsa.edge.crawling.retreival.CrawlerRetreiver;
|
||||
import nu.marginalia.wmsa.edge.crawling.retreival.HttpFetcher;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeCrawlPlan;
|
||||
import okhttp3.ConnectionPool;
|
||||
import okhttp3.Dispatcher;
|
||||
import okhttp3.internal.Util;
|
||||
import org.slf4j.Logger;
|
||||
@ -25,6 +26,8 @@ public class CrawlerMain implements AutoCloseable {
|
||||
|
||||
private final WorkLog workLog;
|
||||
|
||||
private final ConnectionPool connectionPool = new ConnectionPool(5, 10, TimeUnit.SECONDS);
|
||||
|
||||
private final Dispatcher dispatcher = new Dispatcher(new ThreadPoolExecutor(0, Integer.MAX_VALUE, 5, TimeUnit.SECONDS,
|
||||
new SynchronousQueue<>(), Util.threadFactory("OkHttp Dispatcher", true)));
|
||||
|
||||
@ -67,9 +70,10 @@ public class CrawlerMain implements AutoCloseable {
|
||||
if (workLog.isJobFinished(specification.id))
|
||||
return;
|
||||
|
||||
var fetcher = new HttpFetcher(userAgent.uaString(), dispatcher);
|
||||
|
||||
try (var writer = new CrawledDomainWriter(crawlDataDir, specification.domain, specification.id)) {
|
||||
HttpFetcher fetcher = new HttpFetcher(userAgent.uaString(), dispatcher, connectionPool);
|
||||
try (CrawledDomainWriter writer = new CrawledDomainWriter(crawlDataDir, specification.domain, specification.id))
|
||||
{
|
||||
var retreiver = new CrawlerRetreiver(fetcher, specification, writer);
|
||||
|
||||
int size = retreiver.fetch();
|
||||
|
@ -0,0 +1,51 @@
|
||||
package nu.marginalia.wmsa.edge.crawling.retreival;
|
||||
|
||||
import javax.net.SocketFactory;
|
||||
import java.io.IOException;
|
||||
import java.net.InetAddress;
|
||||
import java.net.Socket;
|
||||
|
||||
/**
 * A {@link SocketFactory} that hands out sockets from the JVM default factory
 * after enabling SO_LINGER with a short timeout on each of them.
 *
 * <p>The intent is to cut down on the number of sockets left in TIME_WAIT,
 * which can pile up when a crawler opens thousands of short-lived connections.
 *
 * <p>If a socket cannot be configured it is closed before the failure is
 * propagated, so a failed call never leaks an open socket.
 */
public class FastTerminatingSocketFactory extends SocketFactory {
    private static final SocketFactory delegate = SocketFactory.getDefault();

    /** Linger timeout, in seconds, applied to every socket this factory creates. */
    private static final int LINGER_SECONDS = 3;

    /**
     * Enables SO_LINGER with a low timeout on the given socket.
     *
     * @param sock a socket freshly obtained from the delegate factory
     * @return the same socket, configured
     * @throws IOException if the option cannot be set; the socket is closed first
     */
    private Socket configure(Socket sock) throws IOException {
        try {
            // Setting SO_LINGER to enabled but low is meant to reduce TIME_WAIT,
            // which can get pretty bad when crawling and opening thousands
            // of connections
            sock.setSoLinger(true, LINGER_SECONDS);
            return sock;
        } catch (IOException | RuntimeException e) {
            // The caller never sees this socket, so nobody else can close it.
            try {
                sock.close();
            } catch (IOException closeFailure) {
                e.addSuppressed(closeFailure);
            }
            throw e;
        }
    }

    /**
     * Creates an unconnected socket with the linger option applied.
     *
     * @return a configured, unconnected socket
     * @throws IOException if the socket cannot be created or configured
     */
    @Override
    public Socket createSocket() throws IOException {
        return configure(delegate.createSocket());
    }

    /**
     * Creates a socket connected to the named host and port, with the linger option applied.
     *
     * @throws IOException if the socket cannot be created, connected or configured
     */
    @Override
    public Socket createSocket(String host, int port) throws IOException {
        return configure(delegate.createSocket(host, port));
    }

    /**
     * Creates a socket connected to the named host and port and bound to the given
     * local address and port, with the linger option applied.
     *
     * @throws IOException if the socket cannot be created, connected or configured
     */
    @Override
    public Socket createSocket(String host, int port, InetAddress localHost, int localPort) throws IOException {
        return configure(delegate.createSocket(host, port, localHost, localPort));
    }

    /**
     * Creates a socket connected to the given address and port, with the linger option applied.
     *
     * @throws IOException if the socket cannot be created, connected or configured
     */
    @Override
    public Socket createSocket(InetAddress host, int port) throws IOException {
        return configure(delegate.createSocket(host, port));
    }

    /**
     * Creates a socket connected to the given address and port and bound to the given
     * local address and port, with the linger option applied.
     *
     * @throws IOException if the socket cannot be created, connected or configured
     */
    @Override
    public Socket createSocket(InetAddress address, int port, InetAddress localAddress, int localPort) throws IOException {
        return configure(delegate.createSocket(address, port, localAddress, localPort));
    }
}
|
@ -13,10 +13,7 @@ import nu.marginalia.wmsa.edge.crawling.retreival.logic.ContentTypeLogic;
|
||||
import nu.marginalia.wmsa.edge.crawling.retreival.logic.ContentTypeParser;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeDomain;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeUrl;
|
||||
import okhttp3.Dispatcher;
|
||||
import okhttp3.OkHttpClient;
|
||||
import okhttp3.Request;
|
||||
import okhttp3.Response;
|
||||
import okhttp3.*;
|
||||
import org.apache.commons.io.input.BOMInputStream;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
@ -66,15 +63,18 @@ public class HttpFetcher {
|
||||
}
|
||||
}
|
||||
|
||||
private static final FastTerminatingSocketFactory ftSocketFactory = new FastTerminatingSocketFactory();
|
||||
@SneakyThrows
|
||||
private OkHttpClient createClient(Dispatcher dispatcher) {
|
||||
private OkHttpClient createClient(Dispatcher dispatcher, ConnectionPool pool) {
|
||||
var builder = new OkHttpClient.Builder();
|
||||
if (dispatcher != null) {
|
||||
builder.dispatcher(dispatcher);
|
||||
}
|
||||
|
||||
return builder.sslSocketFactory(NoSecuritySSL.buildSocketFactory(), (X509TrustManager) NoSecuritySSL.trustAllCerts[0])
|
||||
.socketFactory(ftSocketFactory)
|
||||
.hostnameVerifier(NoSecuritySSL.buildHostnameVerifyer())
|
||||
.connectionPool(pool)
|
||||
.cookieJar(cookies.getJar())
|
||||
.followRedirects(true)
|
||||
.followSslRedirects(true)
|
||||
@ -82,6 +82,7 @@ public class HttpFetcher {
|
||||
.readTimeout(10, TimeUnit.SECONDS)
|
||||
.writeTimeout(10, TimeUnit.SECONDS)
|
||||
.build();
|
||||
|
||||
}
|
||||
|
||||
public List<String> getCookies() {
|
||||
@ -93,13 +94,13 @@ public class HttpFetcher {
|
||||
}
|
||||
|
||||
@Inject
|
||||
public HttpFetcher(@Named("user-agent") String userAgent, Dispatcher dispatcher) {
|
||||
this.client = createClient(dispatcher);
|
||||
public HttpFetcher(@Named("user-agent") String userAgent, Dispatcher dispatcher, ConnectionPool connectionPool) {
|
||||
this.client = createClient(dispatcher, connectionPool);
|
||||
this.userAgent = userAgent;
|
||||
}
|
||||
|
||||
public HttpFetcher(@Named("user-agent") String userAgent) {
|
||||
this.client = createClient(null);
|
||||
this.client = createClient(null, new ConnectionPool());
|
||||
this.userAgent = userAgent;
|
||||
}
|
||||
|
||||
@ -262,8 +263,6 @@ public class HttpFetcher {
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
public SimpleRobotRules fetchRobotRules(EdgeDomain domain) {
|
||||
return fetchRobotsForProto("https", domain)
|
||||
.or(() -> fetchRobotsForProto("http", domain))
|
||||
@ -286,4 +285,5 @@ public class HttpFetcher {
|
||||
doc.contentType,
|
||||
userAgent);
|
||||
}
|
||||
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user