Reduce resource consumption during crawling.

Reduce the number of TIME_WAIT sockets by using a custom socket
factory.
This commit is contained in:
vlofgren 2022-08-23 13:26:37 +02:00
parent 6fc72b3eb8
commit db4cf70784
3 changed files with 67 additions and 12 deletions

View File

@ -8,6 +8,7 @@ import nu.marginalia.wmsa.edge.crawling.model.CrawlingSpecification;
import nu.marginalia.wmsa.edge.crawling.retreival.CrawlerRetreiver;
import nu.marginalia.wmsa.edge.crawling.retreival.HttpFetcher;
import nu.marginalia.wmsa.edge.model.EdgeCrawlPlan;
import okhttp3.ConnectionPool;
import okhttp3.Dispatcher;
import okhttp3.internal.Util;
import org.slf4j.Logger;
@ -25,6 +26,8 @@ public class CrawlerMain implements AutoCloseable {
private final WorkLog workLog;
private final ConnectionPool connectionPool = new ConnectionPool(5, 10, TimeUnit.SECONDS);
private final Dispatcher dispatcher = new Dispatcher(new ThreadPoolExecutor(0, Integer.MAX_VALUE, 5, TimeUnit.SECONDS,
new SynchronousQueue<>(), Util.threadFactory("OkHttp Dispatcher", true)));
@ -67,9 +70,10 @@ public class CrawlerMain implements AutoCloseable {
if (workLog.isJobFinished(specification.id))
return;
var fetcher = new HttpFetcher(userAgent.uaString(), dispatcher);
try (var writer = new CrawledDomainWriter(crawlDataDir, specification.domain, specification.id)) {
HttpFetcher fetcher = new HttpFetcher(userAgent.uaString(), dispatcher, connectionPool);
try (CrawledDomainWriter writer = new CrawledDomainWriter(crawlDataDir, specification.domain, specification.id))
{
var retreiver = new CrawlerRetreiver(fetcher, specification, writer);
int size = retreiver.fetch();

View File

@ -0,0 +1,51 @@
package nu.marginalia.wmsa.edge.crawling.retreival;
import javax.net.SocketFactory;
import java.io.IOException;
import java.net.InetAddress;
import java.net.Socket;
/**
 * A {@link SocketFactory} that hands out sockets from the platform default
 * factory with SO_LINGER enabled and a short linger timeout.
 *
 * <p>Every {@code createSocket} overload forwards to the default factory and
 * then applies the linger setting before returning the socket. If applying
 * the setting fails, the freshly created socket is closed before the
 * exception propagates, so a failed call never leaks a socket.
 */
public class FastTerminatingSocketFactory extends SocketFactory {
    private static final SocketFactory delegate = SocketFactory.getDefault();

    /** Linger timeout, in seconds, applied to every socket this factory returns. */
    private static final int LINGER_SECONDS = 3;

    /**
     * Applies the linger setting to {@code sock} and returns it.
     *
     * @param sock the socket obtained from the delegate factory
     * @return the same socket, configured
     * @throws IOException if the option cannot be set; the socket is closed first
     */
    private static Socket configure(Socket sock) throws IOException {
        try {
            // Setting SO_LINGER to enabled but low reduces TIME_WAIT
            // which can get pretty... bad when you're crawling
            // and opening thousands of connections
            sock.setSoLinger(true, LINGER_SECONDS);
            return sock;
        } catch (IOException | RuntimeException e) {
            // The caller never receives a reference to this socket, so it
            // must be released here or it would stay open.
            try {
                sock.close();
            } catch (IOException closeFailure) {
                e.addSuppressed(closeFailure);
            }
            throw e;
        }
    }

    /**
     * Creates an unconnected socket with the linger setting applied.
     *
     * @return a new, configured socket
     * @throws IOException if the socket cannot be created or configured
     */
    @Override
    public Socket createSocket() throws IOException {
        return configure(delegate.createSocket());
    }

    /**
     * Creates a socket via the delegate for the given host and port, then
     * applies the linger setting.
     *
     * @throws IOException if the socket cannot be created or configured
     */
    @Override
    public Socket createSocket(String host, int port) throws IOException {
        return configure(delegate.createSocket(host, port));
    }

    /**
     * Creates a socket via the delegate for the given remote host/port and
     * local address/port, then applies the linger setting.
     *
     * @throws IOException if the socket cannot be created or configured
     */
    @Override
    public Socket createSocket(String host, int port, InetAddress localHost, int localPort) throws IOException {
        return configure(delegate.createSocket(host, port, localHost, localPort));
    }

    /**
     * Creates a socket via the delegate for the given address and port, then
     * applies the linger setting.
     *
     * @throws IOException if the socket cannot be created or configured
     */
    @Override
    public Socket createSocket(InetAddress host, int port) throws IOException {
        return configure(delegate.createSocket(host, port));
    }

    /**
     * Creates a socket via the delegate for the given remote address/port and
     * local address/port, then applies the linger setting.
     *
     * @throws IOException if the socket cannot be created or configured
     */
    @Override
    public Socket createSocket(InetAddress address, int port, InetAddress localAddress, int localPort) throws IOException {
        return configure(delegate.createSocket(address, port, localAddress, localPort));
    }
}

View File

@ -13,10 +13,7 @@ import nu.marginalia.wmsa.edge.crawling.retreival.logic.ContentTypeLogic;
import nu.marginalia.wmsa.edge.crawling.retreival.logic.ContentTypeParser;
import nu.marginalia.wmsa.edge.model.EdgeDomain;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import okhttp3.Dispatcher;
import okhttp3.OkHttpClient;
import okhttp3.Request;
import okhttp3.Response;
import okhttp3.*;
import org.apache.commons.io.input.BOMInputStream;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -66,15 +63,18 @@ public class HttpFetcher {
}
}
private static final FastTerminatingSocketFactory ftSocketFactory = new FastTerminatingSocketFactory();
@SneakyThrows
private OkHttpClient createClient(Dispatcher dispatcher) {
private OkHttpClient createClient(Dispatcher dispatcher, ConnectionPool pool) {
var builder = new OkHttpClient.Builder();
if (dispatcher != null) {
builder.dispatcher(dispatcher);
}
return builder.sslSocketFactory(NoSecuritySSL.buildSocketFactory(), (X509TrustManager) NoSecuritySSL.trustAllCerts[0])
.socketFactory(ftSocketFactory)
.hostnameVerifier(NoSecuritySSL.buildHostnameVerifyer())
.connectionPool(pool)
.cookieJar(cookies.getJar())
.followRedirects(true)
.followSslRedirects(true)
@ -82,6 +82,7 @@ public class HttpFetcher {
.readTimeout(10, TimeUnit.SECONDS)
.writeTimeout(10, TimeUnit.SECONDS)
.build();
}
public List<String> getCookies() {
@ -93,13 +94,13 @@ public class HttpFetcher {
}
@Inject
public HttpFetcher(@Named("user-agent") String userAgent, Dispatcher dispatcher) {
this.client = createClient(dispatcher);
public HttpFetcher(@Named("user-agent") String userAgent, Dispatcher dispatcher, ConnectionPool connectionPool) {
this.client = createClient(dispatcher, connectionPool);
this.userAgent = userAgent;
}
public HttpFetcher(@Named("user-agent") String userAgent) {
this.client = createClient(null);
this.client = createClient(null, new ConnectionPool());
this.userAgent = userAgent;
}
@ -262,8 +263,6 @@ public class HttpFetcher {
}
public SimpleRobotRules fetchRobotRules(EdgeDomain domain) {
return fetchRobotsForProto("https", domain)
.or(() -> fetchRobotsForProto("http", domain))
@ -286,4 +285,5 @@ public class HttpFetcher {
doc.contentType,
userAgent);
}
}