diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlerMain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlerMain.java index cc79d9b0..dbb2dcfc 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlerMain.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlerMain.java @@ -8,6 +8,7 @@ import nu.marginalia.wmsa.edge.crawling.model.CrawlingSpecification; import nu.marginalia.wmsa.edge.crawling.retreival.CrawlerRetreiver; import nu.marginalia.wmsa.edge.crawling.retreival.HttpFetcher; import nu.marginalia.wmsa.edge.model.EdgeCrawlPlan; +import okhttp3.ConnectionPool; import okhttp3.Dispatcher; import okhttp3.internal.Util; import org.slf4j.Logger; @@ -25,6 +26,8 @@ public class CrawlerMain implements AutoCloseable { private final WorkLog workLog; + private final ConnectionPool connectionPool = new ConnectionPool(5, 10, TimeUnit.SECONDS); + private final Dispatcher dispatcher = new Dispatcher(new ThreadPoolExecutor(0, Integer.MAX_VALUE, 5, TimeUnit.SECONDS, new SynchronousQueue<>(), Util.threadFactory("OkHttp Dispatcher", true))); @@ -67,9 +70,10 @@ public class CrawlerMain implements AutoCloseable { if (workLog.isJobFinished(specification.id)) return; - var fetcher = new HttpFetcher(userAgent.uaString(), dispatcher); - try (var writer = new CrawledDomainWriter(crawlDataDir, specification.domain, specification.id)) { + HttpFetcher fetcher = new HttpFetcher(userAgent.uaString(), dispatcher, connectionPool); + try (CrawledDomainWriter writer = new CrawledDomainWriter(crawlDataDir, specification.domain, specification.id)) + { var retreiver = new CrawlerRetreiver(fetcher, specification, writer); int size = retreiver.fetch(); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/FastTerminatingSocketFactory.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/FastTerminatingSocketFactory.java new file mode 100644 index 
00000000..f9a889b1 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/FastTerminatingSocketFactory.java @@ -0,0 +1,51 @@ +package nu.marginalia.wmsa.edge.crawling.retreival; + +import javax.net.SocketFactory; +import java.io.IOException; +import java.net.InetAddress; +import java.net.Socket; + +public class FastTerminatingSocketFactory extends SocketFactory { /* SocketFactory that hands every createSocket call to the JVM default factory and then enables SO_LINGER with a linger value of 3 on the socket before returning it. */ + private static final SocketFactory delegate = SocketFactory.getDefault(); /* shared default factory that actually creates the sockets */ + + private void configure(Socket sock) throws IOException { /* applied to every socket this factory returns; propagates IOException from the socket option call */ + // Setting SO_LINGER to enabled but low reduces TIME_WAIT + // which can get pretty... bad when you're crawling + // and opening thousands of connections + sock.setSoLinger(true, 3); /* NOTE(review): the linger value is in seconds, and a non-zero linger makes close() block for up to that long - confirm this has the intended effect on TIME_WAIT */ + } + + public Socket createSocket() throws IOException { /* creates an unconnected socket; NOTE(review): this matches SocketFactory.createSocket() but carries no @Override, unlike the overloads below */ + var sock = delegate.createSocket(); + configure(sock); + return sock; + } + + @Override + public Socket createSocket(String host, int port) throws IOException { /* host name and port variant; same delegate-then-configure sequence */ + var sock = delegate.createSocket(host, port); + configure(sock); + return sock; + } + + @Override + public Socket createSocket(String host, int port, InetAddress localHost, int localPort) throws IOException { /* host name variant that also forwards the local address and port to the delegate */ + var sock = delegate.createSocket(host, port, localHost, localPort); + configure(sock); + return sock; + } + + @Override + public Socket createSocket(InetAddress host, int port) throws IOException { /* InetAddress and port variant; same delegate-then-configure sequence */ + var sock = delegate.createSocket(host, port); + configure(sock); + return sock; + } + + @Override + public Socket createSocket(InetAddress address, int port, InetAddress localAddress, int localPort) throws IOException { /* InetAddress variant that also forwards the local address and port to the delegate */ + var sock = delegate.createSocket(address, port, localAddress, localPort); + configure(sock); + return sock; + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/HttpFetcher.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/HttpFetcher.java index 371548d5..e7ec01b2 100644 --- 
a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/HttpFetcher.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/HttpFetcher.java @@ -13,10 +13,7 @@ import nu.marginalia.wmsa.edge.crawling.retreival.logic.ContentTypeLogic; import nu.marginalia.wmsa.edge.crawling.retreival.logic.ContentTypeParser; import nu.marginalia.wmsa.edge.model.EdgeDomain; import nu.marginalia.wmsa.edge.model.EdgeUrl; -import okhttp3.Dispatcher; -import okhttp3.OkHttpClient; -import okhttp3.Request; -import okhttp3.Response; +import okhttp3.*; import org.apache.commons.io.input.BOMInputStream; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -66,15 +63,18 @@ public class HttpFetcher { } } + private static final FastTerminatingSocketFactory ftSocketFactory = new FastTerminatingSocketFactory(); @SneakyThrows - private OkHttpClient createClient(Dispatcher dispatcher) { + private OkHttpClient createClient(Dispatcher dispatcher, ConnectionPool pool) { var builder = new OkHttpClient.Builder(); if (dispatcher != null) { builder.dispatcher(dispatcher); } return builder.sslSocketFactory(NoSecuritySSL.buildSocketFactory(), (X509TrustManager) NoSecuritySSL.trustAllCerts[0]) + .socketFactory(ftSocketFactory) .hostnameVerifier(NoSecuritySSL.buildHostnameVerifyer()) + .connectionPool(pool) .cookieJar(cookies.getJar()) .followRedirects(true) .followSslRedirects(true) @@ -82,6 +82,7 @@ public class HttpFetcher { .readTimeout(10, TimeUnit.SECONDS) .writeTimeout(10, TimeUnit.SECONDS) .build(); + } public List getCookies() { @@ -93,13 +94,13 @@ public class HttpFetcher { } @Inject - public HttpFetcher(@Named("user-agent") String userAgent, Dispatcher dispatcher) { - this.client = createClient(dispatcher); + public HttpFetcher(@Named("user-agent") String userAgent, Dispatcher dispatcher, ConnectionPool connectionPool) { + this.client = createClient(dispatcher, connectionPool); this.userAgent = userAgent; } public HttpFetcher(@Named("user-agent") 
String userAgent) { - this.client = createClient(null); + this.client = createClient(null, new ConnectionPool()); this.userAgent = userAgent; } @@ -262,8 +263,6 @@ public class HttpFetcher { } - - public SimpleRobotRules fetchRobotRules(EdgeDomain domain) { return fetchRobotsForProto("https", domain) .or(() -> fetchRobotsForProto("http", domain)) @@ -286,4 +285,5 @@ public class HttpFetcher { doc.contentType, userAgent); } + }