(crawler) Even more memory optimizations.

* Fix minor resource leak in zstd streams
* Use pools for zstd streams
* Reduce the SSL session cache size
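The common thread in the zstd changes below is passing RecyclingBufferPool.INSTANCE as the second constructor argument to zstd-jni's streams, so each stream borrows its internal buffers from a shared pool (and returns them on close) instead of allocating fresh ones per stream. A minimal sketch of the read side, with a hypothetical file name:

import com.github.luben.zstd.RecyclingBufferPool;
import com.github.luben.zstd.ZstdInputStream;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;

class PooledZstdReadSketch {
    public static void main(String[] args) throws IOException {
        // The pool argument makes the stream borrow its decompression buffers
        // from a shared pool; they are handed back when the stream is closed.
        try (var br = new BufferedReader(new InputStreamReader(
                new ZstdInputStream(new FileInputStream("crawl-data.zst"),
                                    RecyclingBufferPool.INSTANCE)))) {
            String line;
            while ((line = br.readLine()) != null) {
                System.out.println(line);
            }
        }
    }
}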
Author: Viktor Lofgren
Date:   2023-07-30 14:19:55 +02:00
Commit: 730e8f74e4 (parent: aba134284f)

9 changed files with 55 additions and 19 deletions

File: CrawledDomainReader.java

@@ -1,5 +1,6 @@
 package nu.marginalia.crawling.io;
 
+import com.github.luben.zstd.RecyclingBufferPool;
 import com.github.luben.zstd.ZstdInputStream;
 import com.google.gson.Gson;
 import nu.marginalia.crawling.model.CrawledDocument;
@@ -37,7 +38,7 @@ public class CrawledDomainReader {
     public CrawledDomain read(Path path) throws IOException {
         DomainDataAssembler domainData = new DomainDataAssembler();
 
-        try (var br = new BufferedReader(new InputStreamReader(new ZstdInputStream(new FileInputStream(path.toFile()))))) {
+        try (var br = new BufferedReader(new InputStreamReader(new ZstdInputStream(new FileInputStream(path.toFile()), RecyclingBufferPool.INSTANCE)))) {
            String line;
            while ((line = br.readLine()) != null) {
                if (line.startsWith("//")) {
@@ -105,7 +106,7 @@ public class CrawledDomainReader {
        public FileReadingSerializableCrawlDataStream(Gson gson, File file) throws IOException {
            this.gson = gson;
-           bufferedReader = new BufferedReader(new InputStreamReader(new ZstdInputStream(new FileInputStream(file))));
+           bufferedReader = new BufferedReader(new InputStreamReader(new ZstdInputStream(new FileInputStream(file), RecyclingBufferPool.INSTANCE)));
        }
 
        @Override
@@ -124,9 +125,15 @@ public class CrawledDomainReader {
                return true;
 
            String identifier = bufferedReader.readLine();
-           if (identifier == null) return false;
+           if (identifier == null) {
+               bufferedReader.close();
+               return false;
+           }
            String data = bufferedReader.readLine();
-           if (data == null) return false;
+           if (data == null) {
+               bufferedReader.close();
+               return false;
+           }
 
            if (identifier.equals(CrawledDomain.SERIAL_IDENTIFIER)) {
                next = gson.fromJson(data, CrawledDomain.class);
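Note how the reader above now closes itself the moment EOF is observed rather than waiting for the caller's close(); with pooled buffers this is what actually returns the buffers to the pool. A stripped-down sketch of the same idiom, with hypothetical names:

import java.io.BufferedReader;
import java.io.IOException;

class EagerClosingStream implements AutoCloseable {
    private final BufferedReader reader;

    EagerClosingStream(BufferedReader reader) {
        this.reader = reader;
    }

    /** Returns the next line, closing the reader eagerly on EOF so
     *  pooled buffers are recycled even if the caller forgets close(). */
    String next() throws IOException {
        String line = reader.readLine();
        if (line == null) {
            reader.close(); // return the buffers to the pool immediately
        }
        return line;
    }

    @Override
    public void close() throws IOException {
        reader.close(); // idempotent; safe after an eager EOF close
    }
}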

File: CrawledDomainWriter.java

@@ -1,5 +1,6 @@
 package nu.marginalia.crawling.io;
 
+import com.github.luben.zstd.RecyclingBufferPool;
 import com.github.luben.zstd.ZstdOutputStream;
 import com.google.gson.Gson;
 import lombok.SneakyThrows;
@@ -38,7 +39,8 @@ public class CrawledDomainWriter implements AutoCloseable {
        tmpFile = getOutputFile(spec.id, spec.domain + "_tmp");
        actualFile = getOutputFile(spec.id, spec.domain);
        writer = new OutputStreamWriter(new ZstdOutputStream(new BufferedOutputStream(Files.newOutputStream(tmpFile,
-               StandardOpenOption.CREATE, StandardOpenOption.WRITE, StandardOpenOption.TRUNCATE_EXISTING))));
+               StandardOpenOption.CREATE, StandardOpenOption.WRITE, StandardOpenOption.TRUNCATE_EXISTING)),
+               RecyclingBufferPool.INSTANCE));
    }
 
    public Path getOutputFile() {
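The write side is symmetrical: ZstdOutputStream also accepts a buffer pool. A self-contained sketch under the same assumption, with a hypothetical output path:

import com.github.luben.zstd.RecyclingBufferPool;
import com.github.luben.zstd.ZstdOutputStream;

import java.io.BufferedOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;

class PooledZstdWriteSketch {
    public static void main(String[] args) throws IOException {
        Path out = Path.of("crawl-data.zst");
        // Compression buffers come from the shared pool and go back on close.
        try (var writer = new OutputStreamWriter(new ZstdOutputStream(
                new BufferedOutputStream(Files.newOutputStream(out,
                        StandardOpenOption.CREATE, StandardOpenOption.WRITE,
                        StandardOpenOption.TRUNCATE_EXISTING)),
                RecyclingBufferPool.INSTANCE))) {
            writer.write("hello\n");
        }
    }
}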

File: SerializableCrawlDataStream.java

@@ -8,7 +8,7 @@ import java.util.Iterator;
 
 /** Closable iterator over serialized crawl data
  * The data may appear in any order, and the iterator must be closed.
  * */
-public interface SerializableCrawlDataStream {
+public interface SerializableCrawlDataStream extends AutoCloseable {
     static SerializableCrawlDataStream empty() {
         return new SerializableCrawlDataStream() {
@@ -20,6 +20,8 @@ public interface SerializableCrawlDataStream {
            public boolean hasNext() throws IOException {
                return false;
            }
+
+           public void close() {}
        };
    }
@@ -35,6 +37,8 @@ public interface SerializableCrawlDataStream {
            public boolean hasNext() throws IOException {
                return iterator.hasNext();
            }
+
+           public void close() {}
        };
    }

File: CrawlerSpecificationLoader.java

@@ -1,5 +1,6 @@
 package nu.marginalia.crawling.model.spec;
 
+import com.github.luben.zstd.RecyclingBufferPool;
 import com.github.luben.zstd.ZstdInputStream;
 import com.google.gson.Gson;
 import com.google.gson.JsonStreamParser;
@@ -17,7 +18,8 @@ public class CrawlerSpecificationLoader {
    @SneakyThrows
    public static Iterable<CrawlingSpecification> asIterable(Path inputSpec) {
-       var inputStream = new BufferedReader(new InputStreamReader(new ZstdInputStream(new FileInputStream(inputSpec.toFile()))));
+       var inputStream = new BufferedReader(new InputStreamReader(new ZstdInputStream(new FileInputStream(inputSpec.toFile()),
+               RecyclingBufferPool.INSTANCE)));
        var parser = new JsonStreamParser(inputStream);
 
        return () -> new Iterator<>() {
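Gson's JsonStreamParser is what lets asIterable expose a file of concatenated JSON documents as a sequence: it yields one JsonElement per document from a single reader. A minimal sketch with inline strings instead of a compressed file (the Spec class here is a hypothetical stand-in):

import com.google.gson.Gson;
import com.google.gson.JsonStreamParser;

class JsonStreamSketch {
    // Hypothetical shape of one spec document.
    static class Spec { String id; String domain; }

    public static void main(String[] args) {
        var gson = new Gson();
        var parser = new JsonStreamParser(
                "{\"id\":\"a\",\"domain\":\"one.example\"}" +
                "{\"id\":\"b\",\"domain\":\"two.example\"}");

        // hasNext()/next() step through the concatenated documents one by one.
        while (parser.hasNext()) {
            Spec spec = gson.fromJson(parser.next(), Spec.class);
            System.out.println(spec.id + " -> " + spec.domain);
        }
    }
}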

File: ConversionLog.java

@@ -1,5 +1,6 @@
 package nu.marginalia.converting;
 
+import com.github.luben.zstd.RecyclingBufferPool;
 import com.github.luben.zstd.ZstdOutputStream;
 import nu.marginalia.model.crawl.DomainIndexingState;
 import nu.marginalia.model.idx.DocumentMetadata;
@@ -27,8 +28,7 @@ public class ConversionLog implements AutoCloseable, Interpreter {
        String fileName = String.format("conversion-log-%s.zstd", LocalDateTime.now().toEpochSecond(ZoneOffset.UTC));
        Path logFile = rootDir.resolve(fileName);
 
-       writer = new PrintWriter(new ZstdOutputStream(
-               new BufferedOutputStream(Files.newOutputStream(logFile, StandardOpenOption.WRITE, StandardOpenOption.CREATE))));
+       writer = new PrintWriter(new ZstdOutputStream(new BufferedOutputStream(Files.newOutputStream(logFile, StandardOpenOption.WRITE, StandardOpenOption.CREATE)), RecyclingBufferPool.INSTANCE));
    }
 
    @Override

File: CrawlerMain.java

@@ -55,7 +55,7 @@ public class CrawlerMain implements AutoCloseable {
    private final Gson gson;
    private final DumbThreadPool pool;
 
-   private final Set<String> processedIds = new HashSet<>();
+   private final Set<String> processingIds = new HashSet<>();
 
    final AbortMonitor abortMonitor = AbortMonitor.getInstance();
@@ -92,6 +92,9 @@ public class CrawlerMain implements AutoCloseable {
        System.setProperty("sun.net.client.defaultConnectTimeout", "30000");
        System.setProperty("sun.net.client.defaultReadTimeout", "30000");
 
+       // We don't want to use too much memory caching sessions for https
+       System.setProperty("javax.net.ssl.sessionCacheSize", "2048");
+
        Injector injector = Guice.createInjector(
                new CrawlerModule(),
                new DatabaseModule()
@@ -154,7 +157,7 @@ public class CrawlerMain implements AutoCloseable {
    private void startCrawlTask(CrawlPlan plan, CrawlingSpecification crawlingSpecification) {
 
-       if (!processedIds.add(crawlingSpecification.id)) {
+       if (workLog.isJobFinished(crawlingSpecification.id) || !processingIds.add(crawlingSpecification.id)) {
            // This is a duplicate id, so we ignore it. Otherwise we'd end up crawling the same site twice,
            // and if we're really unlucky, we might end up writing to the same output file from multiple
@@ -193,11 +196,10 @@ public class CrawlerMain implements AutoCloseable {
        HttpFetcher fetcher = new HttpFetcherImpl(userAgent.uaString(), dispatcher, connectionPool);
 
-       try (CrawledDomainWriter writer = new CrawledDomainWriter(crawlDataDir, specification)) {
+       try (CrawledDomainWriter writer = new CrawledDomainWriter(crawlDataDir, specification);
+            CrawlDataReference reference = getReference(specification))
+       {
            var retreiver = new CrawlerRetreiver(fetcher, specification, writer::accept);
-           CrawlDataReference reference = getReference(specification);
 
            int size = retreiver.fetch(reference);
 
            workLog.setJobToFinished(specification.id, writer.getOutputFile().toString(), size);
@@ -206,6 +208,10 @@ public class CrawlerMain implements AutoCloseable {
        } catch (Exception e) {
            logger.error("Error fetching domain " + specification.domain, e);
        }
+       finally {
+           // We don't need to double-count these; it's also kept in the workLog
+           processingIds.remove(specification.id);
+       }
    }
 
    private CrawlDataReference getReference(CrawlingSpecification specification) {
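The id bookkeeping above separates "already finished" (durable, in the work log) from "currently in flight" (the in-memory processingIds set), with Set.add acting as the atomic claim and the finally block releasing it. A stripped-down sketch of the scheme, with hypothetical names:

import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;

class DedupGuardSketch {
    private final Set<String> processing = ConcurrentHashMap.newKeySet();

    interface WorkLog { boolean isJobFinished(String id); }

    void maybeRun(WorkLog workLog, String id, Runnable task) {
        // Skip ids that already finished, and ids another thread is
        // currently working on; Set.add is the atomic claim.
        if (workLog.isJobFinished(id) || !processing.add(id)) {
            return;
        }
        try {
            task.run();
        } finally {
            // The durable record lives in the work log, so the in-memory
            // claim can be released once the task completes.
            processing.remove(id);
        }
    }
}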

File: CrawlDataReference.java

@@ -10,7 +10,7 @@ import javax.annotation.Nullable;
 import java.io.IOException;
 
 /** A reference to a domain that has been crawled before. */
-public class CrawlDataReference {
+public class CrawlDataReference implements AutoCloseable {
 
     private final SerializableCrawlDataStream data;
@@ -75,4 +75,8 @@ public class CrawlDataReference {
        return hashFunction.hashInt(v).asInt();
    }
 
+   @Override
+   public void close() throws Exception {
+       data.close();
+   }
 }

File: NoSecuritySSL.java

@@ -33,7 +33,17 @@ public class NoSecuritySSL {
        // Install the all-trusting trust manager
        final SSLContext sslContext = SSLContext.getInstance("SSL");
        sslContext.init(null, trustAllCerts, new java.security.SecureRandom());
-       // Create an ssl socket factory with our all-trusting manager
+
+       var clientSessionContext = sslContext.getClientSessionContext();
+
+       System.out.println("Default session cache size: " + clientSessionContext.getSessionCacheSize());
+       System.out.println("Session timeout: " + clientSessionContext.getSessionTimeout());
+
+       // The default value for this is very high and will use a crapload of memory
+       // since the crawler will be making a lot of requests to various hosts
+       clientSessionContext.setSessionCacheSize(2048);
+
+       // Create a ssl socket factory with our all-trusting manager
        return sslContext.getSocketFactory();
    }
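Capping the cache is plain JSSE: the SSLContext exposes a client-side SSLSessionContext whose cache size and timeout are settable. A standalone sketch using the default context rather than the crawler's trust-all one (the 2048 figure is from this commit):

import javax.net.ssl.SSLContext;
import javax.net.ssl.SSLSessionContext;

class SessionCacheSketch {
    public static void main(String[] args) throws Exception {
        SSLContext ctx = SSLContext.getDefault();
        SSLSessionContext sessions = ctx.getClientSessionContext();

        // The default can be very large (or effectively unbounded on some
        // JDKs), so a crawler talking to thousands of hosts accumulates
        // a lot of cached session state.
        System.out.println("cache size: " + sessions.getSessionCacheSize());
        System.out.println("timeout (s): " + sessions.getSessionTimeout());

        // Bound the cache so sessions don't pile up in memory.
        sessions.setSessionCacheSize(2048);
    }
}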

File: ConvertedDomainReader.java

@@ -1,5 +1,6 @@
 package nu.marginalia.loading;
 
+import com.github.luben.zstd.RecyclingBufferPool;
 import com.github.luben.zstd.ZstdInputStream;
 import com.google.gson.Gson;
 import lombok.SneakyThrows;
@@ -26,7 +27,7 @@ public class ConvertedDomainReader {
    public List<Instruction> read(Path path, int cntHint) throws IOException {
        List<Instruction> ret = new ArrayList<>(cntHint);
 
-       try (var or = new ObjectInputStream(new ZstdInputStream(new FileInputStream(path.toFile())))) {
+       try (var or = new ObjectInputStream(new ZstdInputStream(new FileInputStream(path.toFile()), RecyclingBufferPool.INSTANCE))) {
            var object = or.readObject();
            if (object instanceof Instruction is) {
                ret.add(is);
@@ -39,7 +40,7 @@ public class ConvertedDomainReader {
    }
 
    public Iterator<Instruction> createIterator(Path path) throws IOException {
-       var or = new ObjectInputStream(new ZstdInputStream(new BufferedInputStream(new FileInputStream(path.toFile()))));
+       var or = new ObjectInputStream(new ZstdInputStream(new BufferedInputStream(new FileInputStream(path.toFile())), RecyclingBufferPool.INSTANCE));
 
        return new Iterator<>() {
            Instruction next;