(convert) Add sizeHints to legacy serializable crawl data stream

This reduces the maximum memory usage when processing legacy crawl data.
This commit is contained in:
Viktor Lofgren 2024-01-13 15:50:36 +01:00
parent ecd9c35233
commit 176b9c9666

View File

@ -26,6 +26,8 @@ public class CompatibleLegacySerializableCrawlDataStream implements AutoCloseabl
private SerializableCrawlData next;
private final Path path;
private int sizeHint;
public CompatibleLegacySerializableCrawlDataStream(Gson gson, File file) throws IOException {
this.gson = gson;
path = file.toPath();
@ -34,10 +36,15 @@ public class CompatibleLegacySerializableCrawlDataStream implements AutoCloseabl
bufferedReader = new BufferedReader(new InputStreamReader(new ZstdInputStream(new FileInputStream(file), RecyclingBufferPool.INSTANCE)));
}
/**
 * Returns the current value of the {@code sizeHint} field.
 *
 * <p>The field is incremented once per loop iteration in {@code findDomain}
 * while that method scans the file for the domain record.
 * NOTE(review): presumably this serves as an approximate record count that
 * callers use to size their buffers — confirm against the overridden declaration.
 *
 * @return the value accumulated in {@code sizeHint}
 */
@Override
public int sizeHint() {
return sizeHint;
}
/** Scan through the file and find the domain record */
private CrawledDomain findDomain(File file) throws IOException {
try (var br = new BufferedReader(new InputStreamReader(new ZstdInputStream(new FileInputStream(file), RecyclingBufferPool.INSTANCE)))) {
for (;;) {
for (;;sizeHint++) {
String identifierLine =
requireNonNull(br.readLine(), "No identifier line found");
String dataLine =