(crawler) Clean up and refactor the code a bit

Viktor Lofgren 2023-07-23 19:06:37 +02:00
parent 69f333c0bf
commit 35b29e4f9e
3 changed files with 6 additions and 5 deletions

@@ -4,6 +4,7 @@ import com.github.luben.zstd.ZstdOutputStream;
 import com.google.gson.Gson;
 import lombok.SneakyThrows;
 import nu.marginalia.crawling.model.SerializableCrawlData;
+import nu.marginalia.crawling.model.spec.CrawlingSpecification;
 import nu.marginalia.model.gson.GsonFactory;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -25,15 +26,15 @@ public class CrawledDomainWriter implements AutoCloseable {
     private final Path tmpFile;
     private final Path outputFile;
 
-    public CrawledDomainWriter(Path outputDir, String name, String id) throws IOException {
+    public CrawledDomainWriter(Path outputDir, CrawlingSpecification spec) throws IOException {
         this.outputDir = outputDir;
 
         if (!Files.isDirectory(outputDir)) {
             throw new IllegalArgumentException("Output dir " + outputDir + " does not exist");
         }
 
-        tmpFile = getOutputFile(id, name + "_tmp");
-        outputFile = getOutputFile(id, name);
+        tmpFile = getOutputFile(spec.id, spec.domain + "_tmp");
+        outputFile = getOutputFile(spec.id, spec.domain);
         writer = new OutputStreamWriter(new ZstdOutputStream(new BufferedOutputStream(Files.newOutputStream(tmpFile,
                 StandardOpenOption.CREATE, StandardOpenOption.WRITE, StandardOpenOption.TRUNCATE_EXISTING))));
     }
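
A minimal usage sketch of the new constructor signature, for illustration only: the CrawlingSpecification import, the spec.id / spec.domain reads, and writer::accept come from the diff, while the no-argument constructor, the writable fields, the CrawledDomainWriter package name, and the concrete values are assumptions.

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;

import nu.marginalia.crawling.io.CrawledDomainWriter;           // package name assumed
import nu.marginalia.crawling.model.spec.CrawlingSpecification; // import as in the diff

class CrawledDomainWriterUsageSketch {
    public static void main(String[] args) throws IOException {
        // Hypothetical spec; only the id/domain reads are confirmed by the diff.
        var spec = new CrawlingSpecification();   // no-arg constructor assumed
        spec.id = "000000";                       // placeholder crawl id
        spec.domain = "www.example.com";          // placeholder domain

        Path outputDir = Files.createTempDirectory("crawl-data");

        // The writer now derives its tmp/output file names from the spec,
        // instead of taking separate (name, id) String arguments.
        try (var writer = new CrawledDomainWriter(outputDir, spec)) {
            // SerializableCrawlData records would be fed to writer.accept(...) here.
        }
    }
}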

@@ -199,7 +199,7 @@ public class CrawlerMain implements AutoCloseable {
         HttpFetcher fetcher = new HttpFetcherImpl(userAgent.uaString(), dispatcher, connectionPool);
 
-        try (CrawledDomainWriter writer = new CrawledDomainWriter(crawlDataDir, specification.domain, specification.id)) {
+        try (CrawledDomainWriter writer = new CrawledDomainWriter(crawlDataDir, specification)) {
             var retreiver = new CrawlerRetreiver(fetcher, specification, writer::accept);
             CrawlDataReference reference = getReference(specification);
 
@@ -118,7 +118,7 @@ class CrawlerRetreiverTest {
         Path out = Files.createTempDirectory("crawling-process");
 
-        var writer = new CrawledDomainWriter(out, specs.domain, specs.id);
+        var writer = new CrawledDomainWriter(out, specs);
         Map<Class<? extends SerializableCrawlData>, List<SerializableCrawlData>> data = new HashMap<>();
 
        new CrawlerRetreiver(httpFetcher, specs, d -> {