(crawler) Clean up and refactor the code a bit
This commit is contained in:
parent
69f333c0bf
commit
35b29e4f9e
@@ -4,6 +4,7 @@ import com.github.luben.zstd.ZstdOutputStream;
|
||||
import com.google.gson.Gson;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.crawling.model.SerializableCrawlData;
|
||||
+import nu.marginalia.crawling.model.spec.CrawlingSpecification;
|
||||
import nu.marginalia.model.gson.GsonFactory;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
@@ -25,15 +26,15 @@ public class CrawledDomainWriter implements AutoCloseable {
|
||||
private final Path tmpFile;
|
||||
private final Path outputFile;
|
||||
|
||||
-public CrawledDomainWriter(Path outputDir, String name, String id) throws IOException {
|
||||
+public CrawledDomainWriter(Path outputDir, CrawlingSpecification spec) throws IOException {
|
||||
this.outputDir = outputDir;
|
||||
|
||||
if (!Files.isDirectory(outputDir)) {
|
||||
throw new IllegalArgumentException("Output dir " + outputDir + " does not exist");
|
||||
}
|
||||
|
||||
-tmpFile = getOutputFile(id, name + "_tmp");
|
||||
-outputFile = getOutputFile(id, name);
|
||||
+tmpFile = getOutputFile(spec.id, spec.domain + "_tmp");
|
||||
+outputFile = getOutputFile(spec.id, spec.domain);
|
||||
writer = new OutputStreamWriter(new ZstdOutputStream(new BufferedOutputStream(Files.newOutputStream(tmpFile,
|
||||
StandardOpenOption.CREATE, StandardOpenOption.WRITE, StandardOpenOption.TRUNCATE_EXISTING))));
|
||||
}
|
||||
|
@@ -199,7 +199,7 @@ public class CrawlerMain implements AutoCloseable {
|
||||
HttpFetcher fetcher = new HttpFetcherImpl(userAgent.uaString(), dispatcher, connectionPool);
|
||||
|
||||
|
||||
-try (CrawledDomainWriter writer = new CrawledDomainWriter(crawlDataDir, specification.domain, specification.id)) {
|
||||
+try (CrawledDomainWriter writer = new CrawledDomainWriter(crawlDataDir, specification)) {
|
||||
var retreiver = new CrawlerRetreiver(fetcher, specification, writer::accept);
|
||||
|
||||
CrawlDataReference reference = getReference(specification);
|
||||
|
@@ -118,7 +118,7 @@ class CrawlerRetreiverTest {
|
||||
|
||||
|
||||
Path out = Files.createTempDirectory("crawling-process");
|
||||
-var writer = new CrawledDomainWriter(out, specs.domain, specs.id);
|
||||
+var writer = new CrawledDomainWriter(out, specs);
|
||||
Map<Class<? extends SerializableCrawlData>, List<SerializableCrawlData>> data = new HashMap<>();
|
||||
|
||||
new CrawlerRetreiver(httpFetcher, specs, d -> {
|
||||
|
Loading…
Reference in New Issue
Block a user