From 91c7960800e66609c38a4b40c3accc562d0e5d14 Mon Sep 17 00:00:00 2001
From: Viktor Lofgren
Date: Sat, 20 Jan 2024 10:36:04 +0100
Subject: [PATCH] (crawler) Extract additional configuration properties

This commit extracts several previously hardcoded configuration properties,
and makes them available through system.properties.

The documentation is updated to reflect the change.  Dead code was also
removed in the process.

CrawlSpecGenerator still feels a bit over-engineered, since it was built for
a more general case than remains now that all implementations but the current
one have been removed; we'll leave it as it is for now, as it's still fairly
readable.
---
 .../crawlspec/CrawlSpecGenerator.java         | 60 +------------------
 .../crawl/spec/DbCrawlSpecProvider.java       |  6 +-
 .../actor/task/CrawlJobExtractorActor.java    | 22 ++++---
 doc/system-properties.md                      | 17 ++++--
 4 files changed, 32 insertions(+), 73 deletions(-)

diff --git a/code/process-models/crawl-spec/src/main/java/nu/marginalia/crawlspec/CrawlSpecGenerator.java b/code/process-models/crawl-spec/src/main/java/nu/marginalia/crawlspec/CrawlSpecGenerator.java
index a8140232..abd831e5 100644
--- a/code/process-models/crawl-spec/src/main/java/nu/marginalia/crawlspec/CrawlSpecGenerator.java
+++ b/code/process-models/crawl-spec/src/main/java/nu/marginalia/crawlspec/CrawlSpecGenerator.java
@@ -1,6 +1,5 @@
 package nu.marginalia.crawlspec;
 
-import nu.marginalia.db.DbDomainStatsExportMultitool;
 import nu.marginalia.io.crawlspec.CrawlSpecRecordParquetFileWriter;
 import nu.marginalia.model.crawlspec.CrawlSpecRecord;
 
@@ -8,12 +7,9 @@ import java.io.IOException;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.sql.SQLException;
-import java.util.ArrayList;
 import java.util.List;
 
 public class CrawlSpecGenerator {
-    private static final int MIN_VISIT_COUNT = 200;
-    private static final int MAX_VISIT_COUNT = 100000;
 
     public static void generateCrawlSpec(Path output,
                                          DomainSource domains,
@@ -28,9 +24,7 @@ public class CrawlSpecGenerator {
 
                 writer.write(CrawlSpecRecord
                         .builder()
-                        .crawlDepth(calculateCrawlDepthFromVisitedCount(
-                                counts.getKnownUrlCount(domain)
-                        ))
+                        .crawlDepth(counts.getKnownUrlCount(domain))
                         .urls(listSource.getKnownUrls(domain))
                         .domain(domain)
                         .build());
@@ -38,48 +32,9 @@ public class CrawlSpecGenerator {
         }
     }
 
-    private static int calculateCrawlDepthFromVisitedCount(int count) {
-        if (count < MIN_VISIT_COUNT / 2) {
-            /* If we aren't getting very many good documents
-               out of this webpage on previous attempts, we
-               won't dig very deeply this time.  This saves time
-               and resources for both the crawler and the server,
-               and also prevents deep crawls on foreign websites we aren't
-               interested in crawling at this point. */
-            count = MIN_VISIT_COUNT;
-        }
-        else {
-            /* If we got many results previously, we'll
-               dig deeper with each successive crawl. */
-            count = count + 1000 + count / 4;
-        }
-
-        if (count > MAX_VISIT_COUNT) {
-            count = MAX_VISIT_COUNT;
-        }
-
-        return count;
-    }
-
     public interface DomainSource {
         List<String> getDomainNames() throws IOException, SQLException;
 
-        static DomainSource combined(DomainSource... sources) {
-            if (sources.length == 0) {
-                return List::of;
-            }
-
-            return () -> {
-                List<String> combined = new ArrayList<>(sources[0].getDomainNames());
-
-                for (int i = 1; i < sources.length; i++) {
-                    combined.addAll(sources[i].getDomainNames());
-                }
-
-                return combined;
-            };
-        }
-
         static DomainSource fromFile(Path file) {
             return () -> {
                 var lines = Files.readAllLines(file);
@@ -93,13 +48,6 @@ public class CrawlSpecGenerator {
             };
         }
 
-        static DomainSource knownUrlsFromDb(DbDomainStatsExportMultitool dbData) {
-            return dbData::getAllIndexedDomains;
-        }
-
-        static DomainSource fromCrawlQueue(DbDomainStatsExportMultitool dbData) {
-            return dbData::getCrawlQueueDomains;
-        }
     }
 
     public interface KnownUrlsCountSource {
@@ -108,12 +56,6 @@ public class CrawlSpecGenerator {
         static KnownUrlsCountSource fixed(int value) {
             return domainName -> value;
         }
-
-        static KnownUrlsCountSource fromDb(DbDomainStatsExportMultitool dbData, int defaultValue) {
-            return domainName ->
-                    dbData.getVisitedUrls(domainName)
-                            .orElse(defaultValue);
-        }
     }
 
     public interface KnownUrlsListSource {
diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/spec/DbCrawlSpecProvider.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/spec/DbCrawlSpecProvider.java
index c0dfda01..756b4dd8 100644
--- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/spec/DbCrawlSpecProvider.java
+++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/spec/DbCrawlSpecProvider.java
@@ -20,6 +20,10 @@ public class DbCrawlSpecProvider implements CrawlSpecProvider {
 
     private static final Logger logger = LoggerFactory.getLogger(DbCrawlSpecProvider.class);
 
+    private static final double URL_GROWTH_FACTOR = Double.parseDouble(System.getProperty("crawler.crawlSetGrowthFactor", "1.25"));
+    private static final int MIN_URLS_PER_DOMAIN = Integer.getInteger("crawler.minUrlsPerDomain", 100);
+    private static final int MAX_URLS_PER_DOMAIN = Integer.getInteger("crawler.maxUrlsPerDomain", 10_000);
+
     @Inject
     public DbCrawlSpecProvider(HikariDataSource dataSource,
                                ProcessConfiguration processConfiguration
@@ -48,7 +52,7 @@ public class DbCrawlSpecProvider implements CrawlSpecProvider {
             while (rs.next()) {
                 domains.add(new CrawlSpecRecord(
                         rs.getString(1),
-                        Math.clamp((int) (1.25 * rs.getInt(2)), 250, 10_000),
+                        Math.clamp((int) (URL_GROWTH_FACTOR * rs.getInt(2)), MIN_URLS_PER_DOMAIN, MAX_URLS_PER_DOMAIN),
                         List.of()
                 ));
             }
diff --git a/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/CrawlJobExtractorActor.java b/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/CrawlJobExtractorActor.java
index b3e5ac6e..faba7b05 100644
--- a/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/CrawlJobExtractorActor.java
+++ b/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/CrawlJobExtractorActor.java
@@ -12,6 +12,7 @@ import nu.marginalia.storage.model.FileStorageType;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import java.io.IOException;
 import java.net.URL;
 import java.nio.file.Files;
 import java.nio.file.Path;
@@ -22,9 +23,10 @@ import static nu.marginalia.crawlspec.CrawlSpecGenerator.*;
 @Singleton
 public class CrawlJobExtractorActor extends RecordActorPrototype {
 
+    private static final int INITIAL_URLS_PER_DOMAIN = Integer.getInteger("crawler.initialUrlsPerDomain", 250);
     private final Logger logger = LoggerFactory.getLogger(getClass());
-
     private final FileStorageService fileStorageService;
+
     @Inject
     public CrawlJobExtractorActor(Gson gson,
                                   FileStorageService fileStorageService
@@ -44,12 +46,10 @@ public class CrawlJobExtractorActor extends RecordActorPrototype {
 
                 Path urlsTxt = storage.asPath().resolve("urls.txt");
 
-                try (var os = Files.newOutputStream(urlsTxt, StandardOpenOption.CREATE_NEW);
-                     var is = new URL(url).openStream())
-                {
-                    is.transferTo(os);
+                try {
+                    downloadToFile(url, urlsTxt);
                 }
-                catch (Exception ex) {
+                catch (IOException ex) {
                     fileStorageService.flagFileForDeletion(storage.id());
                     yield new Error("Error downloading " + url);
                 }
@@ -59,7 +59,7 @@ public class CrawlJobExtractorActor extends RecordActorPrototype {
                 generateCrawlSpec(
                         path,
                         DomainSource.fromFile(urlsTxt),
-                        KnownUrlsCountSource.fixed(200),
+                        KnownUrlsCountSource.fixed(INITIAL_URLS_PER_DOMAIN),
                         KnownUrlsListSource.justIndex()
                 );
 
@@ -69,6 +69,14 @@ public class CrawlJobExtractorActor extends RecordActorPrototype {
         };
     }
 
+    private void downloadToFile(String url, Path urlsTxt) throws IOException {
+        try (var os = Files.newOutputStream(urlsTxt, StandardOpenOption.CREATE_NEW);
+             var is = new URL(url).openStream())
+        {
+            is.transferTo(os);
+        }
+    }
+
     @Override
     public String describe() {
         return "Run the crawler job extractor process";
diff --git a/doc/system-properties.md b/doc/system-properties.md
index dad505ba..0c825e7b 100644
--- a/doc/system-properties.md
+++ b/doc/system-properties.md
@@ -7,6 +7,7 @@ The system will look for a properties file in `conf/properties/system.properties
 within the install dir, as specified by `$WMSA_HOME`.
 
 A template is available in [../run/template/conf/properties/system.properties](../run/template/conf/properties/system.properties).
+
 ## Global
 
 | flag | values | description |
@@ -16,12 +17,16 @@ A template is available in [../run/template/conf/properties/system.properties](.
 
 ## Crawler Properties
 
-| flag                        | values     | description                                                                                |
-|-----------------------------|------------|--------------------------------------------------------------------------------------------|
-| crawler.userAgentString     | string     | Sets the user agent string used by the crawler                                             |
-| crawler.userAgentIdentifier | string     | Sets the user agent identifier used by the crawler, e.g. what it looks for in robots.txt   |
-| crawler.poolSize            | integer    | Sets the number of threads used by the crawler, more is faster, but uses more RAM          |
-| ip-blocklist.disabled       | boolean    | Disables the IP blocklist                                                                  |
+| flag                         | values     | description                                                                                   |
+|------------------------------|------------|-----------------------------------------------------------------------------------------------|
+| crawler.userAgentString      | string     | Sets the user agent string used by the crawler                                                |
+| crawler.userAgentIdentifier  | string     | Sets the user agent identifier used by the crawler, e.g. what it looks for in robots.txt      |
+| crawler.poolSize             | integer    | Sets the number of threads used by the crawler, more is faster, but uses more RAM             |
+| crawler.initialUrlsPerDomain | integer    | Sets the initial number of URLs to crawl per domain (when crawling from spec)                 |
+| crawler.maxUrlsPerDomain     | integer    | Sets the maximum number of URLs to crawl per domain (when recrawling)                         |
+| crawler.minUrlsPerDomain     | integer    | Sets the minimum number of URLs to crawl per domain (when recrawling)                         |
+| crawler.crawlSetGrowthFactor | double     | If 100 documents were fetched last crawl, the goal for this crawl becomes 100 x (this value)  |
+| ip-blocklist.disabled        | boolean    | Disables the IP blocklist                                                                     |
 
 ## Converter Properties
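
As a rough illustration of how the extracted properties interact, the sketch below mirrors the clamp logic from `DbCrawlSpecProvider` in a standalone class. The property names and default values are taken from the patch; the `CrawlBudgetExample` class, the `urlBudget` method, and the `main` method are hypothetical and exist only for the example. Like the patch, it relies on `Math.clamp`, a Java 21 API.

```java
import java.util.List;

// Hypothetical example class; only the property names and defaults come from the patch.
public class CrawlBudgetExample {
    private static final double URL_GROWTH_FACTOR =
            Double.parseDouble(System.getProperty("crawler.crawlSetGrowthFactor", "1.25"));
    private static final int MIN_URLS_PER_DOMAIN = Integer.getInteger("crawler.minUrlsPerDomain", 100);
    private static final int MAX_URLS_PER_DOMAIN = Integer.getInteger("crawler.maxUrlsPerDomain", 10_000);

    /** Grow last crawl's fetched-document count by the growth factor, clamped to the configured bounds. */
    static int urlBudget(int fetchedLastCrawl) {
        // Math.clamp(long, int, int) is a Java 21 API, as used in DbCrawlSpecProvider.
        return Math.clamp((int) (URL_GROWTH_FACTOR * fetchedLastCrawl),
                MIN_URLS_PER_DOMAIN, MAX_URLS_PER_DOMAIN);
    }

    public static void main(String[] args) {
        // With the defaults: 100 -> 125, 10 -> 100 (floor), 1_000_000 -> 10_000 (ceiling)
        System.out.println(List.of(urlBudget(100), urlBudget(10), urlBudget(1_000_000)));
    }
}
```

The properties could be set in `conf/properties/system.properties` (e.g. `crawler.crawlSetGrowthFactor=1.5`, an illustrative value) or passed to the JVM as `-D` flags; when unset, the defaults shown above apply.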