diff --git a/code/process-models/crawl-spec/src/main/java/nu/marginalia/crawlspec/CrawlSpecGenerator.java b/code/process-models/crawl-spec/src/main/java/nu/marginalia/crawlspec/CrawlSpecGenerator.java
index a8140232..abd831e5 100644
--- a/code/process-models/crawl-spec/src/main/java/nu/marginalia/crawlspec/CrawlSpecGenerator.java
+++ b/code/process-models/crawl-spec/src/main/java/nu/marginalia/crawlspec/CrawlSpecGenerator.java
@@ -1,6 +1,5 @@
 package nu.marginalia.crawlspec;
 
-import nu.marginalia.db.DbDomainStatsExportMultitool;
 import nu.marginalia.io.crawlspec.CrawlSpecRecordParquetFileWriter;
 import nu.marginalia.model.crawlspec.CrawlSpecRecord;
 
@@ -8,12 +7,9 @@ import java.io.IOException;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.sql.SQLException;
-import java.util.ArrayList;
 import java.util.List;
 
 public class CrawlSpecGenerator {
-    private static final int MIN_VISIT_COUNT = 200;
-    private static final int MAX_VISIT_COUNT = 100000;
 
     public static void generateCrawlSpec(Path output,
                                          DomainSource domains,
@@ -28,9 +24,7 @@ public class CrawlSpecGenerator {
 
             writer.write(CrawlSpecRecord
                     .builder()
-                    .crawlDepth(calculateCrawlDepthFromVisitedCount(
-                            counts.getKnownUrlCount(domain)
-                    ))
+                    .crawlDepth(counts.getKnownUrlCount(domain))
                     .urls(listSource.getKnownUrls(domain))
                     .domain(domain)
                     .build());
@@ -38,48 +32,9 @@ public class CrawlSpecGenerator {
         }
     }
 
-    private static int calculateCrawlDepthFromVisitedCount(int count) {
-        if (count < MIN_VISIT_COUNT / 2) {
-            /* If we aren't getting very many good documents
-               out of this webpage on previous attempts, we
-               won't dig very deeply this time.  This saves time
-               and resources for both the crawler and the server,
-               and also prevents deep crawls on foreign websites we aren't
-               interested in crawling at this point. */
-            count = MIN_VISIT_COUNT;
-        }
-        else {
-            /* If we got many results previously, we'll
-               dig deeper with each successive crawl. */
-            count = count + 1000 + count / 4;
-        }
-
-        if (count > MAX_VISIT_COUNT) {
-            count = MAX_VISIT_COUNT;
-        }
-
-        return count;
-    }
-
    public interface DomainSource {
        List<String> getDomainNames() throws IOException, SQLException;

-        static DomainSource combined(DomainSource... sources) {
-            if (sources.length == 0) {
-                return List::of;
-            }
-
-            return () -> {
-                List<String> combined = new ArrayList<>(sources[0].getDomainNames());
-
-                for (int i = 1; i < sources.length; i++) {
-                    combined.addAll(sources[i].getDomainNames());
-                }
-
-                return combined;
-            };
-        }
-
        static DomainSource fromFile(Path file) {
            return () -> {
                var lines = Files.readAllLines(file);
@@ -93,13 +48,6 @@ public class CrawlSpecGenerator {
            };
        }

-        static DomainSource knownUrlsFromDb(DbDomainStatsExportMultitool dbData) {
-            return dbData::getAllIndexedDomains;
-        }
-
-        static DomainSource fromCrawlQueue(DbDomainStatsExportMultitool dbData) {
-            return dbData::getCrawlQueueDomains;
-        }
    }

    public interface KnownUrlsCountSource {
@@ -108,12 +56,6 @@ public class CrawlSpecGenerator {
        static KnownUrlsCountSource fixed(int value) {
            return domainName -> value;
        }
-
-        static KnownUrlsCountSource fromDb(DbDomainStatsExportMultitool dbData, int defaultValue) {
-            return domainName ->
-                    dbData.getVisitedUrls(domainName)
-                            .orElse(defaultValue);
-        }
    }

    public interface KnownUrlsListSource {
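With the adaptive depth heuristic gone, the generator writes the count source's value into the spec verbatim; budget shaping now happens at crawl time in DbCrawlSpecProvider (next file). For orientation, a minimal caller of the simplified API might look like the following sketch; it uses only names visible in this patch, and the paths are hypothetical:

```java
import nu.marginalia.crawlspec.CrawlSpecGenerator;
import static nu.marginalia.crawlspec.CrawlSpecGenerator.*;

import java.nio.file.Path;

class GenerateSpecSketch {
    public static void main(String[] args) throws Exception {
        Path output = Path.of("/tmp/crawl-spec.parquet"); // hypothetical output location
        Path urlsTxt = Path.of("/tmp/urls.txt");          // hypothetical domain list, one per line

        // The fixed count is passed through unchanged as the crawl depth
        CrawlSpecGenerator.generateCrawlSpec(
                output,
                DomainSource.fromFile(urlsTxt),
                KnownUrlsCountSource.fixed(250),
                KnownUrlsListSource.justIndex()
        );
    }
}
```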
diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/spec/DbCrawlSpecProvider.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/spec/DbCrawlSpecProvider.java
index c0dfda01..756b4dd8 100644
--- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/spec/DbCrawlSpecProvider.java
+++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/spec/DbCrawlSpecProvider.java
@@ -20,6 +20,10 @@ public class DbCrawlSpecProvider implements CrawlSpecProvider {
 
     private static final Logger logger = LoggerFactory.getLogger(DbCrawlSpecProvider.class);
 
+    private static final double URL_GROWTH_FACTOR = Double.parseDouble(System.getProperty("crawler.crawlSetGrowthFactor", "1.25"));
+    private static final int MIN_URLS_PER_DOMAIN = Integer.getInteger("crawler.minUrlsPerDomain", 100);
+    private static final int MAX_URLS_PER_DOMAIN = Integer.getInteger("crawler.maxUrlsPerDomain", 10_000);
+
     @Inject
     public DbCrawlSpecProvider(HikariDataSource dataSource,
                                ProcessConfiguration processConfiguration
@@ -48,7 +52,7 @@ public class DbCrawlSpecProvider implements CrawlSpecProvider {
             while (rs.next()) {
                 domains.add(new CrawlSpecRecord(
                         rs.getString(1),
-                        Math.clamp((int) (1.25 * rs.getInt(2)), 250, 10_000),
+                        Math.clamp((int) (URL_GROWTH_FACTOR * rs.getInt(2)), MIN_URLS_PER_DOMAIN, MAX_URLS_PER_DOMAIN),
                         List.of()
                 ));
             }
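The per-domain budget is now the previous visit count scaled by the growth factor and clamped to the configured bounds (Java 21's `Math.clamp`). A standalone sketch of the arithmetic, using this patch's defaults and made-up visit counts:

```java
public class CrawlBudgetSketch {
    // Defaults as wired up in DbCrawlSpecProvider above
    static final double GROWTH_FACTOR = 1.25;
    static final int MIN_URLS = 100;
    static final int MAX_URLS = 10_000;

    // Same expression as the patched provider: scale, truncate, clamp (Java 21+)
    static int budget(int visitedLastCrawl) {
        return Math.clamp((int) (GROWTH_FACTOR * visitedLastCrawl), MIN_URLS, MAX_URLS);
    }

    public static void main(String[] args) {
        System.out.println(budget(40));    // 50, clamped up to the 100 floor
        System.out.println(budget(2_000)); // 2500, within bounds
        System.out.println(budget(9_000)); // 11250, clamped down to the 10000 ceiling
    }
}
```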
diff --git a/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/CrawlJobExtractorActor.java b/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/CrawlJobExtractorActor.java
index b3e5ac6e..faba7b05 100644
--- a/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/CrawlJobExtractorActor.java
+++ b/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/CrawlJobExtractorActor.java
@@ -12,6 +12,7 @@ import nu.marginalia.storage.model.FileStorageType;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import java.io.IOException;
 import java.net.URL;
 import java.nio.file.Files;
 import java.nio.file.Path;
@@ -22,9 +23,10 @@ import static nu.marginalia.crawlspec.CrawlSpecGenerator.*;
 
 @Singleton
 public class CrawlJobExtractorActor extends RecordActorPrototype {
+    private static final int INITIAL_URLS_PER_DOMAIN = Integer.getInteger("crawler.initialUrlsPerDomain", 250);
     private final Logger logger = LoggerFactory.getLogger(getClass());
-
     private final FileStorageService fileStorageService;
+
     @Inject
     public CrawlJobExtractorActor(Gson gson,
                                   FileStorageService fileStorageService
@@ -44,12 +46,10 @@ public class CrawlJobExtractorActor extends RecordActorPrototype {
 
                 Path urlsTxt = storage.asPath().resolve("urls.txt");
 
-                try (var os = Files.newOutputStream(urlsTxt, StandardOpenOption.CREATE_NEW);
-                     var is = new URL(url).openStream())
-                {
-                    is.transferTo(os);
+                try {
+                    downloadToFile(url, urlsTxt);
                 }
-                catch (Exception ex) {
+                catch (IOException ex) {
                     fileStorageService.flagFileForDeletion(storage.id());
                     yield new Error("Error downloading " + url);
                 }
@@ -59,7 +59,7 @@ public class CrawlJobExtractorActor extends RecordActorPrototype {
 
                 generateCrawlSpec(
                         path,
                         DomainSource.fromFile(urlsTxt),
-                        KnownUrlsCountSource.fixed(200),
+                        KnownUrlsCountSource.fixed(INITIAL_URLS_PER_DOMAIN),
                         KnownUrlsListSource.justIndex()
                 );
@@ -69,6 +69,14 @@ public class CrawlJobExtractorActor extends RecordActorPrototype {
         };
     }
 
+    private void downloadToFile(String url, Path urlsTxt) throws IOException {
+        try (var os = Files.newOutputStream(urlsTxt, StandardOpenOption.CREATE_NEW);
+             var is = new URL(url).openStream())
+        {
+            is.transferTo(os);
+        }
+    }
+
     @Override
     public String describe() {
         return "Run the crawler job extractor process";
diff --git a/doc/system-properties.md b/doc/system-properties.md
index dad505ba..0c825e7b 100644
--- a/doc/system-properties.md
+++ b/doc/system-properties.md
@@ -7,6 +7,7 @@ The system will look for a properties file in `conf/properties/system.properties
 within the install dir, as specified by `$WMSA_HOME`.
 
 A template is available in [../run/template/conf/properties/system.properties](../run/template/conf/properties/system.properties).
+
 ## Global
 
 | flag | values | description |
@@ -16,12 +17,16 @@ A template is available in [../run/template/conf/properties/system.properties](.
 
 ## Crawler Properties
 
-| flag                        | values     | description                                                                               |
-|-----------------------------|------------|------------------------------------------------------------------------------------------|
-| crawler.userAgentString     | string     | Sets the user agent string used by the crawler                                            |
-| crawler.userAgentIdentifier | string     | Sets the user agent identifier used by the crawler, e.g. what it looks for in robots.txt  |
-| crawler.poolSize            | integer    | Sets the number of threads used by the crawler, more is faster, but uses more RAM         |
-| ip-blocklist.disabled       | boolean    | Disables the IP blocklist                                                                  |
+| flag                         | values     | description                                                                                   |
+|------------------------------|------------|-----------------------------------------------------------------------------------------------|
+| crawler.userAgentString      | string     | Sets the user agent string used by the crawler                                                |
+| crawler.userAgentIdentifier  | string     | Sets the user agent identifier used by the crawler, e.g. what it looks for in robots.txt      |
+| crawler.poolSize             | integer    | Sets the number of threads used by the crawler; more is faster, but uses more RAM             |
+| crawler.initialUrlsPerDomain | integer    | Sets the initial number of URLs to crawl per domain (when crawling from a spec)               |
+| crawler.maxUrlsPerDomain     | integer    | Sets the maximum number of URLs to crawl per domain (when recrawling)                         |
+| crawler.minUrlsPerDomain     | integer    | Sets the minimum number of URLs to crawl per domain (when recrawling)                         |
+| crawler.crawlSetGrowthFactor | double     | Growth factor for the recrawl goal; if 100 documents were fetched last crawl, the next goal is 100 × this value |
+| ip-blocklist.disabled        | boolean    | Disables the IP blocklist                                                                     |
 
 ## Converter Properties
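For reference, the new knobs can be set in `conf/properties/system.properties` like any other flag in the table above. An illustrative snippet restating the defaults this patch wires in (shown for format, not as tuning advice):

```properties
# Per-domain URL budget when bootstrapping a crawl from a spec file
crawler.initialUrlsPerDomain=250

# Floor and ceiling for the per-domain budget when recrawling
crawler.minUrlsPerDomain=100
crawler.maxUrlsPerDomain=10000

# Next goal = documents fetched last crawl x this factor, then clamped
crawler.crawlSetGrowthFactor=1.25
```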