(crawler) Extract additional configuration properties

This commit extracts several previously hardcoded configuration properties and makes them available through system.properties.

The documentation is updated to reflect the change.

Dead code was also removed in the process. CrawlSpecGenerator still feels a bit over-engineered, since it was built for a more general case and all implementations other than the current one have now been removed, but it is left as-is for now since it remains fairly readable.
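For reference, the keys extracted here (with the in-code defaults from the diff below) can be overridden in conf/properties/system.properties; a sketch of such an entry, values shown being the defaults:

    # crawler sizing knobs introduced in this commit (defaults shown; override as needed)
    crawler.initialUrlsPerDomain=250
    crawler.minUrlsPerDomain=100
    crawler.maxUrlsPerDomain=10000
    crawler.crawlSetGrowthFactor=1.25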
Viktor Lofgren 2024-01-20 10:36:04 +01:00
parent 2079a5574b
commit 91c7960800
4 changed files with 32 additions and 73 deletions

CrawlSpecGenerator.java

@@ -1,6 +1,5 @@
 package nu.marginalia.crawlspec;
-import nu.marginalia.db.DbDomainStatsExportMultitool;
 import nu.marginalia.io.crawlspec.CrawlSpecRecordParquetFileWriter;
 import nu.marginalia.model.crawlspec.CrawlSpecRecord;
@@ -8,12 +7,9 @@ import java.io.IOException;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.sql.SQLException;
-import java.util.ArrayList;
 import java.util.List;
 public class CrawlSpecGenerator {
-    private static final int MIN_VISIT_COUNT = 200;
-    private static final int MAX_VISIT_COUNT = 100000;
     public static void generateCrawlSpec(Path output,
                                          DomainSource domains,
@@ -28,9 +24,7 @@ public class CrawlSpecGenerator {
             writer.write(CrawlSpecRecord
                     .builder()
-                    .crawlDepth(calculateCrawlDepthFromVisitedCount(
-                            counts.getKnownUrlCount(domain)
-                    ))
+                    .crawlDepth(counts.getKnownUrlCount(domain))
                     .urls(listSource.getKnownUrls(domain))
                     .domain(domain)
                     .build());
@@ -38,48 +32,9 @@ public class CrawlSpecGenerator {
         }
     }
-    private static int calculateCrawlDepthFromVisitedCount(int count) {
-        if (count < MIN_VISIT_COUNT / 2) {
-            /* If we aren't getting very many good documents
-               out of this webpage on previous attempts, we
-               won't dig very deeply this time. This saves time
-               and resources for both the crawler and the server,
-               and also prevents deep crawls on foreign websites we aren't
-               interested in crawling at this point. */
-            count = MIN_VISIT_COUNT;
-        }
-        else {
-            /* If we got many results previously, we'll
-               dig deeper with each successive crawl. */
-            count = count + 1000 + count / 4;
-        }
-        if (count > MAX_VISIT_COUNT) {
-            count = MAX_VISIT_COUNT;
-        }
-        return count;
-    }
     public interface DomainSource {
         List<String> getDomainNames() throws IOException, SQLException;
-        static DomainSource combined(DomainSource... sources) {
-            if (sources.length == 0) {
-                return List::of;
-            }
-            return () -> {
-                List<String> combined = new ArrayList<>(sources[0].getDomainNames());
-                for (int i = 1; i < sources.length; i++) {
-                    combined.addAll(sources[i].getDomainNames());
-                }
-                return combined;
-            };
-        }
         static DomainSource fromFile(Path file) {
             return () -> {
                 var lines = Files.readAllLines(file);
@@ -93,13 +48,6 @@ public class CrawlSpecGenerator {
             };
         }
-        static DomainSource knownUrlsFromDb(DbDomainStatsExportMultitool dbData) {
-            return dbData::getAllIndexedDomains;
-        }
-        static DomainSource fromCrawlQueue(DbDomainStatsExportMultitool dbData) {
-            return dbData::getCrawlQueueDomains;
-        }
     }
     public interface KnownUrlsCountSource {
@@ -108,12 +56,6 @@ public class CrawlSpecGenerator {
         static KnownUrlsCountSource fixed(int value) {
             return domainName -> value;
         }
-        static KnownUrlsCountSource fromDb(DbDomainStatsExportMultitool dbData, int defaultValue) {
-            return domainName ->
-                    dbData.getVisitedUrls(domainName)
-                            .orElse(defaultValue);
-        }
     }
     public interface KnownUrlsListSource {
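With the unused database- and queue-backed sources gone, the remaining wiring of these interfaces is the file-based one used by CrawlJobExtractorActor further down. A minimal sketch of that usage (the wrapper class and file paths are illustrative; the static import mirrors the actor's):

    import static nu.marginalia.crawlspec.CrawlSpecGenerator.*;

    import java.io.IOException;
    import java.nio.file.Path;
    import java.sql.SQLException;

    class SpecFromFileSketch {
        // Writes a crawl spec for the domains listed (one per line) in urlsTxt.
        static void writeSpec(Path urlsTxt, Path output) throws IOException, SQLException {
            int initialUrls = Integer.getInteger("crawler.initialUrlsPerDomain", 250);

            generateCrawlSpec(
                    output,                                  // parquet output file
                    DomainSource.fromFile(urlsTxt),          // domains to crawl
                    KnownUrlsCountSource.fixed(initialUrls), // same crawl depth for every domain
                    KnownUrlsListSource.justIndex()          // no pre-known URL lists
            );
        }
    }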

DbCrawlSpecProvider.java

@@ -20,6 +20,10 @@ public class DbCrawlSpecProvider implements CrawlSpecProvider {
     private static final Logger logger = LoggerFactory.getLogger(DbCrawlSpecProvider.class);
+    private static final double URL_GROWTH_FACTOR = Double.parseDouble(System.getProperty("crawler.crawlSetGrowthFactor", "1.25"));
+    private static final int MIN_URLS_PER_DOMAIN = Integer.getInteger("crawler.minUrlsPerDomain", 100);
+    private static final int MAX_URLS_PER_DOMAIN = Integer.getInteger("crawler.maxUrlsPerDomain", 10_000);
     @Inject
     public DbCrawlSpecProvider(HikariDataSource dataSource,
                                ProcessConfiguration processConfiguration
@@ -48,7 +52,7 @@ public class DbCrawlSpecProvider implements CrawlSpecProvider {
         while (rs.next()) {
             domains.add(new CrawlSpecRecord(
                     rs.getString(1),
-                    Math.clamp((int) (1.25 * rs.getInt(2)), 250, 10_000),
+                    Math.clamp((int) (URL_GROWTH_FACTOR * rs.getInt(2)), MIN_URLS_PER_DOMAIN, MAX_URLS_PER_DOMAIN),
                     List.of()
             ));
         }
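In other words, the recrawl goal per domain is now growthFactor times the number of URLs fetched in the previous crawl, clamped between the min and max properties. A small self-contained illustration with the default values (Math.clamp is the Java 21+ API the changed line also uses):

    // Sketch: the new recrawl sizing rule with the default property values.
    public class CrawlSizingExample {
        public static void main(String[] args) {
            double growthFactor = 1.25; // crawler.crawlSetGrowthFactor default
            int min = 100;              // crawler.minUrlsPerDomain default
            int max = 10_000;           // crawler.maxUrlsPerDomain default

            for (int prev : new int[] { 40, 400, 40_000 }) {
                int goal = Math.clamp((int) (growthFactor * prev), min, max);
                System.out.println(prev + " URLs last crawl -> new goal " + goal);
            }
            // Prints: 40 -> 100 (raised to the floor), 400 -> 500 (+25%), 40000 -> 10000 (capped)
        }
    }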

CrawlJobExtractorActor.java

@@ -12,6 +12,7 @@ import nu.marginalia.storage.model.FileStorageType;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
+import java.io.IOException;
 import java.net.URL;
 import java.nio.file.Files;
 import java.nio.file.Path;
@@ -22,9 +23,10 @@ import static nu.marginalia.crawlspec.CrawlSpecGenerator.*;
 @Singleton
 public class CrawlJobExtractorActor extends RecordActorPrototype {
+    private static final int INITIAL_URLS_PER_DOMAIN = Integer.getInteger("crawler.initialUrlsPerDomain", 250);
     private final Logger logger = LoggerFactory.getLogger(getClass());
     private final FileStorageService fileStorageService;
     @Inject
     public CrawlJobExtractorActor(Gson gson,
                                   FileStorageService fileStorageService
@@ -44,12 +46,10 @@ public class CrawlJobExtractorActor extends RecordActorPrototype {
                 Path urlsTxt = storage.asPath().resolve("urls.txt");
-                try (var os = Files.newOutputStream(urlsTxt, StandardOpenOption.CREATE_NEW);
-                     var is = new URL(url).openStream())
-                {
-                    is.transferTo(os);
+                try {
+                    downloadToFile(url, urlsTxt);
                 }
-                catch (Exception ex) {
+                catch (IOException ex) {
                     fileStorageService.flagFileForDeletion(storage.id());
                     yield new Error("Error downloading " + url);
                 }
@@ -59,7 +59,7 @@ public class CrawlJobExtractorActor extends RecordActorPrototype {
                 generateCrawlSpec(
                         path,
                         DomainSource.fromFile(urlsTxt),
-                        KnownUrlsCountSource.fixed(200),
+                        KnownUrlsCountSource.fixed(INITIAL_URLS_PER_DOMAIN),
                         KnownUrlsListSource.justIndex()
                 );
@@ -69,6 +69,14 @@ public class CrawlJobExtractorActor extends RecordActorPrototype {
         };
     }
+    private void downloadToFile(String url, Path urlsTxt) throws IOException {
+        try (var os = Files.newOutputStream(urlsTxt, StandardOpenOption.CREATE_NEW);
+             var is = new URL(url).openStream())
+        {
+            is.transferTo(os);
+        }
+    }
     @Override
     public String describe() {
         return "Run the crawler job extractor process";

System properties documentation

@@ -7,6 +7,7 @@ The system will look for a properties file in `conf/properties/system.properties`
 within the install dir, as specified by `$WMSA_HOME`.
 A template is available in [../run/template/conf/properties/system.properties](../run/template/conf/properties/system.properties).
 ## Global
 | flag | values | description |
@@ -16,12 +17,16 @@ A template is available in [../run/template/conf/properties/system.properties](.
 ## Crawler Properties
 | flag | values | description |
-|-----------------------------|------------|----------------------------------------------------------------------------------------------|
+|------------------------------|------------|----------------------------------------------------------------------------------------------|
 | crawler.userAgentString      | string     | Sets the user agent string used by the crawler                                                |
 | crawler.userAgentIdentifier  | string     | Sets the user agent identifier used by the crawler, e.g. what it looks for in robots.txt      |
 | crawler.poolSize             | integer    | Sets the number of threads used by the crawler, more is faster, but uses more RAM             |
-| ip-blocklist.disabled        | boolean    | Disables the IP blocklist                                                                     |
+| crawler.initialUrlsPerDomain | integer    | Sets the initial number of URLs to crawl per domain (when crawling from spec)                 |
+| crawler.maxUrlsPerDomain     | integer    | Sets the maximum number of URLs to crawl per domain (when recrawling)                         |
+| crawler.minUrlsPerDomain     | integer    | Sets the minimum number of URLs to crawl per domain (when recrawling)                         |
+| crawler.crawlSetGrowthFactor | double     | If 100 documents were fetched last crawl, increase the goal to 100 x (this value) this time   |
+| ip-blocklist.disabled        | boolean    | Disables the IP blocklist                                                                     |
 ## Converter Properties