(crawler) Extract additional configuration properties
This commit extracts several previously hardcoded configuration properties and makes them available through system.properties. The documentation is updated to reflect the change, and dead code was removed in the process. CrawlSpecGenerator still feels a bit over-engineered, since it was built for a more general case and all implementations other than the current one have now been removed, but it remains fairly readable, so it is left as-is for now.
Parent: 2079a5574b
Commit: 91c7960800
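Each extracted property follows the same pattern: a static final field read once from the JVM system properties when the class loads, with the old hardcoded value kept as the default, so the flags can be set in conf/properties/system.properties (or as -D flags) and must be present before startup. A minimal compilable sketch of that pattern, using the constants from the diff below (the wrapper class exists only so the snippet stands on its own):

```java
// The pattern used throughout this commit: read a JVM system property once at
// class initialisation, falling back to the previously hardcoded value.
// Constants copied from DbCrawlSpecProvider in the diff below.
class CrawlerSettings {
    static final double URL_GROWTH_FACTOR =
            Double.parseDouble(System.getProperty("crawler.crawlSetGrowthFactor", "1.25"));
    static final int MIN_URLS_PER_DOMAIN = Integer.getInteger("crawler.minUrlsPerDomain", 100);
    static final int MAX_URLS_PER_DOMAIN = Integer.getInteger("crawler.maxUrlsPerDomain", 10_000);
}
```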
CrawlSpecGenerator.java

```diff
@@ -1,6 +1,5 @@
 package nu.marginalia.crawlspec;
 
 import nu.marginalia.db.DbDomainStatsExportMultitool;
 import nu.marginalia.io.crawlspec.CrawlSpecRecordParquetFileWriter;
 import nu.marginalia.model.crawlspec.CrawlSpecRecord;
 
@@ -8,12 +7,9 @@ import java.io.IOException;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.sql.SQLException;
 import java.util.ArrayList;
 import java.util.List;
 
 public class CrawlSpecGenerator {
-    private static final int MIN_VISIT_COUNT = 200;
-    private static final int MAX_VISIT_COUNT = 100000;
 
     public static void generateCrawlSpec(Path output,
                                          DomainSource domains,
@@ -28,9 +24,7 @@ public class CrawlSpecGenerator {
 
                 writer.write(CrawlSpecRecord
                         .builder()
-                        .crawlDepth(calculateCrawlDepthFromVisitedCount(
-                                counts.getKnownUrlCount(domain)
-                        ))
+                        .crawlDepth(counts.getKnownUrlCount(domain))
                         .urls(listSource.getKnownUrls(domain))
                         .domain(domain)
                         .build());
@@ -38,48 +32,9 @@ public class CrawlSpecGenerator {
         }
     }
-
-    private static int calculateCrawlDepthFromVisitedCount(int count) {
-        if (count < MIN_VISIT_COUNT / 2) {
-            /* If we aren't getting very many good documents
-               out of this webpage on previous attempts, we
-               won't dig very deeply this time. This saves time
-               and resources for both the crawler and the server,
-               and also prevents deep crawls on foreign websites we aren't
-               interested in crawling at this point. */
-            count = MIN_VISIT_COUNT;
-        }
-        else {
-            /* If we got many results previously, we'll
-               dig deeper with each successive crawl. */
-            count = count + 1000 + count / 4;
-        }
-
-        if (count > MAX_VISIT_COUNT) {
-            count = MAX_VISIT_COUNT;
-        }
-
-        return count;
-    }
 
     public interface DomainSource {
         List<String> getDomainNames() throws IOException, SQLException;
-
-        static DomainSource combined(DomainSource... sources) {
-            if (sources.length == 0) {
-                return List::of;
-            }
-
-            return () -> {
-                List<String> combined = new ArrayList<>(sources[0].getDomainNames());
-
-                for (int i = 1; i < sources.length; i++) {
-                    combined.addAll(sources[i].getDomainNames());
-                }
-
-                return combined;
-            };
-        }
 
         static DomainSource fromFile(Path file) {
             return () -> {
                 var lines = Files.readAllLines(file);
@@ -93,13 +48,6 @@
             };
         }
 
-        static DomainSource knownUrlsFromDb(DbDomainStatsExportMultitool dbData) {
-            return dbData::getAllIndexedDomains;
-        }
-
-        static DomainSource fromCrawlQueue(DbDomainStatsExportMultitool dbData) {
-            return dbData::getCrawlQueueDomains;
-        }
     }
 
     public interface KnownUrlsCountSource {
@@ -108,12 +56,6 @@
         static KnownUrlsCountSource fixed(int value) {
             return domainName -> value;
         }
-
-        static KnownUrlsCountSource fromDb(DbDomainStatsExportMultitool dbData, int defaultValue) {
-            return domainName ->
-                    dbData.getVisitedUrls(domainName)
-                        .orElse(defaultValue);
-        }
     }
 
     public interface KnownUrlsListSource {
```
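With the dead code gone, the generator is driven by the three small source interfaces that remain: DomainSource, KnownUrlsCountSource and KnownUrlsListSource. A hedged usage sketch, mirroring the call site in CrawlJobExtractorActor shown further down (the paths and wrapper class are made up for illustration; checked exceptions are simply propagated):

```java
import static nu.marginalia.crawlspec.CrawlSpecGenerator.*;

import java.nio.file.Path;

class GenerateSpecSketch {
    public static void main(String[] args) throws Exception {
        Path output  = Path.of("/tmp/crawl-spec.parquet");  // hypothetical destination for the parquet spec
        Path urlsTxt = Path.of("/tmp/urls.txt");            // hypothetical plain-text domain list

        generateCrawlSpec(
                output,
                DomainSource.fromFile(urlsTxt),      // one entry per line in the file
                KnownUrlsCountSource.fixed(250),     // flat crawl depth per domain
                KnownUrlsListSource.justIndex()      // same list source the actor uses
        );
    }
}
```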
DbCrawlSpecProvider.java

```diff
@@ -20,6 +20,10 @@ public class DbCrawlSpecProvider implements CrawlSpecProvider {
 
     private static final Logger logger = LoggerFactory.getLogger(DbCrawlSpecProvider.class);
 
+    private static final double URL_GROWTH_FACTOR = Double.parseDouble(System.getProperty("crawler.crawlSetGrowthFactor", "1.25"));
+    private static final int MIN_URLS_PER_DOMAIN = Integer.getInteger("crawler.minUrlsPerDomain", 100);
+    private static final int MAX_URLS_PER_DOMAIN = Integer.getInteger("crawler.maxUrlsPerDomain", 10_000);
+
     @Inject
     public DbCrawlSpecProvider(HikariDataSource dataSource,
                                ProcessConfiguration processConfiguration
@@ -48,7 +52,7 @@ public class DbCrawlSpecProvider implements CrawlSpecProvider {
         while (rs.next()) {
             domains.add(new CrawlSpecRecord(
                     rs.getString(1),
-                    Math.clamp((int) (1.25 * rs.getInt(2)), 250, 10_000),
+                    Math.clamp((int) (URL_GROWTH_FACTOR * rs.getInt(2)), MIN_URLS_PER_DOMAIN, MAX_URLS_PER_DOMAIN),
                     List.of()
             ));
         }
```
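The recrawl budget is now simply the previous known-URL count scaled by the growth factor and clamped to the configured bounds. A small worked example of that arithmetic using the default property values (the visited counts are made up; Math.clamp requires the JDK 21+ that the diff's own use of it already implies):

```java
public class RecrawlSizingExample {
    public static void main(String[] args) {
        double growthFactor = 1.25; // crawler.crawlSetGrowthFactor default
        int min = 100;              // crawler.minUrlsPerDomain default
        int max = 10_000;           // crawler.maxUrlsPerDomain default

        // Visited-URL counts from a previous crawl, chosen for illustration.
        for (int visited : new int[] { 40, 4_000, 40_000 }) {
            int target = Math.clamp((int) (growthFactor * visited), min, max);
            System.out.println(visited + " visited -> crawl up to " + target + " URLs");
        }
        // Prints: 40 -> 100 (raised to the floor), 4000 -> 5000, 40000 -> 10000 (capped).
    }
}
```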
CrawlJobExtractorActor.java

```diff
@@ -12,6 +12,7 @@ import nu.marginalia.storage.model.FileStorageType;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import java.io.IOException;
 import java.net.URL;
 import java.nio.file.Files;
 import java.nio.file.Path;
@@ -22,9 +23,10 @@ import static nu.marginalia.crawlspec.CrawlSpecGenerator.*;
 @Singleton
 public class CrawlJobExtractorActor extends RecordActorPrototype {
 
+    private static final int INITIAL_URLS_PER_DOMAIN = Integer.getInteger("crawler.initialUrlsPerDomain", 250);
     private final Logger logger = LoggerFactory.getLogger(getClass());
 
     private final FileStorageService fileStorageService;
 
     @Inject
     public CrawlJobExtractorActor(Gson gson,
                                   FileStorageService fileStorageService
@@ -44,12 +46,10 @@ public class CrawlJobExtractorActor extends RecordActorPrototype {
 
                 Path urlsTxt = storage.asPath().resolve("urls.txt");
 
-                try (var os = Files.newOutputStream(urlsTxt, StandardOpenOption.CREATE_NEW);
-                     var is = new URL(url).openStream())
-                {
-                    is.transferTo(os);
+                try {
+                    downloadToFile(url, urlsTxt);
                 }
-                catch (Exception ex) {
+                catch (IOException ex) {
                     fileStorageService.flagFileForDeletion(storage.id());
                     yield new Error("Error downloading " + url);
                 }
@@ -59,7 +59,7 @@ public class CrawlJobExtractorActor extends RecordActorPrototype {
                 generateCrawlSpec(
                         path,
                         DomainSource.fromFile(urlsTxt),
-                        KnownUrlsCountSource.fixed(200),
+                        KnownUrlsCountSource.fixed(INITIAL_URLS_PER_DOMAIN),
                         KnownUrlsListSource.justIndex()
                 );
 
@@ -69,6 +69,14 @@ public class CrawlJobExtractorActor extends RecordActorPrototype {
         };
     }
 
+    private void downloadToFile(String url, Path urlsTxt) throws IOException {
+        try (var os = Files.newOutputStream(urlsTxt, StandardOpenOption.CREATE_NEW);
+             var is = new URL(url).openStream())
+        {
+            is.transferTo(os);
+        }
+    }
+
     @Override
     public String describe() {
         return "Run the crawler job extractor process";
```
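Moving the stream copy into downloadToFile lets the actor narrow its catch clause from Exception to IOException, so only I/O failures take the flag-for-deletion error path while anything unexpected still propagates. A standalone sketch of the same pattern (the class name, destination path and URL are made up for illustration):

```java
import java.io.IOException;
import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;

class DownloadExample {
    // Same shape as the extracted helper: declares IOException so callers can catch narrowly.
    static void downloadToFile(String url, Path dest) throws IOException {
        try (var os = Files.newOutputStream(dest, StandardOpenOption.CREATE_NEW); // fail if dest already exists
             var is = new URL(url).openStream()) {
            is.transferTo(os);
        }
    }

    public static void main(String[] args) {
        Path dest = Path.of("/tmp/urls.txt"); // hypothetical destination
        try {
            downloadToFile("https://example.com/urls.txt", dest); // hypothetical URL
        }
        catch (IOException ex) {
            // Only I/O problems land here; clean up and report, as the actor does.
            System.err.println("Error downloading: " + ex.getMessage());
        }
    }
}
```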
Documentation: system.properties reference

```diff
@@ -7,6 +7,7 @@ The system will look for a properties file in `conf/properties/system.properties`
 within the install dir, as specified by `$WMSA_HOME`.
 
 A template is available in [../run/template/conf/properties/system.properties](../run/template/conf/properties/system.properties).
 
 ## Global
 
 | flag | values | description |
@@ -16,12 +17,16 @@ A template is available in [../run/template/conf/properties/system.properties](.
 
 ## Crawler Properties
 
-| flag | values | description |
-|-----------------------------|------------|------------------------------------------------------------------------------------------|
-| crawler.userAgentString | string | Sets the user agent string used by the crawler |
-| crawler.userAgentIdentifier | string | Sets the user agent identifier used by the crawler, e.g. what it looks for in robots.txt |
-| crawler.poolSize | integer | Sets the number of threads used by the crawler, more is faster, but uses more RAM |
-| ip-blocklist.disabled | boolean | Disables the IP blocklist |
+| flag | values | description |
+|------------------------------|------------|---------------------------------------------------------------------------------------------|
+| crawler.userAgentString | string | Sets the user agent string used by the crawler |
+| crawler.userAgentIdentifier | string | Sets the user agent identifier used by the crawler, e.g. what it looks for in robots.txt |
+| crawler.poolSize | integer | Sets the number of threads used by the crawler, more is faster, but uses more RAM |
+| crawler.initialUrlsPerDomain | integer | Sets the initial number of URLs to crawl per domain (when crawling from spec) |
+| crawler.maxUrlsPerDomain | integer | Sets the maximum number of URLs to crawl per domain (when recrawling) |
+| crawler.minUrlsPerDomain | integer | Sets the minimum number of URLs to crawl per domain (when recrawling) |
+| crawler.crawlSetGrowthFactor | double | If 100 documents were fetched last crawl, increase the goal to 100 x (this value) this time |
+| ip-blocklist.disabled | boolean | Disables the IP blocklist |
 
 ## Converter Properties
 
```
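Taken together, a crawler-sizing excerpt of system.properties might look like the following (the values shown are simply the code's defaults, given here as an example rather than a recommendation):

```properties
# conf/properties/system.properties (excerpt; example values matching the code defaults)
crawler.initialUrlsPerDomain=250
crawler.minUrlsPerDomain=100
crawler.maxUrlsPerDomain=10000
crawler.crawlSetGrowthFactor=1.25
```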