(*) Overhaul settings and properties
Use a system.properties file to configure the system. This is loaded statically by MainClass or ProcessMainClass. Update the property names to be more consistent, and update the documentation to reflect the changes.
This commit is contained in:
parent
176b9c9666
commit
7c6e18f7a7
@ -64,7 +64,7 @@ public class IndexClient extends AbstractDynamicClient {
|
||||
.postGet(ctx, node, "/search/", specs, SearchResultSet.class).onErrorReturn(t -> new SearchResultSet())
|
||||
.observeOn(Schedulers.io());
|
||||
} catch (RouteNotConfiguredException ex) {
|
||||
return Observable.error(ex);
|
||||
return Observable.empty();
|
||||
}
|
||||
})
|
||||
.reduce(SearchResultSet::combine)
|
||||
|
@ -1,3 +1,3 @@
|
||||
package nu.marginalia;
|
||||
|
||||
public record UserAgent(String uaString) {}
|
||||
public record UserAgent(String uaString, String uaIdentifier) {}
|
||||
|
@ -12,19 +12,19 @@ import java.util.Optional;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
public class WmsaHome {
|
||||
public static UserAgent getUserAgent() throws IOException {
|
||||
var uaPath = getHomePath().resolve("conf/user-agent");
|
||||
public static UserAgent getUserAgent() {
|
||||
|
||||
if (!Files.exists(uaPath)) {
|
||||
throw new FileNotFoundException("Could not find " + uaPath);
|
||||
}
|
||||
|
||||
return new UserAgent(Files.readString(uaPath).trim());
|
||||
return new UserAgent(
|
||||
System.getProperty("crawler.userAgentString", "Mozilla/5.0 (compatible; Marginalia-like bot; +https://git.marginalia.nu/))"),
|
||||
System.getProperty("crawler.userAgentIdentifier", "search.marginalia.nu")
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
public static Path getUploadDir() {
|
||||
return Path.of("/uploads");
|
||||
return Path.of(
|
||||
System.getProperty("executor.uploadDir", "/uploads")
|
||||
);
|
||||
}
|
||||
|
||||
public static Path getHomePath() {
|
||||
@ -93,11 +93,6 @@ public class WmsaHome {
|
||||
public static Path getAtagsPath() {
|
||||
return getHomePath().resolve("data/atags.parquet");
|
||||
}
|
||||
private static final boolean debugMode = Boolean.getBoolean("wmsa-debug");
|
||||
|
||||
public static boolean isDebug() {
|
||||
return debugMode;
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
@ -16,7 +16,7 @@ public class DomainBlacklistImpl implements DomainBlacklist {
|
||||
private volatile TIntHashSet spamDomainSet = new TIntHashSet();
|
||||
private final HikariDataSource dataSource;
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
private final boolean blacklistDisabled = Boolean.getBoolean("no-domain-blacklist");
|
||||
private final boolean blacklistDisabled = Boolean.getBoolean("blacklist.disable");
|
||||
@Inject
|
||||
public DomainBlacklistImpl(HikariDataSource dataSource) {
|
||||
this.dataSource = dataSource;
|
||||
|
@ -0,0 +1,33 @@
|
||||
package nu.marginalia.service;
|
||||
|
||||
import nu.marginalia.WmsaHome;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
|
||||
public class ConfigLoader {
|
||||
private static final Logger logger = LoggerFactory.getLogger(ConfigLoader.class);
|
||||
|
||||
static Path getConfigPath(String configName) {
|
||||
return WmsaHome.getHomePath().resolve("conf/properties/" + configName + ".properties");
|
||||
}
|
||||
|
||||
static void loadConfig(Path configPath) {
|
||||
if (!Files.exists(configPath)) {
|
||||
logger.info("No config file found at {}", configPath);
|
||||
return;
|
||||
}
|
||||
|
||||
logger.info("Loading config from {}", configPath);
|
||||
|
||||
try (var is = Files.newInputStream(configPath)) {
|
||||
logger.info("Config:\n{}", Files.readString(configPath));
|
||||
System.getProperties().load(is);
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
}
|
@ -15,7 +15,14 @@ import java.net.UnknownHostException;
|
||||
* They must also invoke init() in their main method.
|
||||
*/
|
||||
public abstract class MainClass {
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
private static final Logger logger = LoggerFactory.getLogger(MainClass.class);
|
||||
|
||||
static {
|
||||
// Load global config ASAP
|
||||
ConfigLoader.loadConfig(
|
||||
ConfigLoader.getConfigPath("system")
|
||||
);
|
||||
}
|
||||
|
||||
public MainClass() {
|
||||
RxJavaPlugins.setErrorHandler(this::handleError);
|
||||
@ -42,11 +49,14 @@ public abstract class MainClass {
|
||||
|
||||
|
||||
protected static void init(ServiceId id, String... args) {
|
||||
|
||||
System.setProperty("log4j2.isThreadContextMapInheritable", "true");
|
||||
System.setProperty("isThreadContextMapInheritable", "true");
|
||||
System.setProperty("service-name", id.name);
|
||||
|
||||
ConfigLoader.loadConfig(
|
||||
ConfigLoader.getConfigPath(id.name)
|
||||
);
|
||||
|
||||
initJdbc();
|
||||
initPrometheus();
|
||||
}
|
||||
|
@ -0,0 +1,20 @@
|
||||
package nu.marginalia.service;
|
||||
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
public abstract class ProcessMainClass {
|
||||
private static final Logger logger = LoggerFactory.getLogger(ProcessMainClass.class);
|
||||
|
||||
static {
|
||||
// Load global config ASAP
|
||||
ConfigLoader.loadConfig(
|
||||
ConfigLoader.getConfigPath("system")
|
||||
);
|
||||
}
|
||||
|
||||
public ProcessMainClass() {
|
||||
new org.mariadb.jdbc.Driver();
|
||||
}
|
||||
|
||||
}
|
@ -35,7 +35,7 @@ public class DatabaseModule extends AbstractModule {
|
||||
dbProperties = loadDbProperties();
|
||||
|
||||
if (migrate) {
|
||||
if (Boolean.getBoolean("disableFlyway")) {
|
||||
if (Boolean.getBoolean("flyway.disable")) {
|
||||
logger.info("Flyway disabled");
|
||||
}
|
||||
else {
|
||||
|
@ -22,7 +22,7 @@ public class IpBlockList {
|
||||
private final GeoIpBlocklist geoIpBlocklist;
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
private final List<SubnetUtils.SubnetInfo> badSubnets = new ArrayList<>();
|
||||
private final boolean blocklistDisabled = Boolean.getBoolean("no-ip-blocklist");
|
||||
private final boolean blocklistDisabled = Boolean.getBoolean("ip-blocklist.disabled");
|
||||
|
||||
@Inject
|
||||
public IpBlockList(GeoIpBlocklist geoIpBlocklist) {
|
||||
|
@ -6,24 +6,14 @@ import nu.marginalia.array.algo.IntArraySearch;
|
||||
import nu.marginalia.array.algo.IntArraySort;
|
||||
import nu.marginalia.array.algo.IntArrayTransformations;
|
||||
import nu.marginalia.array.delegate.ShiftedIntArray;
|
||||
import nu.marginalia.array.delegate.ShiftedLongArray;
|
||||
import nu.marginalia.array.page.SegmentIntArray;
|
||||
import nu.marginalia.array.page.SegmentLongArray;
|
||||
import nu.marginalia.array.scheme.ArrayPartitioningScheme;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.lang.foreign.Arena;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
|
||||
public interface IntArray extends IntArrayBase, IntArrayTransformations, IntArraySearch, IntArraySort {
|
||||
int WORD_SIZE = 4;
|
||||
|
||||
ArrayPartitioningScheme DEFAULT_PARTITIONING_SCHEME
|
||||
= ArrayPartitioningScheme.forPartitionSize(Integer.getInteger("wmsa.page-size",1<<30) / WORD_SIZE);
|
||||
|
||||
int MAX_CONTINUOUS_SIZE = Integer.MAX_VALUE/WORD_SIZE - 16;
|
||||
|
||||
static IntArray allocate(long size) {
|
||||
return SegmentIntArray.onHeap(Arena.ofShared(), size);
|
||||
}
|
||||
|
@ -11,6 +11,7 @@ import nu.marginalia.converting.sideload.SideloadSourceFactory;
|
||||
import nu.marginalia.converting.writer.ConverterBatchWritableIf;
|
||||
import nu.marginalia.converting.writer.ConverterBatchWriter;
|
||||
import nu.marginalia.converting.writer.ConverterWriter;
|
||||
import nu.marginalia.service.ProcessMainClass;
|
||||
import nu.marginalia.storage.FileStorageService;
|
||||
import nu.marginalia.mq.MessageQueueFactory;
|
||||
import nu.marginalia.mq.MqMessage;
|
||||
@ -38,7 +39,7 @@ import java.util.concurrent.atomic.AtomicInteger;
|
||||
|
||||
import static nu.marginalia.mqapi.ProcessInboxNames.CONVERTER_INBOX;
|
||||
|
||||
public class ConverterMain {
|
||||
public class ConverterMain extends ProcessMainClass {
|
||||
private static final Logger logger = LoggerFactory.getLogger(ConverterMain.class);
|
||||
private final DomainProcessor processor;
|
||||
private final Gson gson;
|
||||
|
@ -32,6 +32,7 @@ import java.util.*;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
public class DomainProcessor {
|
||||
private static final int SIDELOAD_THRESHOLD = Integer.getInteger("converter.sideloadThreshold", 10_000);
|
||||
private final DocumentProcessor documentProcessor;
|
||||
private final SiteWords siteWords;
|
||||
private final AnchorTagsSource anchorTagsSource;
|
||||
@ -59,7 +60,7 @@ public class DomainProcessor {
|
||||
public ConverterBatchWritableIf createWritable(SerializableCrawlDataStream domain) {
|
||||
final int sizeHint = domain.sizeHint();
|
||||
|
||||
if (sizeHint > 10_000) {
|
||||
if (sizeHint > SIDELOAD_THRESHOLD) {
|
||||
// If the file is too big, we run a processing mode that doesn't
|
||||
// require loading the entire dataset into RAM
|
||||
return sideloadProcessing(domain, sizeHint);
|
||||
|
@ -23,6 +23,7 @@ import nu.marginalia.crawling.io.CrawledDomainReader;
|
||||
import nu.marginalia.crawling.io.CrawlerOutputFile;
|
||||
import nu.marginalia.crawling.parquet.CrawledDocumentParquetRecordFileWriter;
|
||||
import nu.marginalia.crawlspec.CrawlSpecFileNames;
|
||||
import nu.marginalia.service.ProcessMainClass;
|
||||
import nu.marginalia.storage.FileStorageService;
|
||||
import nu.marginalia.model.crawlspec.CrawlSpecRecord;
|
||||
import nu.marginalia.mq.MessageQueueFactory;
|
||||
@ -51,7 +52,7 @@ import java.util.concurrent.atomic.AtomicInteger;
|
||||
|
||||
import static nu.marginalia.mqapi.ProcessInboxNames.CRAWLER_INBOX;
|
||||
|
||||
public class CrawlerMain {
|
||||
public class CrawlerMain extends ProcessMainClass {
|
||||
private final static Logger logger = LoggerFactory.getLogger(CrawlerMain.class);
|
||||
|
||||
private final UserAgent userAgent;
|
||||
@ -96,10 +97,10 @@ public class CrawlerMain {
|
||||
this.node = processConfiguration.node();
|
||||
|
||||
pool = new SimpleBlockingThreadPool("CrawlerPool",
|
||||
Integer.getInteger("crawler.pool-size", 256),
|
||||
Integer.getInteger("crawler.poolSize", 256),
|
||||
1);
|
||||
|
||||
fetcher = new HttpFetcherImpl(userAgent.uaString(),
|
||||
fetcher = new HttpFetcherImpl(userAgent,
|
||||
new Dispatcher(),
|
||||
new ConnectionPool(5, 10, TimeUnit.SECONDS)
|
||||
);
|
||||
|
@ -13,12 +13,12 @@ import java.util.Objects;
|
||||
public class ContentTypeProber {
|
||||
|
||||
private static final Logger logger = LoggerFactory.getLogger(ContentTypeProber.class);
|
||||
private final String userAgent;
|
||||
private final String userAgentString;
|
||||
private final OkHttpClient client;
|
||||
private final ContentTypeLogic contentTypeLogic = new ContentTypeLogic();
|
||||
|
||||
public ContentTypeProber(String userAgent, OkHttpClient httpClient) {
|
||||
this.userAgent = userAgent;
|
||||
public ContentTypeProber(String userAgentString, OkHttpClient httpClient) {
|
||||
this.userAgentString = userAgentString;
|
||||
this.client = httpClient;
|
||||
}
|
||||
|
||||
@ -35,7 +35,7 @@ public class ContentTypeProber {
|
||||
logger.debug("Probing suspected binary {}", url);
|
||||
|
||||
var headBuilder = new Request.Builder().head()
|
||||
.addHeader("User-agent", userAgent)
|
||||
.addHeader("User-agent", userAgentString)
|
||||
.addHeader("Accept-Encoding", "gzip")
|
||||
.url(url.toString());
|
||||
|
||||
|
@ -5,6 +5,7 @@ import com.google.inject.name.Named;
|
||||
import crawlercommons.robots.SimpleRobotRules;
|
||||
import crawlercommons.robots.SimpleRobotRulesParser;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.UserAgent;
|
||||
import nu.marginalia.crawl.retreival.Cookies;
|
||||
import nu.marginalia.crawl.retreival.RateLimitException;
|
||||
import nu.marginalia.crawl.retreival.fetcher.ContentTypeProber.ContentTypeProbeResult;
|
||||
@ -35,7 +36,8 @@ import java.util.concurrent.TimeUnit;
|
||||
public class HttpFetcherImpl implements HttpFetcher {
|
||||
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
private final String userAgent;
|
||||
private final String userAgentString;
|
||||
private final String userAgentIdentifier;
|
||||
private final Cookies cookies = new Cookies();
|
||||
|
||||
private static final SimpleRobotRulesParser robotsParser = new SimpleRobotRulesParser();
|
||||
@ -85,18 +87,20 @@ public class HttpFetcherImpl implements HttpFetcher {
|
||||
}
|
||||
|
||||
@Inject
|
||||
public HttpFetcherImpl(@Named("user-agent") String userAgent,
|
||||
public HttpFetcherImpl(UserAgent userAgent,
|
||||
Dispatcher dispatcher,
|
||||
ConnectionPool connectionPool)
|
||||
{
|
||||
this.client = createClient(dispatcher, connectionPool);
|
||||
this.userAgent = userAgent;
|
||||
this.contentTypeProber = new ContentTypeProber(userAgent, client);
|
||||
this.userAgentString = userAgent.uaString();
|
||||
this.userAgentIdentifier = userAgent.uaIdentifier();
|
||||
this.contentTypeProber = new ContentTypeProber(userAgentString, client);
|
||||
}
|
||||
|
||||
public HttpFetcherImpl(@Named("user-agent") String userAgent) {
|
||||
public HttpFetcherImpl(String userAgent) {
|
||||
this.client = createClient(null, new ConnectionPool());
|
||||
this.userAgent = userAgent;
|
||||
this.userAgentString = userAgent;
|
||||
this.userAgentIdentifier = userAgent;
|
||||
this.contentTypeProber = new ContentTypeProber(userAgent, client);
|
||||
}
|
||||
|
||||
@ -110,7 +114,7 @@ public class HttpFetcherImpl implements HttpFetcher {
|
||||
@Override
|
||||
@SneakyThrows
|
||||
public FetchResult probeDomain(EdgeUrl url) {
|
||||
var head = new Request.Builder().head().addHeader("User-agent", userAgent)
|
||||
var head = new Request.Builder().head().addHeader("User-agent", userAgentString)
|
||||
.url(url.toString())
|
||||
.build();
|
||||
|
||||
@ -170,7 +174,7 @@ public class HttpFetcherImpl implements HttpFetcher {
|
||||
|
||||
getBuilder.url(url.toString())
|
||||
.addHeader("Accept-Encoding", "gzip")
|
||||
.addHeader("User-agent", userAgent);
|
||||
.addHeader("User-agent", userAgentString);
|
||||
|
||||
contentTags.paint(getBuilder);
|
||||
|
||||
@ -212,7 +216,7 @@ public class HttpFetcherImpl implements HttpFetcher {
|
||||
|
||||
getBuilder.url(url.toString())
|
||||
.addHeader("Accept-Encoding", "gzip")
|
||||
.addHeader("User-agent", userAgent);
|
||||
.addHeader("User-agent", userAgentString);
|
||||
|
||||
HttpFetchResult result = recorder.fetch(client, getBuilder.build());
|
||||
|
||||
@ -220,7 +224,7 @@ public class HttpFetcherImpl implements HttpFetcher {
|
||||
robotsParser.parseContent(url.toString(),
|
||||
body,
|
||||
contentType.toString(),
|
||||
userAgent)
|
||||
userAgentIdentifier)
|
||||
);
|
||||
|
||||
}
|
||||
|
@ -6,6 +6,7 @@ import com.google.inject.Inject;
|
||||
import nu.marginalia.IndexLocations;
|
||||
import nu.marginalia.ProcessConfiguration;
|
||||
import nu.marginalia.ProcessConfigurationModule;
|
||||
import nu.marginalia.service.ProcessMainClass;
|
||||
import nu.marginalia.storage.FileStorageService;
|
||||
import nu.marginalia.index.construction.ReverseIndexConstructor;
|
||||
import nu.marginalia.index.forward.ForwardIndexConverter;
|
||||
@ -38,7 +39,7 @@ import java.util.function.LongPredicate;
|
||||
|
||||
import static nu.marginalia.mqapi.ProcessInboxNames.INDEX_CONSTRUCTOR_INBOX;
|
||||
|
||||
public class IndexConstructorMain {
|
||||
public class IndexConstructorMain extends ProcessMainClass {
|
||||
private final FileStorageService fileStorageService;
|
||||
private final ProcessHeartbeatImpl heartbeat;
|
||||
private final MessageQueueFactory messageQueueFactory;
|
||||
|
@ -8,6 +8,7 @@ import lombok.Getter;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.ProcessConfiguration;
|
||||
import nu.marginalia.ProcessConfigurationModule;
|
||||
import nu.marginalia.service.ProcessMainClass;
|
||||
import nu.marginalia.storage.FileStorageService;
|
||||
import nu.marginalia.linkdb.docs.DocumentDbWriter;
|
||||
import nu.marginalia.loading.documents.DocumentLoaderService;
|
||||
@ -37,7 +38,7 @@ import java.util.concurrent.TimeUnit;
|
||||
|
||||
import static nu.marginalia.mqapi.ProcessInboxNames.LOADER_INBOX;
|
||||
|
||||
public class LoaderMain {
|
||||
public class LoaderMain extends ProcessMainClass {
|
||||
private static final Logger logger = LoggerFactory.getLogger(LoaderMain.class);
|
||||
|
||||
private final ProcessHeartbeatImpl heartbeat;
|
||||
|
@ -8,6 +8,7 @@ import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.process.control.ProcessHeartbeat;
|
||||
import nu.marginalia.process.control.ProcessHeartbeatImpl;
|
||||
import nu.marginalia.query.client.QueryClient;
|
||||
import nu.marginalia.service.MainClass;
|
||||
import nu.marginalia.service.module.DatabaseModule;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
@ -22,7 +23,7 @@ import java.util.stream.IntStream;
|
||||
|
||||
import static nu.marginalia.adjacencies.SparseBitVector.*;
|
||||
|
||||
public class WebsiteAdjacenciesCalculator {
|
||||
public class WebsiteAdjacenciesCalculator extends MainClass {
|
||||
private final HikariDataSource dataSource;
|
||||
public AdjacenciesData adjacenciesData;
|
||||
public DomainAliases domainAliases;
|
||||
|
@ -19,7 +19,7 @@ public class SearchModule extends AbstractModule {
|
||||
bind(LanguageModels.class).toInstance(WmsaHome.getLanguageModels());
|
||||
|
||||
bind(WebsiteUrl.class).toInstance(new WebsiteUrl(
|
||||
System.getProperty("website-url", "https://search.marginalia.nu/")));
|
||||
System.getProperty("search.websiteUrl", "https://search.marginalia.nu/")));
|
||||
}
|
||||
|
||||
@Provides
|
||||
|
@ -25,7 +25,8 @@ public class ControlRendererFactory {
|
||||
@SneakyThrows
|
||||
public Renderer renderer(String template) {
|
||||
Map<String, Object> globalContext = Map.of(
|
||||
"nodes", nodeConfigurationService.getAll()
|
||||
"nodes", nodeConfigurationService.getAll(),
|
||||
"hideMarginaliaApp", Boolean.getBoolean("control.hideMarginaliaApp")
|
||||
);
|
||||
var baseRenderer = rendererFactory.renderer(template);
|
||||
|
||||
|
@ -8,6 +8,7 @@
|
||||
<div class="collapse navbar-collapse" id="navbarSupportedContent">
|
||||
<ul class="navbar-nav me-auto mb-2 mb-lg-0">
|
||||
<li class="nav-item"><a class="nav-link" href="/">Overview</a></li>
|
||||
{{#unless hideMarginaliaApp}}
|
||||
<li class="nav-item dropdown">
|
||||
<a href="#" class="nav-link dropdown-toggle" data-bs-toggle="dropdown" role="button" aria-expanded="false">Application</a>
|
||||
<ul class="dropdown-menu">
|
||||
@ -18,6 +19,7 @@
|
||||
<li><a class="dropdown-item" href="/review-random-domains" title="Review random domains list">Random Exploration</a></li>
|
||||
</ul>
|
||||
</li>
|
||||
{{/unless}}
|
||||
<li class="nav-item dropdown">
|
||||
<a href="#" class="nav-link dropdown-toggle" data-bs-toggle="dropdown" role="button" aria-expanded="false">Index Nodes</a>
|
||||
<ul class="dropdown-menu">
|
||||
|
@ -1,24 +1,37 @@
|
||||
# System Properties
|
||||
|
||||
These are JVM system properties used by each service
|
||||
These are JVM system properties used by each service. These properties can either
|
||||
be loaded from a file or passed in as command line arguments, using `$JAVA_OPTS`.
|
||||
|
||||
## Search Service
|
||||
| flag | values | description |
|
||||
|-------------|------------|-------------------------------------------------------|
|
||||
| website-url |https://search.marginalia.nu/|Overrides the website URL used in rendering|
|
||||
The system will look for a properties file in `conf/properties/system.properties`,
|
||||
within the install dir, as specified by `$WMSA_HOME`.
|
||||
|
||||
## Crawler Process
|
||||
|flag| values | description |
|
||||
|---|------------|-------------------------------------------------------|
|
||||
|crawl.rootDirRewrite|/some/path|Sets the base directory of a crawl plan |
|
||||
A template is available in [../run/template/conf/properties/system.properties](../run/template/conf/properties/system.properties).
|
||||
## Global
|
||||
|
||||
## Converter Process
|
||||
|flag| values | description |
|
||||
|---|------------|-------------------------------------------------------|
|
||||
|crawl.rootDirRewrite|/some/path|Sets the base directory of a crawl plan |
|
||||
| flag | values | description |
|
||||
|-------------|------------|--------------------------------------|
|
||||
| blacklist.disable | boolean | Disables the domain blacklist |
|
||||
| flyway.disable | boolean | Disables automatic Flyway migrations |
|
||||
|
||||
## Loader Process
|
||||
|flag| values | description |
|
||||
|---|------------|-------------------------------------------------------|
|
||||
|local-index-path| /some/path | Selects the location the loader will write index data |
|
||||
|crawl.rootDirRewrite|/some/path|Sets the base directory of a crawl plan |
|
||||
## Crawler Properties
|
||||
|
||||
| flag | values | description |
|
||||
|-----------------------------|------------|------------------------------------------------------------------------------------------|
|
||||
| crawler.userAgentString | string | Sets the user agent string used by the crawler |
|
||||
| crawler.userAgentIdentifier | string | Sets the user agent identifier used by the crawler, e.g. what it looks for in robots.txt |
|
||||
| crawler.poolSize | integer | Sets the number of threads used by the crawler, more is faster, but uses more RAM |
|
||||
| ip-blocklist.disabled | boolean | Disables the IP blocklist |
|
||||
|
||||
## Converter Properties
|
||||
|
||||
| flag | values | description |
|
||||
|-----------------------------|------------|----------------------------------------------------------------------------------------------------------------------------------------------------------|
|
||||
| converter.sideloadThreshold | integer | Threshold value, in number of documents per domain, where a simpler processing method is used which uses less RAM. 10,000 is a good value for ~32GB RAM |
|
||||
|
||||
# Marginalia Application Specific
|
||||
|
||||
| flag | values | description |
|
||||
|---------------------------|------------|---------------------------------------------------------------|
|
||||
| search.websiteUrl | string | Overrides the website URL used in rendering |
|
||||
| control.hideMarginaliaApp | boolean | Hides the Marginalia application menu in the control GUI |
|
||||
|
3
run/env/service.env
vendored
3
run/env/service.env
vendored
@ -5,5 +5,4 @@ EXECUTOR_SERVICE_OPTS="-DdistPath=/dist"
|
||||
CONVERTER_PROCESS_OPTS="-Dservice-name=converter -Dservice-host=0.0.0.0"
|
||||
CRAWLER_PROCESS_OPTS="-Dservice-name=crawler -Dservice-host=0.0.0.0"
|
||||
LOADER_PROCESS_OPTS="-Dservice-name=loader -Dservice-host=0.0.0.0"
|
||||
INDEX_CONSTRUCTION_PROCESS_OPTS="-Dservice-name=index-constructor -Djava.util.concurrent.ForkJoinPool.common.parallelism=4"
|
||||
SEARCH_SERVICE_OPTS="-Dwebsite-url=http://localhost:8080"
|
||||
INDEX_CONSTRUCTION_PROCESS_OPTS="-Dservice-name=index-constructor -Djava.util.concurrent.ForkJoinPool.common.parallelism=4"
|
13
run/template/conf/properties/system.properties
Normal file
13
run/template/conf/properties/system.properties
Normal file
@ -0,0 +1,13 @@
|
||||
crawler.userAgentString = Mozilla/5.0 (compatible)
|
||||
crawler.userAgentIdentifier = GoogleBot
|
||||
crawler.poolSize = 256
|
||||
|
||||
search.websiteUrl = https://localhost:8080/
|
||||
|
||||
executor.uploadDir = /uploads
|
||||
converter.sideloadThreshold = 10000
|
||||
|
||||
ip-blocklist.disabled = false
|
||||
blacklist.disable = false
|
||||
flyway.disable = false
|
||||
control.hideMarginaliaApp = false
|
@ -1 +0,0 @@
|
||||
PoorlyConfiguredWebCrawler
|
Loading…
Reference in New Issue
Block a user