(*) Overhaul settings and properties

Use a system.properties file to configure the system.  This is loaded statically by MainClass or ProcessMainClass.  Update the property names to be more consistent, and update the documentation to reflect the changes.
This commit is contained in:
Viktor Lofgren 2024-01-13 17:12:18 +01:00
parent 176b9c9666
commit 7c6e18f7a7
25 changed files with 160 additions and 75 deletions

View File

@ -64,7 +64,7 @@ public class IndexClient extends AbstractDynamicClient {
.postGet(ctx, node, "/search/", specs, SearchResultSet.class).onErrorReturn(t -> new SearchResultSet())
.observeOn(Schedulers.io());
} catch (RouteNotConfiguredException ex) {
return Observable.error(ex);
return Observable.empty();
}
})
.reduce(SearchResultSet::combine)

View File

@ -1,3 +1,3 @@
package nu.marginalia;
public record UserAgent(String uaString) {}
/**
 * The crawler's user agent, split into its two roles.
 *
 * @param uaString     the full string sent in the {@code User-agent} HTTP request header
 * @param uaIdentifier the short agent name handed to the robots.txt parser when
 *                     matching rules for this crawler
 */
public record UserAgent(String uaString, String uaIdentifier) {}

View File

@ -12,19 +12,19 @@ import java.util.Optional;
import java.util.stream.Stream;
public class WmsaHome {
public static UserAgent getUserAgent() throws IOException {
var uaPath = getHomePath().resolve("conf/user-agent");
public static UserAgent getUserAgent() {
if (!Files.exists(uaPath)) {
throw new FileNotFoundException("Could not find " + uaPath);
}
return new UserAgent(Files.readString(uaPath).trim());
return new UserAgent(
System.getProperty("crawler.userAgentString", "Mozilla/5.0 (compatible; Marginalia-like bot; +https://git.marginalia.nu/))"),
System.getProperty("crawler.userAgentIdentifier", "search.marginalia.nu")
);
}
public static Path getUploadDir() {
return Path.of("/uploads");
return Path.of(
System.getProperty("executor.uploadDir", "/uploads")
);
}
public static Path getHomePath() {
@ -93,11 +93,6 @@ public class WmsaHome {
public static Path getAtagsPath() {
return getHomePath().resolve("data/atags.parquet");
}
private static final boolean debugMode = Boolean.getBoolean("wmsa-debug");
public static boolean isDebug() {
return debugMode;
}
}

View File

@ -16,7 +16,7 @@ public class DomainBlacklistImpl implements DomainBlacklist {
private volatile TIntHashSet spamDomainSet = new TIntHashSet();
private final HikariDataSource dataSource;
private final Logger logger = LoggerFactory.getLogger(getClass());
private final boolean blacklistDisabled = Boolean.getBoolean("no-domain-blacklist");
private final boolean blacklistDisabled = Boolean.getBoolean("blacklist.disable");
@Inject
public DomainBlacklistImpl(HikariDataSource dataSource) {
this.dataSource = dataSource;

View File

@ -0,0 +1,33 @@
package nu.marginalia.service;
import nu.marginalia.WmsaHome;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
/**
 * Loads {@code .properties} files from the install directory into the JVM
 * system properties.
 */
public class ConfigLoader {
    private static final Logger logger = LoggerFactory.getLogger(ConfigLoader.class);

    /**
     * Resolves the location of a named configuration file.
     *
     * @param configName base name of the file, without the {@code .properties} suffix
     * @return {@code conf/properties/<configName>.properties} resolved against
     *         {@link WmsaHome#getHomePath()}
     */
    static Path getConfigPath(String configName) {
        return WmsaHome.getHomePath().resolve("conf/properties/" + configName + ".properties");
    }

    /**
     * Loads the given properties file into {@link System#getProperties()}.
     * A missing file is not an error; it is logged and skipped.
     *
     * @param configPath path of the properties file to load
     * @throws java.io.UncheckedIOException if the file exists but cannot be read or parsed
     */
    static void loadConfig(Path configPath) {
        if (!Files.exists(configPath)) {
            logger.info("No config file found at {}", configPath);
            return;
        }

        logger.info("Loading config from {}", configPath);

        try {
            // Read the file exactly once so that what is logged is what is loaded.
            final byte[] rawConfig = Files.readAllBytes(configPath);

            // Properties.load(InputStream) decodes ISO-8859-1; decode the log copy the
            // same way so a non-UTF-8 file cannot fail startup just because of logging.
            // NOTE(review): this logs every value verbatim -- confirm no secrets live in these files.
            logger.info("Config:\n{}",
                    new String(rawConfig, java.nio.charset.StandardCharsets.ISO_8859_1));

            System.getProperties().load(new java.io.ByteArrayInputStream(rawConfig));
        } catch (IOException e) {
            throw new java.io.UncheckedIOException("Failed to load config from " + configPath, e);
        }
    }
}

View File

@ -15,7 +15,14 @@ import java.net.UnknownHostException;
* They must also invoke init() in their main method.
*/
public abstract class MainClass {
private final Logger logger = LoggerFactory.getLogger(getClass());
private static final Logger logger = LoggerFactory.getLogger(MainClass.class);
static {
// Load global config ASAP
ConfigLoader.loadConfig(
ConfigLoader.getConfigPath("system")
);
}
public MainClass() {
RxJavaPlugins.setErrorHandler(this::handleError);
@ -42,11 +49,14 @@ public abstract class MainClass {
protected static void init(ServiceId id, String... args) {
System.setProperty("log4j2.isThreadContextMapInheritable", "true");
System.setProperty("isThreadContextMapInheritable", "true");
System.setProperty("service-name", id.name);
ConfigLoader.loadConfig(
ConfigLoader.getConfigPath(id.name)
);
initJdbc();
initPrometheus();
}

View File

@ -0,0 +1,20 @@
package nu.marginalia.service;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Common base class for process entry points.
 * <p>
 * Initialising this class loads the global {@code system} configuration via
 * {@link ConfigLoader}, so subclasses get it before any of their own code runs.
 */
public abstract class ProcessMainClass {
    private static final Logger logger = LoggerFactory.getLogger(ProcessMainClass.class);

    static {
        // Load global config ASAP
        final var systemConfigPath = ConfigLoader.getConfigPath("system");
        ConfigLoader.loadConfig(systemConfigPath);
    }

    /**
     * Instantiates the MariaDB JDBC driver and discards the instance.
     * NOTE(review): presumably this forces the driver class to load and register
     * itself before any connection is opened -- confirm.
     */
    public ProcessMainClass() {
        new org.mariadb.jdbc.Driver();
    }
}

View File

@ -35,7 +35,7 @@ public class DatabaseModule extends AbstractModule {
dbProperties = loadDbProperties();
if (migrate) {
if (Boolean.getBoolean("disableFlyway")) {
if (Boolean.getBoolean("flyway.disable")) {
logger.info("Flyway disabled");
}
else {

View File

@ -22,7 +22,7 @@ public class IpBlockList {
private final GeoIpBlocklist geoIpBlocklist;
private final Logger logger = LoggerFactory.getLogger(getClass());
private final List<SubnetUtils.SubnetInfo> badSubnets = new ArrayList<>();
private final boolean blocklistDisabled = Boolean.getBoolean("no-ip-blocklist");
private final boolean blocklistDisabled = Boolean.getBoolean("ip-blocklist.disabled");
@Inject
public IpBlockList(GeoIpBlocklist geoIpBlocklist) {

View File

@ -6,24 +6,14 @@ import nu.marginalia.array.algo.IntArraySearch;
import nu.marginalia.array.algo.IntArraySort;
import nu.marginalia.array.algo.IntArrayTransformations;
import nu.marginalia.array.delegate.ShiftedIntArray;
import nu.marginalia.array.delegate.ShiftedLongArray;
import nu.marginalia.array.page.SegmentIntArray;
import nu.marginalia.array.page.SegmentLongArray;
import nu.marginalia.array.scheme.ArrayPartitioningScheme;
import java.io.IOException;
import java.lang.foreign.Arena;
import java.nio.file.Files;
import java.nio.file.Path;
public interface IntArray extends IntArrayBase, IntArrayTransformations, IntArraySearch, IntArraySort {
int WORD_SIZE = 4;
ArrayPartitioningScheme DEFAULT_PARTITIONING_SCHEME
= ArrayPartitioningScheme.forPartitionSize(Integer.getInteger("wmsa.page-size",1<<30) / WORD_SIZE);
int MAX_CONTINUOUS_SIZE = Integer.MAX_VALUE/WORD_SIZE - 16;
static IntArray allocate(long size) {
return SegmentIntArray.onHeap(Arena.ofShared(), size);
}

View File

@ -11,6 +11,7 @@ import nu.marginalia.converting.sideload.SideloadSourceFactory;
import nu.marginalia.converting.writer.ConverterBatchWritableIf;
import nu.marginalia.converting.writer.ConverterBatchWriter;
import nu.marginalia.converting.writer.ConverterWriter;
import nu.marginalia.service.ProcessMainClass;
import nu.marginalia.storage.FileStorageService;
import nu.marginalia.mq.MessageQueueFactory;
import nu.marginalia.mq.MqMessage;
@ -38,7 +39,7 @@ import java.util.concurrent.atomic.AtomicInteger;
import static nu.marginalia.mqapi.ProcessInboxNames.CONVERTER_INBOX;
public class ConverterMain {
public class ConverterMain extends ProcessMainClass {
private static final Logger logger = LoggerFactory.getLogger(ConverterMain.class);
private final DomainProcessor processor;
private final Gson gson;

View File

@ -32,6 +32,7 @@ import java.util.*;
import java.util.regex.Pattern;
public class DomainProcessor {
private static final int SIDELOAD_THRESHOLD = Integer.getInteger("converter.sideloadThreshold", 10_000);
private final DocumentProcessor documentProcessor;
private final SiteWords siteWords;
private final AnchorTagsSource anchorTagsSource;
@ -59,7 +60,7 @@ public class DomainProcessor {
public ConverterBatchWritableIf createWritable(SerializableCrawlDataStream domain) {
final int sizeHint = domain.sizeHint();
if (sizeHint > 10_000) {
if (sizeHint > SIDELOAD_THRESHOLD) {
// If the file is too big, we run a processing mode that doesn't
// require loading the entire dataset into RAM
return sideloadProcessing(domain, sizeHint);

View File

@ -23,6 +23,7 @@ import nu.marginalia.crawling.io.CrawledDomainReader;
import nu.marginalia.crawling.io.CrawlerOutputFile;
import nu.marginalia.crawling.parquet.CrawledDocumentParquetRecordFileWriter;
import nu.marginalia.crawlspec.CrawlSpecFileNames;
import nu.marginalia.service.ProcessMainClass;
import nu.marginalia.storage.FileStorageService;
import nu.marginalia.model.crawlspec.CrawlSpecRecord;
import nu.marginalia.mq.MessageQueueFactory;
@ -51,7 +52,7 @@ import java.util.concurrent.atomic.AtomicInteger;
import static nu.marginalia.mqapi.ProcessInboxNames.CRAWLER_INBOX;
public class CrawlerMain {
public class CrawlerMain extends ProcessMainClass {
private final static Logger logger = LoggerFactory.getLogger(CrawlerMain.class);
private final UserAgent userAgent;
@ -96,10 +97,10 @@ public class CrawlerMain {
this.node = processConfiguration.node();
pool = new SimpleBlockingThreadPool("CrawlerPool",
Integer.getInteger("crawler.pool-size", 256),
Integer.getInteger("crawler.poolSize", 256),
1);
fetcher = new HttpFetcherImpl(userAgent.uaString(),
fetcher = new HttpFetcherImpl(userAgent,
new Dispatcher(),
new ConnectionPool(5, 10, TimeUnit.SECONDS)
);

View File

@ -13,12 +13,12 @@ import java.util.Objects;
public class ContentTypeProber {
private static final Logger logger = LoggerFactory.getLogger(ContentTypeProber.class);
private final String userAgent;
private final String userAgentString;
private final OkHttpClient client;
private final ContentTypeLogic contentTypeLogic = new ContentTypeLogic();
public ContentTypeProber(String userAgent, OkHttpClient httpClient) {
this.userAgent = userAgent;
public ContentTypeProber(String userAgentString, OkHttpClient httpClient) {
this.userAgentString = userAgentString;
this.client = httpClient;
}
@ -35,7 +35,7 @@ public class ContentTypeProber {
logger.debug("Probing suspected binary {}", url);
var headBuilder = new Request.Builder().head()
.addHeader("User-agent", userAgent)
.addHeader("User-agent", userAgentString)
.addHeader("Accept-Encoding", "gzip")
.url(url.toString());

View File

@ -5,6 +5,7 @@ import com.google.inject.name.Named;
import crawlercommons.robots.SimpleRobotRules;
import crawlercommons.robots.SimpleRobotRulesParser;
import lombok.SneakyThrows;
import nu.marginalia.UserAgent;
import nu.marginalia.crawl.retreival.Cookies;
import nu.marginalia.crawl.retreival.RateLimitException;
import nu.marginalia.crawl.retreival.fetcher.ContentTypeProber.ContentTypeProbeResult;
@ -35,7 +36,8 @@ import java.util.concurrent.TimeUnit;
public class HttpFetcherImpl implements HttpFetcher {
private final Logger logger = LoggerFactory.getLogger(getClass());
private final String userAgent;
private final String userAgentString;
private final String userAgentIdentifier;
private final Cookies cookies = new Cookies();
private static final SimpleRobotRulesParser robotsParser = new SimpleRobotRulesParser();
@ -85,18 +87,20 @@ public class HttpFetcherImpl implements HttpFetcher {
}
@Inject
public HttpFetcherImpl(@Named("user-agent") String userAgent,
public HttpFetcherImpl(UserAgent userAgent,
Dispatcher dispatcher,
ConnectionPool connectionPool)
{
this.client = createClient(dispatcher, connectionPool);
this.userAgent = userAgent;
this.contentTypeProber = new ContentTypeProber(userAgent, client);
this.userAgentString = userAgent.uaString();
this.userAgentIdentifier = userAgent.uaIdentifier();
this.contentTypeProber = new ContentTypeProber(userAgentString, client);
}
public HttpFetcherImpl(@Named("user-agent") String userAgent) {
public HttpFetcherImpl(String userAgent) {
this.client = createClient(null, new ConnectionPool());
this.userAgent = userAgent;
this.userAgentString = userAgent;
this.userAgentIdentifier = userAgent;
this.contentTypeProber = new ContentTypeProber(userAgent, client);
}
@ -110,7 +114,7 @@ public class HttpFetcherImpl implements HttpFetcher {
@Override
@SneakyThrows
public FetchResult probeDomain(EdgeUrl url) {
var head = new Request.Builder().head().addHeader("User-agent", userAgent)
var head = new Request.Builder().head().addHeader("User-agent", userAgentString)
.url(url.toString())
.build();
@ -170,7 +174,7 @@ public class HttpFetcherImpl implements HttpFetcher {
getBuilder.url(url.toString())
.addHeader("Accept-Encoding", "gzip")
.addHeader("User-agent", userAgent);
.addHeader("User-agent", userAgentString);
contentTags.paint(getBuilder);
@ -212,7 +216,7 @@ public class HttpFetcherImpl implements HttpFetcher {
getBuilder.url(url.toString())
.addHeader("Accept-Encoding", "gzip")
.addHeader("User-agent", userAgent);
.addHeader("User-agent", userAgentString);
HttpFetchResult result = recorder.fetch(client, getBuilder.build());
@ -220,7 +224,7 @@ public class HttpFetcherImpl implements HttpFetcher {
robotsParser.parseContent(url.toString(),
body,
contentType.toString(),
userAgent)
userAgentIdentifier)
);
}

View File

@ -6,6 +6,7 @@ import com.google.inject.Inject;
import nu.marginalia.IndexLocations;
import nu.marginalia.ProcessConfiguration;
import nu.marginalia.ProcessConfigurationModule;
import nu.marginalia.service.ProcessMainClass;
import nu.marginalia.storage.FileStorageService;
import nu.marginalia.index.construction.ReverseIndexConstructor;
import nu.marginalia.index.forward.ForwardIndexConverter;
@ -38,7 +39,7 @@ import java.util.function.LongPredicate;
import static nu.marginalia.mqapi.ProcessInboxNames.INDEX_CONSTRUCTOR_INBOX;
public class IndexConstructorMain {
public class IndexConstructorMain extends ProcessMainClass {
private final FileStorageService fileStorageService;
private final ProcessHeartbeatImpl heartbeat;
private final MessageQueueFactory messageQueueFactory;

View File

@ -8,6 +8,7 @@ import lombok.Getter;
import lombok.SneakyThrows;
import nu.marginalia.ProcessConfiguration;
import nu.marginalia.ProcessConfigurationModule;
import nu.marginalia.service.ProcessMainClass;
import nu.marginalia.storage.FileStorageService;
import nu.marginalia.linkdb.docs.DocumentDbWriter;
import nu.marginalia.loading.documents.DocumentLoaderService;
@ -37,7 +38,7 @@ import java.util.concurrent.TimeUnit;
import static nu.marginalia.mqapi.ProcessInboxNames.LOADER_INBOX;
public class LoaderMain {
public class LoaderMain extends ProcessMainClass {
private static final Logger logger = LoggerFactory.getLogger(LoaderMain.class);
private final ProcessHeartbeatImpl heartbeat;

View File

@ -8,6 +8,7 @@ import nu.marginalia.model.EdgeDomain;
import nu.marginalia.process.control.ProcessHeartbeat;
import nu.marginalia.process.control.ProcessHeartbeatImpl;
import nu.marginalia.query.client.QueryClient;
import nu.marginalia.service.MainClass;
import nu.marginalia.service.module.DatabaseModule;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -22,7 +23,7 @@ import java.util.stream.IntStream;
import static nu.marginalia.adjacencies.SparseBitVector.*;
public class WebsiteAdjacenciesCalculator {
public class WebsiteAdjacenciesCalculator extends MainClass {
private final HikariDataSource dataSource;
public AdjacenciesData adjacenciesData;
public DomainAliases domainAliases;

View File

@ -19,7 +19,7 @@ public class SearchModule extends AbstractModule {
bind(LanguageModels.class).toInstance(WmsaHome.getLanguageModels());
bind(WebsiteUrl.class).toInstance(new WebsiteUrl(
System.getProperty("website-url", "https://search.marginalia.nu/")));
System.getProperty("search.websiteUrl", "https://search.marginalia.nu/")));
}
@Provides

View File

@ -25,7 +25,8 @@ public class ControlRendererFactory {
@SneakyThrows
public Renderer renderer(String template) {
Map<String, Object> globalContext = Map.of(
"nodes", nodeConfigurationService.getAll()
"nodes", nodeConfigurationService.getAll(),
"hideMarginaliaApp", Boolean.getBoolean("control.hideMarginaliaApp")
);
var baseRenderer = rendererFactory.renderer(template);

View File

@ -8,6 +8,7 @@
<div class="collapse navbar-collapse" id="navbarSupportedContent">
<ul class="navbar-nav me-auto mb-2 mb-lg-0">
<li class="nav-item"><a class="nav-link" href="/">Overview</a></li>
{{#unless hideMarginaliaApp}}
<li class="nav-item dropdown">
<a href="#" class="nav-link dropdown-toggle" data-bs-toggle="dropdown" role="button" aria-expanded="false">Application</a>
<ul class="dropdown-menu">
@ -18,6 +19,7 @@
<li><a class="dropdown-item" href="/review-random-domains" title="Review random domains list">Random Exploration</a></li>
</ul>
</li>
{{/unless}}
<li class="nav-item dropdown">
<a href="#" class="nav-link dropdown-toggle" data-bs-toggle="dropdown" role="button" aria-expanded="false">Index Nodes</a>
<ul class="dropdown-menu">

View File

@ -1,24 +1,37 @@
# System Properties
These are JVM system properties used by each service
These are JVM system properties used by each service. These properties can either
be loaded from a file or passed in as command line arguments, using `$JAVA_OPTS`.
## Search Service
| flag | values | description |
|-------------|------------|-------------------------------------------------------|
| website-url |https://search.marginalia.nu/|Overrides the website URL used in rendering|
The system will look for a properties file in `conf/properties/system.properties`,
within the install dir, as specified by `$WMSA_HOME`.
## Crawler Process
|flag| values | description |
|---|------------|-------------------------------------------------------|
|crawl.rootDirRewrite|/some/path|Sets the base directory of a crawl plan |
A template is available in [../run/template/conf/properties/system.properties](../run/template/conf/properties/system.properties).
## Global
## Converter Process
|flag| values | description |
|---|------------|-------------------------------------------------------|
|crawl.rootDirRewrite|/some/path|Sets the base directory of a crawl plan |
| flag | values | description |
|-------------|------------|--------------------------------------|
| blacklist.disable | boolean | Disables the domain blacklist |
| flyway.disable | boolean | Disables automatic Flyway migrations |
## Loader Process
|flag| values | description |
|---|------------|-------------------------------------------------------|
|local-index-path| /some/path | Selects the location the loader will write index data |
|crawl.rootDirRewrite|/some/path|Sets the base directory of a crawl plan |
## Crawler Properties
| flag | values | description |
|-----------------------------|------------|------------------------------------------------------------------------------------------|
| crawler.userAgentString | string | Sets the user agent string used by the crawler |
| crawler.userAgentIdentifier | string | Sets the user agent identifier used by the crawler, e.g. what it looks for in robots.txt |
| crawler.poolSize | integer | Sets the number of threads used by the crawler, more is faster, but uses more RAM |
| ip-blocklist.disabled | boolean | Disables the IP blocklist |
## Converter Properties
| flag | values | description |
|-----------------------------|------------|----------------------------------------------------------------------------------------------------------------------------------------------------------|
| converter.sideloadThreshold | integer | Threshold value, in number of documents per domain, where a simpler processing method is used which uses less RAM. 10,000 is a good value for ~32GB RAM |
# Marginalia Application Specific
| flag | values | description |
|---------------------------|------------|---------------------------------------------------------------|
| search.websiteUrl | string | Overrides the website URL used in rendering |
| control.hideMarginaliaApp | boolean | Hides the Marginalia Application menu in the control GUI      |

3
run/env/service.env vendored
View File

@ -5,5 +5,4 @@ EXECUTOR_SERVICE_OPTS="-DdistPath=/dist"
CONVERTER_PROCESS_OPTS="-Dservice-name=converter -Dservice-host=0.0.0.0"
CRAWLER_PROCESS_OPTS="-Dservice-name=crawler -Dservice-host=0.0.0.0"
LOADER_PROCESS_OPTS="-Dservice-name=loader -Dservice-host=0.0.0.0"
INDEX_CONSTRUCTION_PROCESS_OPTS="-Dservice-name=index-constructor -Djava.util.concurrent.ForkJoinPool.common.parallelism=4"
SEARCH_SERVICE_OPTS="-Dwebsite-url=http://localhost:8080"
INDEX_CONSTRUCTION_PROCESS_OPTS="-Dservice-name=index-constructor -Djava.util.concurrent.ForkJoinPool.common.parallelism=4"

View File

@ -0,0 +1,13 @@
crawler.userAgentString = Mozilla/5.0 (compatible)
crawler.userAgentIdentifier = GoogleBot
crawler.poolSize = 256
search.websiteUrl = https://localhost:8080/
executor.uploadDir = /uploads
converter.sideloadThreshold = 10000
ip-blocklist.disabled = false
blacklist.disable = false
flyway.disable = false
control.hideMarginaliaApp = false

View File

@ -1 +0,0 @@
PoorlyConfiguredWebCrawler