diff --git a/code/common/db/src/main/java/nu/marginalia/db/DomainBlacklistImpl.java b/code/common/db/src/main/java/nu/marginalia/db/DomainBlacklistImpl.java index f8aa8ee6..87be3942 100644 --- a/code/common/db/src/main/java/nu/marginalia/db/DomainBlacklistImpl.java +++ b/code/common/db/src/main/java/nu/marginalia/db/DomainBlacklistImpl.java @@ -16,7 +16,7 @@ public class DomainBlacklistImpl implements DomainBlacklist { private volatile TIntHashSet spamDomainSet = new TIntHashSet(); private final HikariDataSource dataSource; private final Logger logger = LoggerFactory.getLogger(getClass()); - + private final boolean blacklistDisabled = Boolean.getBoolean("no-domain-blacklist"); @Inject public DomainBlacklistImpl(HikariDataSource dataSource) { this.dataSource = dataSource; @@ -27,6 +27,7 @@ public class DomainBlacklistImpl implements DomainBlacklist { } private void updateSpamList() { + try { int oldSetSize = spamDomainSet.size(); @@ -46,6 +47,10 @@ public class DomainBlacklistImpl implements DomainBlacklist { public TIntHashSet getSpamDomains() { final TIntHashSet result = new TIntHashSet(1_000_000); + if (blacklistDisabled) { + return result; + } + try (var connection = dataSource.getConnection()) { try (var stmt = connection.prepareStatement("SELECT EC_DOMAIN.ID FROM EC_DOMAIN INNER JOIN EC_DOMAIN_BLACKLIST ON EC_DOMAIN_BLACKLIST.URL_DOMAIN = EC_DOMAIN.DOMAIN_TOP")) { stmt.setFetchSize(1000); @@ -61,6 +66,7 @@ public class DomainBlacklistImpl implements DomainBlacklist { @Override public boolean isBlacklisted(int domainId) { + if (spamDomainSet.contains(domainId)) { return true; } diff --git a/code/features-crawl/crawl-blocklist/src/main/java/nu/marginalia/ip_blocklist/IpBlockList.java b/code/features-crawl/crawl-blocklist/src/main/java/nu/marginalia/ip_blocklist/IpBlockList.java index cf81c75d..a339b1d4 100644 --- a/code/features-crawl/crawl-blocklist/src/main/java/nu/marginalia/ip_blocklist/IpBlockList.java +++ 
b/code/features-crawl/crawl-blocklist/src/main/java/nu/marginalia/ip_blocklist/IpBlockList.java @@ -22,11 +22,18 @@ public class IpBlockList { private final GeoIpBlocklist geoIpBlocklist; private final Logger logger = LoggerFactory.getLogger(getClass()); private final List badSubnets = new ArrayList<>(); + private final boolean blocklistDisabled = Boolean.getBoolean("no-ip-blocklist"); @Inject public IpBlockList(GeoIpBlocklist geoIpBlocklist) { this.geoIpBlocklist = geoIpBlocklist; + if (blocklistDisabled) { + logger.warn("IP blocklist disabled"); + // no point loading the list here + return; + } + var resource = Objects.requireNonNull( ClassLoader.getSystemResourceAsStream("ip-banned-cidr.txt"), "Could not load IP blacklist"); @@ -52,6 +59,9 @@ public class IpBlockList { final Predicate numericPattern = Pattern.compile(".*\\d{4}.*").asMatchPredicate(); public boolean isAllowed(EdgeDomain domain) { + if (blocklistDisabled) + return true; + if (domain.domain.endsWith(".cn")) { logger.debug("Blocking {} on .cn-end", domain); return false; @@ -64,12 +74,15 @@ public class IpBlockList { try { var hostAddress = InetAddressCache.getAddress(domain).getHostAddress(); var subnet = badSubnets.stream().filter(sn -> sn.isInRange(hostAddress)).findFirst(); + if (subnet.isPresent()) { logger.debug("Blocking {} on IP range: {}", domain, subnet.get()); return false; } + } catch (Throwable t) { - return false; + // Host failed to resolve, deal with crawling error upstream + // to avoid flagging this as a blocked domain } var geo = geoIpBlocklist.isAllowed(domain);