(*) Refactor GeoIP-related code

In this commit, GeoIP-related classes are refactored and relocated to a common library as they are shared across multiple services.

The crawler is refactored to enable the GeoIpBlocklist to use the new GeoIpDictionary as the base of its decisions.

The converter is modified to query this data to add a geoip:-keyword to documents to permit limiting a search to the country of the hosting server.

The commit also adds due BY-SA attribution in the search engine footer for the source of the IP geolocation data.
This commit is contained in:
Viktor Lofgren 2023-12-10 17:30:43 +01:00
parent 84b4158555
commit f655ec5a5c
21 changed files with 135 additions and 91 deletions

View File

@ -15,6 +15,7 @@ dependencies {
implementation project(':code:common:model')
implementation project(':code:common:config')
implementation project(':code:libraries:guarded-regex')
implementation project(':code:libraries:geo-ip')
implementation libs.notnull

View File

@ -1,73 +1,31 @@
package nu.marginalia.ip_blocklist;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import com.opencsv.CSVReader;
import com.opencsv.exceptions.CsvValidationException;
import lombok.AllArgsConstructor;
import nu.marginalia.WmsaHome;
import nu.marginalia.geoip.GeoIpDictionary;
import nu.marginalia.model.EdgeDomain;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.FileReader;
import java.io.IOException;
import java.net.InetAddress;
import java.util.Set;
import java.util.TreeMap;
@Singleton
public class GeoIpBlocklist {
private final TreeMap<Long, GeoIpBlocklist.IpRange> ranges = new TreeMap<>();
/** These countries are extremely overrepresented among the problematic and spammy domains,
* and blocking them is by far the most effective spam mitigation technique. Sucks we throw
* babies out with the bathwater, but it's undeniably effective.
*/
private final Set<String> blacklist = Set.of("CN", "HK");
private final Set<String> graylist = Set.of("RU", "TW", "IN", "ZA", "SG", "UA");
private static final Logger logger = LoggerFactory.getLogger(GeoIpBlocklist.class);
/** One row of the IP location database: a numeric address range and its country code. */
@AllArgsConstructor
static class IpRange {
// Numeric start of the range; also used as the key in the ranges map
public final long from;
// Numeric end of the range; lookups compare the address against it with a strict less-than
public final long to;
// Country code as read from the third CSV column
public final String country;
}
private final GeoIpDictionary ipDictionary;
public GeoIpBlocklist() throws IOException, CsvValidationException {
var resource = WmsaHome.getIPLocationDatabse();
try (var reader = new CSVReader(new FileReader(resource.toFile()))) {
for (;;) {
String[] vals = reader.readNext();
if (vals == null) {
break;
}
if (!(blacklist.contains(vals[2]) || graylist.contains(vals[2]))) {
continue;
}
var range = new GeoIpBlocklist.IpRange(Long.parseLong(vals[0]),
Long.parseLong(vals[1]),
vals[2]);
ranges.put(range.from, range);
}
}
logger.info("Loaded {} IP ranges", ranges.size());
}
public String getCountry(InetAddress address) {
byte[] bytes = address.getAddress();
long ival = ((long)bytes[0]&0xFF) << 24 | ((long)bytes[1]&0xFF) << 16 | ((long)bytes[2]&0xFF)<< 8 | ((long)bytes[3]&0xFF);
Long key = ranges.floorKey(ival);
if (null == key) {
return "-";
}
var range = ranges.get(key);
if (ival >= key && ival < range.to) {
return range.country;
}
return "-";
/**
 * Creates a blocklist that resolves countries through the given dictionary.
 * The constructor calls {@code waitReady()} on the dictionary before returning;
 * the result of that call is not inspected.
 *
 * @param ipDictionary the GeoIP dictionary used for country lookups
 */
@Inject
public GeoIpBlocklist(GeoIpDictionary ipDictionary) {
    this.ipDictionary = ipDictionary;

    this.ipDictionary.waitReady();
}
public boolean isAllowed(EdgeDomain domain) {
@ -85,7 +43,7 @@ public class GeoIpBlocklist {
public String getCountry(EdgeDomain domain) {
try {
return getCountry(InetAddressCache.getAddress(domain));
return ipDictionary.getCountry(InetAddressCache.getAddress(domain));
}
catch (Throwable ex) {
logger.debug("Failed to resolve {}", domain);

View File

@ -11,7 +11,7 @@ import java.util.concurrent.TimeUnit;
// We don't want to torture the DNS by resolving the same links over and over and over again
public class InetAddressCache {
private static final Cache<EdgeDomain, InetAddress> cache = CacheBuilder.newBuilder().maximumSize(10_000_000).expireAfterAccess(1, TimeUnit.HOURS).build();
private static final Cache<EdgeDomain, InetAddress> cache = CacheBuilder.newBuilder().maximumSize(1_000_000).expireAfterAccess(1, TimeUnit.HOURS).build();
public static InetAddress getAddress(EdgeDomain domain) throws Throwable {
try {
return cache.get(domain, ()-> InetAddress.getByName(domain.getAddress()));

View File

@ -0,0 +1,24 @@
// Build script for the geo-ip library module.
plugins {
id 'java'
}
// Build with a Java 21 toolchain.
java {
toolchain {
languageVersion.set(JavaLanguageVersion.of(21))
}
}
dependencies {
implementation project(':code:common:config')
// Logging API
implementation libs.bundles.slf4j
// CSV reader used to parse the IP location database
implementation libs.opencsv
// Test-only dependencies
testImplementation libs.bundles.slf4j.test
testImplementation libs.bundles.junit
testImplementation libs.mockito
}
// Run the tests on the JUnit Platform.
test {
useJUnitPlatform()
}

View File

@ -0,0 +1,6 @@
This micro library handles GeoIP lookups, i.e. mapping IP addresses
to country codes.
It uses the free ip2location lite database, which is
available from [https://lite.ip2location.com/database/ip-country](https://lite.ip2location.com/database/ip-country)
under a CC-BY-SA 4.0 license.

View File

@ -1,7 +1,6 @@
package nu.marginalia.assistant.domains;
package nu.marginalia.geoip;
import com.opencsv.CSVReader;
import lombok.AllArgsConstructor;
import nu.marginalia.WmsaHome;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -14,12 +13,7 @@ public class GeoIpDictionary {
private volatile TreeMap<Long, IpRange> ranges = null;
private static final Logger logger = LoggerFactory.getLogger(GeoIpDictionary.class);
@AllArgsConstructor
static class IpRange {
public final long from;
public final long to;
public final String country;
}
record IpRange(long from, long to, String country) {}
public GeoIpDictionary() {
Thread.ofPlatform().start(() -> {
@ -39,10 +33,28 @@ public class GeoIpDictionary {
ranges = dict;
logger.info("Loaded {} IP ranges", ranges.size());
} catch (Exception e) {
ranges = new TreeMap<>();
throw new RuntimeException(e);
}
finally {
this.notifyAll();
}
});
}
/**
 * Non-blocking readiness check.
 *
 * @return true once the range table has been assigned, false while it is still null
 */
public boolean isReady() {
    return ranges != null;
}
/**
 * Blocks the calling thread until the range table has been assigned.
 *
 * @return true once the dictionary is ready; false if the calling thread was
 *         interrupted while waiting, in which case its interrupt flag is set
 *         again so callers further up the stack can still observe it
 */
public boolean waitReady() {
    // Object.wait() may only be called by the owner of the object's monitor;
    // calling it outside a synchronized block throws IllegalMonitorStateException.
    synchronized (this) {
        while (null == ranges) {
            try {
                // Bounded wait: the volatile field is re-checked periodically, so
                // readiness is noticed even if no notification reaches this thread.
                this.wait(250);
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
                return false;
            }
        }
    }
    return true;
}
public String getCountry(String ip) {

View File

@ -8,7 +8,6 @@ import org.apache.parquet.schema.*;
import org.jetbrains.annotations.NotNull;
import org.jetbrains.annotations.Nullable;
import java.sql.Array;
import java.util.ArrayList;
import java.util.List;

View File

@ -1,5 +1,14 @@
package nu.marginalia.model.processed;
import lombok.AllArgsConstructor;
import lombok.EqualsAndHashCode;
import lombok.NoArgsConstructor;
import lombok.ToString;
@AllArgsConstructor
@NoArgsConstructor
@EqualsAndHashCode
@ToString
public class DomainWithIp {
public String domain;
public String ip;

View File

@ -41,6 +41,7 @@ dependencies {
implementation project(':code:libraries:guarded-regex')
implementation project(':code:libraries:easy-lsh')
implementation project(':code:libraries:geo-ip')
implementation project(':code:libraries:big-string')
implementation project(':code:libraries:language-processing')

View File

@ -10,6 +10,7 @@ import nu.marginalia.converting.model.ProcessedDocument;
import nu.marginalia.converting.processor.logic.links.LinkGraph;
import nu.marginalia.crawling.io.SerializableCrawlDataStream;
import nu.marginalia.crawling.model.*;
import nu.marginalia.geoip.GeoIpDictionary;
import nu.marginalia.model.crawl.DomainIndexingState;
import nu.marginalia.converting.model.ProcessedDomain;
import nu.marginalia.model.EdgeDomain;
@ -30,6 +31,7 @@ public class DomainProcessor {
private final AnchorTagsSource anchorTagsSource;
private final AnchorTextKeywords anchorTextKeywords;
private final LshDocumentDeduplicator documentDeduplicator;
private final GeoIpDictionary geoIpDictionary;
private final Logger logger = LoggerFactory.getLogger(getClass());
@ -38,17 +40,21 @@ public class DomainProcessor {
SiteWords siteWords,
AnchorTagsSourceFactory anchorTagsSourceFactory,
AnchorTextKeywords anchorTextKeywords,
LshDocumentDeduplicator documentDeduplicator) throws SQLException
LshDocumentDeduplicator documentDeduplicator, GeoIpDictionary geoIpDictionary) throws SQLException
{
this.documentProcessor = documentProcessor;
this.siteWords = siteWords;
this.anchorTextKeywords = anchorTextKeywords;
this.documentDeduplicator = documentDeduplicator;
this.anchorTagsSource = anchorTagsSourceFactory.create();
this.geoIpDictionary = geoIpDictionary;
}
@SneakyThrows
public ProcessedDomain process(SerializableCrawlDataStream dataStream) {
geoIpDictionary.waitReady();
var ret = new ProcessedDomain();
List<ProcessedDocument> docs = new ArrayList<>();
@ -107,7 +113,14 @@ public class DomainProcessor {
// Add late keywords and features from domain-level information
List<String> terms = new ArrayList<>();
terms.add("ip:"+ip);
String geoIp = geoIpDictionary.getCountry(ip);
if (!geoIp.isBlank()) {
terms.add("geoip:"+geoIp.toLowerCase());
}
if (cookies) {
terms.add(HtmlFeature.COOKIES.getKeyword());
}

View File

@ -6,6 +6,7 @@ import lombok.SneakyThrows;
import nu.marginalia.WmsaHome;
import nu.marginalia.converting.processor.DomainProcessor;
import nu.marginalia.crawl.retreival.CrawlerRetreiver;
import nu.marginalia.crawl.retreival.DomainProber;
import nu.marginalia.crawl.retreival.fetcher.HttpFetcher;
import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl;
import nu.marginalia.crawling.io.SerializableCrawlDataStream;
@ -75,7 +76,7 @@ public class CrawlingThenConvertingIntegrationTest {
private CrawledDomain crawl(CrawlSpecRecord specs) {
List<SerializableCrawlData> data = new ArrayList<>();
new CrawlerRetreiver(httpFetcher, specs, data::add).fetch();
new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, data::add).fetch();
CrawledDomain domain = data.stream().filter(CrawledDomain.class::isInstance).map(CrawledDomain.class::cast).findFirst().get();
data.stream().filter(CrawledDocument.class::isInstance).map(CrawledDocument.class::cast).forEach(domain.doc::add);

View File

@ -11,6 +11,7 @@ import nu.marginalia.WmsaHome;
import nu.marginalia.atags.source.AnchorTagsSource;
import nu.marginalia.atags.source.AnchorTagsSourceFactory;
import nu.marginalia.crawl.retreival.CrawlDataReference;
import nu.marginalia.crawl.retreival.DomainProber;
import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl;
import nu.marginalia.crawl.spec.CrawlSpecProvider;
import nu.marginalia.crawl.spec.DbCrawlSpecProvider;
@ -56,6 +57,7 @@ public class CrawlerMain {
private final UserAgent userAgent;
private final MessageQueueFactory messageQueueFactory;
private final DomainProber domainProber;
private final FileStorageService fileStorageService;
private final DbCrawlSpecProvider dbCrawlSpecProvider;
private final AnchorTagsSourceFactory anchorTagsSourceFactory;
@ -75,7 +77,7 @@ public class CrawlerMain {
@Inject
public CrawlerMain(UserAgent userAgent,
ProcessHeartbeatImpl heartbeat,
MessageQueueFactory messageQueueFactory,
MessageQueueFactory messageQueueFactory, DomainProber domainProber,
FileStorageService fileStorageService,
ProcessConfiguration processConfiguration,
DbCrawlSpecProvider dbCrawlSpecProvider,
@ -84,6 +86,7 @@ public class CrawlerMain {
this.heartbeat = heartbeat;
this.userAgent = userAgent;
this.messageQueueFactory = messageQueueFactory;
this.domainProber = domainProber;
this.fileStorageService = fileStorageService;
this.dbCrawlSpecProvider = dbCrawlSpecProvider;
this.anchorTagsSourceFactory = anchorTagsSourceFactory;
@ -219,7 +222,7 @@ public class CrawlerMain {
var domainLinks = anchorTagsSource.getAnchorTags(domain);
var retreiver = new CrawlerRetreiver(fetcher, specification, writer::accept);
var retreiver = new CrawlerRetreiver(fetcher, domainProber, specification, writer::accept);
int size = retreiver.fetch(domainLinks, reference);
workLog.setJobToFinished(domain, writer.getOutputFile().toString(), size);

View File

@ -42,7 +42,7 @@ public class CrawlerRetreiver {
private static final UrlBlocklist urlBlocklist = new UrlBlocklist();
private static final LinkFilterSelector linkFilterSelector = new LinkFilterSelector();
private static final DomainProber domainProber = new DomainProber();
private final DomainProber domainProber;
private final SitemapRetriever sitemapRetriever;
private final DomainCrawlFrontier crawlFrontier;
@ -55,9 +55,11 @@ public class CrawlerRetreiver {
private static final String documentWasSameTag = "SAME-BY-COMPARISON";
public CrawlerRetreiver(HttpFetcher fetcher,
DomainProber domainProber,
CrawlSpecRecord specs,
Consumer<SerializableCrawlData> writer) {
this.fetcher = fetcher;
this.domainProber = domainProber;
domain = specs.domain;

View File

@ -1,5 +1,7 @@
package nu.marginalia.crawl.retreival;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.crawl.retreival.fetcher.FetchResultState;
import nu.marginalia.crawl.retreival.fetcher.HttpFetcher;
import nu.marginalia.crawling.model.CrawlerDomainStatus;
@ -11,17 +13,21 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.annotation.Nullable;
import java.util.function.Predicate;
@Singleton
public class DomainProber {
private final Logger logger = LoggerFactory.getLogger(DomainProber.class);
private static IpBlockList ipBlockList;
private final Predicate<EdgeDomain> domainBlacklist;
static {
try {
ipBlockList = new IpBlockList(new GeoIpBlocklist());
} catch (Exception e) {
throw new RuntimeException(e);
}
/**
 * Creates a prober whose domain check is delegated to the given IP block list.
 * Note that despite the field name, the stored predicate answers "is this domain
 * allowed?": a false result makes the probe report the domain as blocked.
 *
 * @param ipBlockList block list whose isAllowed method is used as the domain check
 */
@Inject
public DomainProber(IpBlockList ipBlockList) {
this.domainBlacklist = ipBlockList::isAllowed;
}
/** For testing: lets the caller supply the domain check directly,
 * e.g. {@code d -> true} to allow every domain.
 *
 * @param domainBlacklist predicate that returns true for domains that may be probed
 */
public DomainProber(Predicate<EdgeDomain> domainBlacklist) {
this.domainBlacklist = domainBlacklist;
}
/** To detect problems early we do a probing request to the domain before we start crawling it properly.
@ -37,7 +43,7 @@ public class DomainProber {
return new ProbeResultError(CrawlerDomainStatus.ERROR, "No known URLs");
}
if (!ipBlockList.isAllowed(firstUrlInQueue.domain))
if (!domainBlacklist.test(firstUrlInQueue.domain))
return new ProbeResultError(CrawlerDomainStatus.BLOCKED, "IP not allowed");
var fetchResult = fetcher.probeDomain(firstUrlInQueue.withPathAndParam("/", null));
@ -62,7 +68,7 @@ public class DomainProber {
/** This domain redirects to another domain */
public record ProbeResultRedirect(EdgeDomain domain) implements ProbeResult {}
/** If the retreivala of the probed url was successful, return the url as it was fetched
/** If the retrieval of the probed url was successful, return the url as it was fetched
* (which may be different from the url we probed, if we attempted another URL schema).
*
* @param probedUrl The url we successfully probed

View File

@ -15,7 +15,6 @@ import nu.marginalia.model.EdgeUrl;
import nu.marginalia.crawl.retreival.logic.ContentTypeLogic;
import nu.marginalia.crawl.retreival.logic.ContentTypeParser;
import okhttp3.*;
import org.apache.commons.collections4.queue.PredicatedQueue;
import org.apache.commons.io.input.BOMInputStream;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
@ -87,7 +86,10 @@ public class HttpFetcherImpl implements HttpFetcher {
}
@Inject
public HttpFetcherImpl(@Named("user-agent") String userAgent, Dispatcher dispatcher, ConnectionPool connectionPool) {
public HttpFetcherImpl(@Named("user-agent") String userAgent,
Dispatcher dispatcher,
ConnectionPool connectionPool)
{
this.client = createClient(dispatcher, connectionPool);
this.userAgent = userAgent;
}

View File

@ -3,6 +3,7 @@ package nu.marginalia.crawling.retreival;
import crawlercommons.robots.SimpleRobotRules;
import lombok.SneakyThrows;
import nu.marginalia.crawl.retreival.CrawlerRetreiver;
import nu.marginalia.crawl.retreival.DomainProber;
import nu.marginalia.crawl.retreival.fetcher.*;
import nu.marginalia.crawling.model.CrawledDocument;
import nu.marginalia.crawling.model.CrawlerDocumentStatus;
@ -68,7 +69,7 @@ public class CrawlerMockFetcherTest {
registerUrlClasspathData(new EdgeUrl("https://startrek.website/c/startrek"), "mock-crawl-data/lemmy/c_startrek.html");
registerUrlClasspathData(new EdgeUrl("https://startrek.website/post/108995"), "mock-crawl-data/lemmy/108995.html");
new CrawlerRetreiver(fetcherMock, new CrawlSpecRecord("startrek.website", 10, new ArrayList<>()), out::add)
new CrawlerRetreiver(fetcherMock, new DomainProber(d -> true), new CrawlSpecRecord("startrek.website", 10, new ArrayList<>()), out::add)
.fetch();
out.forEach(System.out::println);
@ -80,7 +81,7 @@ public class CrawlerMockFetcherTest {
registerUrlClasspathData(new EdgeUrl("https://en.wikipedia.org/"), "mock-crawl-data/mediawiki/index.html");
new CrawlerRetreiver(fetcherMock, new CrawlSpecRecord("en.wikipedia.org", 10, new ArrayList<>()), out::add)
new CrawlerRetreiver(fetcherMock, new DomainProber(d -> true), new CrawlSpecRecord("en.wikipedia.org", 10, new ArrayList<>()), out::add)
.fetch();
out.forEach(System.out::println);
@ -94,7 +95,7 @@ public class CrawlerMockFetcherTest {
registerUrlClasspathData(new EdgeUrl("https://community.tt-rss.org/t/telegram-channel-to-idle-on/3501"), "mock-crawl-data/discourse/telegram.html");
registerUrlClasspathData(new EdgeUrl("https://community.tt-rss.org/t/combined-mode-but-grid/4489"), "mock-crawl-data/discourse/grid.html");
new CrawlerRetreiver(fetcherMock, new CrawlSpecRecord("community.tt-rss.org", 100, new ArrayList<>()), out::add)
new CrawlerRetreiver(fetcherMock, new DomainProber(d -> true), new CrawlSpecRecord("community.tt-rss.org", 100, new ArrayList<>()), out::add)
.fetch();
out.forEach(System.out::println);

View File

@ -5,6 +5,7 @@ import nu.marginalia.WmsaHome;
import nu.marginalia.atags.model.DomainLinks;
import nu.marginalia.crawl.retreival.CrawlDataReference;
import nu.marginalia.crawl.retreival.CrawlerRetreiver;
import nu.marginalia.crawl.retreival.DomainProber;
import nu.marginalia.crawl.retreival.fetcher.HttpFetcher;
import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl;
import nu.marginalia.crawling.io.CrawledDomainReader;
@ -53,7 +54,7 @@ class CrawlerRetreiverTest {
List<SerializableCrawlData> data = new ArrayList<>();
new CrawlerRetreiver(httpFetcher, specs, data::add).fetch();
new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, data::add).fetch();
var fetchedUrls =
data.stream().filter(CrawledDocument.class::isInstance)
@ -82,7 +83,7 @@ class CrawlerRetreiverTest {
List<SerializableCrawlData> data = new ArrayList<>();
new CrawlerRetreiver(httpFetcher, specs, data::add).fetch();
new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, data::add).fetch();
data.stream().filter(CrawledDocument.class::isInstance)
.map(CrawledDocument.class::cast)
@ -118,7 +119,7 @@ class CrawlerRetreiverTest {
var writer = new CrawledDomainWriter(out, specs.domain, "idid");
Map<Class<? extends SerializableCrawlData>, List<SerializableCrawlData>> data = new HashMap<>();
new CrawlerRetreiver(httpFetcher, specs, d -> {
new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, d -> {
data.computeIfAbsent(d.getClass(), k->new ArrayList<>()).add(d);
if (d instanceof CrawledDocument doc) {
System.out.println(doc.url + ": " + doc.recrawlState + "\t" + doc.httpStatus);
@ -136,7 +137,7 @@ class CrawlerRetreiverTest {
CrawledDomain domain = (CrawledDomain) data.get(CrawledDomain.class).get(0);
domain.doc = data.get(CrawledDocument.class).stream().map(CrawledDocument.class::cast).collect(Collectors.toList());
new CrawlerRetreiver(httpFetcher, specs, d -> {
new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, d -> {
if (d instanceof CrawledDocument doc) {
System.out.println(doc.url + ": " + doc.recrawlState + "\t" + doc.httpStatus);
}

View File

@ -99,7 +99,6 @@
</section>
<section id="legal">
<h1>Policies</h1>
This website complies with the GDPR by <em>not collecting any personal
information</em>, and with the EU Cookie Directive by <em>not using
cookies</em>. <a href="https://memex.marginalia.nu/projects/edge/privacy.gmi">More Information</a>.
@ -109,8 +108,13 @@
<h1> Open Source </h1>
The search engine is open source with an AGPL license. The sources can be perused at
<tt><a href="https://git.marginalia.nu/">https://git.marginalia.nu/</a></tt>.
<h1>Data Sources</h1>
IP geolocation is sourced from the IP2Location LITE data available from
<a rel="external noopener nofollow" href="https://lite.ip2location.com/">https://lite.ip2location.com/</a>
under
<a rel="external noopener nofollow" href="https://creativecommons.org/licenses/by-sa/4.0/">CC-BY-SA&nbsp;4.0</a>.
</section>
</footer>
<script src="/tts.js"></script>

View File

@ -33,6 +33,7 @@ dependencies {
implementation project(':code:features-search:screenshots')
implementation project(':code:libraries:geo-ip')
implementation project(':code:libraries:language-processing')
implementation project(':code:libraries:term-frequency-dict')

View File

@ -1,9 +1,8 @@
package nu.marginalia.assistant.domains;
import com.zaxxer.hikari.HikariDataSource;
import lombok.SneakyThrows;
import nu.marginalia.geoip.GeoIpDictionary;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.crawl.DomainIndexingState;
import nu.marginalia.db.DbDomainQueries;
import nu.marginalia.assistant.client.model.DomainInformation;
import org.slf4j.Logger;

View File

@ -12,6 +12,7 @@ include 'code:services-application:dating-service'
include 'code:services-application:explorer-service'
include 'code:libraries:array'
include 'code:libraries:geo-ip'
include 'code:libraries:btree'
include 'code:libraries:easy-lsh'
include 'code:libraries:guarded-regex'