(*) Refactor GeoIP-related code
In this commit, GeoIP-related classes are refactored and relocated to a common library, as they are shared across multiple services. The crawler is refactored so that the GeoIpBlocklist uses the new GeoIpDictionary as the basis for its decisions. The converter is modified to query this data and add a geoip: keyword to documents, permitting a search to be limited to the country of the hosting server. The commit also adds the due CC-BY-SA attribution in the search engine footer for the source of the IP geolocation data.
parent 84b4158555
commit f655ec5a5c
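As an overview, here is a minimal sketch of how the refactored pieces fit together, assembled from the hunks below. In the application the object graph is wired by Guice; the explicit construction here is illustrative only, and constructor signatures are abridged.

```java
// Illustrative wiring only; in production Guice performs the injection.

// GeoIpDictionary now lives in the shared code/libraries/geo-ip library and
// loads the ip2location data on a background thread when constructed.
GeoIpDictionary dictionary = new GeoIpDictionary();

// Crawler side: GeoIpBlocklist delegates country lookups to the dictionary,
// and DomainProber consults the blocklist via IpBlockList (see those hunks below).
GeoIpBlocklist geoIpBlocklist = new GeoIpBlocklist(dictionary);

// Converter side: DomainProcessor receives the same dictionary and emits a
// geoip:<country> keyword for each processed domain (constructor abridged).
```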
@@ -15,6 +15,7 @@ dependencies {
     implementation project(':code:common:model')
     implementation project(':code:common:config')
     implementation project(':code:libraries:guarded-regex')
+    implementation project(':code:libraries:geo-ip')

     implementation libs.notnull

@@ -1,73 +1,31 @@
 package nu.marginalia.ip_blocklist;

+import com.google.inject.Inject;
 import com.google.inject.Singleton;
-import com.opencsv.CSVReader;
-import com.opencsv.exceptions.CsvValidationException;
-import lombok.AllArgsConstructor;
-import nu.marginalia.WmsaHome;
+import nu.marginalia.geoip.GeoIpDictionary;
 import nu.marginalia.model.EdgeDomain;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

-import java.io.FileReader;
-import java.io.IOException;
-import java.net.InetAddress;
 import java.util.Set;
-import java.util.TreeMap;

 @Singleton
 public class GeoIpBlocklist {
-    private final TreeMap<Long, GeoIpBlocklist.IpRange> ranges = new TreeMap<>();
+    /** These countries are extremely overrepresented among the problematic and spammy domains,
+     * and blocking them is by far the most effective spam mitigation technique. Sucks we throw
+     * babies out with the bathwater, but it's undeniably effective.
+     */
     private final Set<String> blacklist = Set.of("CN", "HK");
     private final Set<String> graylist = Set.of("RU", "TW", "IN", "ZA", "SG", "UA");

     private static final Logger logger = LoggerFactory.getLogger(GeoIpBlocklist.class);

-    @AllArgsConstructor
-    static class IpRange {
-        public final long from;
-        public final long to;
-        public final String country;
-    }
+    private final GeoIpDictionary ipDictionary;

-    public GeoIpBlocklist() throws IOException, CsvValidationException {
-        var resource = WmsaHome.getIPLocationDatabse();
-
-        try (var reader = new CSVReader(new FileReader(resource.toFile()))) {
-            for (;;) {
-                String[] vals = reader.readNext();
-                if (vals == null) {
-                    break;
-                }
-                if (!(blacklist.contains(vals[2]) || graylist.contains(vals[2]))) {
-                    continue;
-                }
-                var range = new GeoIpBlocklist.IpRange(Long.parseLong(vals[0]),
-                        Long.parseLong(vals[1]),
-                        vals[2]);
-                ranges.put(range.from, range);
-            }
-        }
-
-        logger.info("Loaded {} IP ranges", ranges.size());
-    }
-
-    public String getCountry(InetAddress address) {
-        byte[] bytes = address.getAddress();
-        long ival = ((long)bytes[0]&0xFF) << 24 | ((long)bytes[1]&0xFF) << 16 | ((long)bytes[2]&0xFF)<< 8 | ((long)bytes[3]&0xFF);
-
-        Long key = ranges.floorKey(ival);
-        if (null == key) {
-            return "-";
-        }
-
-        var range = ranges.get(key);
-        if (ival >= key && ival < range.to) {
-            return range.country;
-        }
-
-        return "-";
+    @Inject
+    public GeoIpBlocklist(GeoIpDictionary ipDictionary) {
+        this.ipDictionary = ipDictionary;
+        ipDictionary.waitReady();
     }

     public boolean isAllowed(EdgeDomain domain) {

@@ -85,7 +43,7 @@ public class GeoIpBlocklist {

     public String getCountry(EdgeDomain domain) {
         try {
-            return getCountry(InetAddressCache.getAddress(domain));
+            return ipDictionary.getCountry(InetAddressCache.getAddress(domain));
         }
         catch (Throwable ex) {
             logger.debug("Failed to resolve {}", domain);

@@ -11,7 +11,7 @@ import java.util.concurrent.TimeUnit;
 // We don't want to torture the DNS by resolving the same links over and over and over again

 public class InetAddressCache {
-    private static final Cache<EdgeDomain, InetAddress> cache = CacheBuilder.newBuilder().maximumSize(10_000_000).expireAfterAccess(1, TimeUnit.HOURS).build();
+    private static final Cache<EdgeDomain, InetAddress> cache = CacheBuilder.newBuilder().maximumSize(1_000_000).expireAfterAccess(1, TimeUnit.HOURS).build();
     public static InetAddress getAddress(EdgeDomain domain) throws Throwable {
         try {
             return cache.get(domain, ()-> InetAddress.getByName(domain.getAddress()));
code/libraries/geo-ip/build.gradle (new file, 24 lines)
@@ -0,0 +1,24 @@
+plugins {
+    id 'java'
+}
+
+java {
+    toolchain {
+        languageVersion.set(JavaLanguageVersion.of(21))
+    }
+}
+
+dependencies {
+    implementation project(':code:common:config')
+
+    implementation libs.bundles.slf4j
+    implementation libs.opencsv
+
+    testImplementation libs.bundles.slf4j.test
+    testImplementation libs.bundles.junit
+    testImplementation libs.mockito
+}
+
+test {
+    useJUnitPlatform()
+}
code/libraries/geo-ip/readme.md (new file, 6 lines)
@@ -0,0 +1,6 @@
+This micro library handles the GeoIP lookups, mappings from IP addresses
+to country codes.
+
+It uses the free ip2location lite database, which is
+available from [https://lite.ip2location.com/database/ip-country](https://lite.ip2location.com/database/ip-country)
+under a CC-BY-SA 4.0 license.
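For context on the library's lookup: it is a sorted-range search over the ip2location rows, keyed by each range's first address. The sketch below mirrors the logic visible in the removed GeoIpBlocklist.getCountry above and in the GeoIpDictionary hunks below; the class name is illustrative.

```java
import java.net.InetAddress;
import java.util.TreeMap;

class RangeLookupSketch {
    record IpRange(long from, long to, String country) {}

    // Keyed by each range's first address, loaded from the ip2location CSV.
    private final TreeMap<Long, IpRange> ranges = new TreeMap<>();

    String getCountry(InetAddress address) {
        byte[] b = address.getAddress();

        // Pack the four IPv4 octets into an unsigned 32-bit value held in a long
        long ival = ((long) b[0] & 0xFF) << 24 | ((long) b[1] & 0xFF) << 16
                  | ((long) b[2] & 0xFF) << 8  | ((long) b[3] & 0xFF);

        // floorKey finds the greatest range start <= ival in O(log n)
        Long key = ranges.floorKey(ival);
        if (key == null)
            return "-";

        var range = ranges.get(key);
        if (ival >= range.from && ival < range.to)
            return range.country;

        return "-";
    }
}
```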
@@ -1,7 +1,6 @@
-package nu.marginalia.assistant.domains;
+package nu.marginalia.geoip;

 import com.opencsv.CSVReader;
-import lombok.AllArgsConstructor;
 import nu.marginalia.WmsaHome;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

@@ -14,12 +13,7 @@ public class GeoIpDictionary {
     private volatile TreeMap<Long, IpRange> ranges = null;
     private static final Logger logger = LoggerFactory.getLogger(GeoIpDictionary.class);

-    @AllArgsConstructor
-    static class IpRange {
-        public final long from;
-        public final long to;
-        public final String country;
-    }
+    record IpRange(long from, long to, String country) {}

     public GeoIpDictionary() {
         Thread.ofPlatform().start(() -> {

@@ -39,10 +33,28 @@ public class GeoIpDictionary {
                 ranges = dict;
                 logger.info("Loaded {} IP ranges", ranges.size());
             } catch (Exception e) {
+                ranges = new TreeMap<>();
                 throw new RuntimeException(e);
             }
+            finally {
+                this.notifyAll();
+            }
         });
+    }
+
+    public boolean isReady() {
+        return null != ranges;
+    }
+
+    public boolean waitReady() {
+        while (null == ranges) {
+            try {
+                this.wait();
+            } catch (InterruptedException e) {
+                return false;
+            }
+        }
+        return true;
     }

     public String getCountry(String ip) {
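The dictionary loads its data on a background platform thread, so consumers need to wait for readiness before querying; both GeoIpBlocklist and DomainProcessor call waitReady() elsewhere in this commit. A sketch of the intended call pattern, using a documentation-range IP as a stand-in:

```java
GeoIpDictionary dictionary = new GeoIpDictionary(); // kicks off the background load

// Block until the ranges are loaded; waitReady() returns false only if the
// waiting thread is interrupted before the data arrives.
if (dictionary.waitReady()) {
    String country = dictionary.getCountry("203.0.113.7"); // illustrative TEST-NET address
}
```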
@@ -8,7 +8,6 @@ import org.apache.parquet.schema.*;
 import org.jetbrains.annotations.NotNull;
 import org.jetbrains.annotations.Nullable;

-import java.sql.Array;
 import java.util.ArrayList;
 import java.util.List;

@@ -1,5 +1,14 @@
 package nu.marginalia.model.processed;

+import lombok.AllArgsConstructor;
+import lombok.EqualsAndHashCode;
+import lombok.NoArgsConstructor;
+import lombok.ToString;
+
+@AllArgsConstructor
+@NoArgsConstructor
+@EqualsAndHashCode
+@ToString
 public class DomainWithIp {
     public String domain;
     public String ip;

@@ -41,6 +41,7 @@ dependencies {

     implementation project(':code:libraries:guarded-regex')
     implementation project(':code:libraries:easy-lsh')
+    implementation project(':code:libraries:geo-ip')
     implementation project(':code:libraries:big-string')
     implementation project(':code:libraries:language-processing')

@@ -10,6 +10,7 @@ import nu.marginalia.converting.model.ProcessedDocument;
 import nu.marginalia.converting.processor.logic.links.LinkGraph;
 import nu.marginalia.crawling.io.SerializableCrawlDataStream;
 import nu.marginalia.crawling.model.*;
+import nu.marginalia.geoip.GeoIpDictionary;
 import nu.marginalia.model.crawl.DomainIndexingState;
 import nu.marginalia.converting.model.ProcessedDomain;
 import nu.marginalia.model.EdgeDomain;

@@ -30,6 +31,7 @@ public class DomainProcessor {
     private final AnchorTagsSource anchorTagsSource;
     private final AnchorTextKeywords anchorTextKeywords;
     private final LshDocumentDeduplicator documentDeduplicator;
+    private final GeoIpDictionary geoIpDictionary;

     private final Logger logger = LoggerFactory.getLogger(getClass());

@@ -38,17 +40,21 @@ public class DomainProcessor {
                            SiteWords siteWords,
                            AnchorTagsSourceFactory anchorTagsSourceFactory,
                            AnchorTextKeywords anchorTextKeywords,
-                           LshDocumentDeduplicator documentDeduplicator) throws SQLException
+                           LshDocumentDeduplicator documentDeduplicator, GeoIpDictionary geoIpDictionary) throws SQLException
     {
         this.documentProcessor = documentProcessor;
         this.siteWords = siteWords;
         this.anchorTextKeywords = anchorTextKeywords;
         this.documentDeduplicator = documentDeduplicator;
         this.anchorTagsSource = anchorTagsSourceFactory.create();
+        this.geoIpDictionary = geoIpDictionary;

     }

     @SneakyThrows
     public ProcessedDomain process(SerializableCrawlDataStream dataStream) {
+        geoIpDictionary.waitReady();
+
         var ret = new ProcessedDomain();
         List<ProcessedDocument> docs = new ArrayList<>();

@@ -107,7 +113,14 @@ public class DomainProcessor {
         // Add late keywords and features from domain-level information

         List<String> terms = new ArrayList<>();

         terms.add("ip:"+ip);
+
+        String geoIp = geoIpDictionary.getCountry(ip);
+        if (!geoIp.isBlank()) {
+            terms.add("geoip:"+geoIp.toLowerCase());
+        }

         if (cookies) {
             terms.add(HtmlFeature.COOKIES.getKeyword());
         }
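The practical effect of the DomainProcessor change above, shown as a hypothetical example; the IP and country are made up, while the keyword format is taken from the hunk:

```java
import java.util.ArrayList;
import java.util.List;

class GeoIpKeywordExample {
    public static void main(String[] args) {
        // For a page hosted at an IP the dictionary maps to Sweden ("SE"),
        // the domain-level terms now carry both of these keywords:
        List<String> terms = new ArrayList<>();
        terms.add("ip:203.0.113.7"); // pre-existing keyword
        terms.add("geoip:se");       // new: lowercased country code
        System.out.println(terms);
        // A search can then be limited to Swedish-hosted servers by
        // including the term "geoip:se" in the query.
    }
}
```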
@@ -6,6 +6,7 @@ import lombok.SneakyThrows;
 import nu.marginalia.WmsaHome;
 import nu.marginalia.converting.processor.DomainProcessor;
 import nu.marginalia.crawl.retreival.CrawlerRetreiver;
+import nu.marginalia.crawl.retreival.DomainProber;
 import nu.marginalia.crawl.retreival.fetcher.HttpFetcher;
 import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl;
 import nu.marginalia.crawling.io.SerializableCrawlDataStream;

@@ -75,7 +76,7 @@ public class CrawlingThenConvertingIntegrationTest {
     private CrawledDomain crawl(CrawlSpecRecord specs) {
         List<SerializableCrawlData> data = new ArrayList<>();

-        new CrawlerRetreiver(httpFetcher, specs, data::add).fetch();
+        new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, data::add).fetch();

         CrawledDomain domain = data.stream().filter(CrawledDomain.class::isInstance).map(CrawledDomain.class::cast).findFirst().get();
         data.stream().filter(CrawledDocument.class::isInstance).map(CrawledDocument.class::cast).forEach(domain.doc::add);

@@ -11,6 +11,7 @@ import nu.marginalia.WmsaHome;
 import nu.marginalia.atags.source.AnchorTagsSource;
 import nu.marginalia.atags.source.AnchorTagsSourceFactory;
 import nu.marginalia.crawl.retreival.CrawlDataReference;
+import nu.marginalia.crawl.retreival.DomainProber;
 import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl;
 import nu.marginalia.crawl.spec.CrawlSpecProvider;
 import nu.marginalia.crawl.spec.DbCrawlSpecProvider;

@@ -56,6 +57,7 @@ public class CrawlerMain {

     private final UserAgent userAgent;
     private final MessageQueueFactory messageQueueFactory;
+    private final DomainProber domainProber;
     private final FileStorageService fileStorageService;
     private final DbCrawlSpecProvider dbCrawlSpecProvider;
     private final AnchorTagsSourceFactory anchorTagsSourceFactory;

@@ -75,7 +77,7 @@ public class CrawlerMain {
     @Inject
     public CrawlerMain(UserAgent userAgent,
                        ProcessHeartbeatImpl heartbeat,
-                       MessageQueueFactory messageQueueFactory,
+                       MessageQueueFactory messageQueueFactory, DomainProber domainProber,
                        FileStorageService fileStorageService,
                        ProcessConfiguration processConfiguration,
                        DbCrawlSpecProvider dbCrawlSpecProvider,

@@ -84,6 +86,7 @@ public class CrawlerMain {
         this.heartbeat = heartbeat;
         this.userAgent = userAgent;
         this.messageQueueFactory = messageQueueFactory;
+        this.domainProber = domainProber;
         this.fileStorageService = fileStorageService;
         this.dbCrawlSpecProvider = dbCrawlSpecProvider;
         this.anchorTagsSourceFactory = anchorTagsSourceFactory;

@@ -219,7 +222,7 @@ public class CrawlerMain {

                 var domainLinks = anchorTagsSource.getAnchorTags(domain);

-                var retreiver = new CrawlerRetreiver(fetcher, specification, writer::accept);
+                var retreiver = new CrawlerRetreiver(fetcher, domainProber, specification, writer::accept);
                 int size = retreiver.fetch(domainLinks, reference);

                 workLog.setJobToFinished(domain, writer.getOutputFile().toString(), size);

@@ -42,7 +42,7 @@ public class CrawlerRetreiver {
     private static final UrlBlocklist urlBlocklist = new UrlBlocklist();
     private static final LinkFilterSelector linkFilterSelector = new LinkFilterSelector();

-    private static final DomainProber domainProber = new DomainProber();
+    private final DomainProber domainProber;
     private final SitemapRetriever sitemapRetriever;
     private final DomainCrawlFrontier crawlFrontier;

@@ -55,9 +55,11 @@ public class CrawlerRetreiver {
     private static final String documentWasSameTag = "SAME-BY-COMPARISON";

     public CrawlerRetreiver(HttpFetcher fetcher,
+                            DomainProber domainProber,
                             CrawlSpecRecord specs,
                             Consumer<SerializableCrawlData> writer) {
         this.fetcher = fetcher;
+        this.domainProber = domainProber;
+
         domain = specs.domain;

@@ -1,5 +1,7 @@
 package nu.marginalia.crawl.retreival;

+import com.google.inject.Inject;
+import com.google.inject.Singleton;
 import nu.marginalia.crawl.retreival.fetcher.FetchResultState;
 import nu.marginalia.crawl.retreival.fetcher.HttpFetcher;
 import nu.marginalia.crawling.model.CrawlerDomainStatus;

@@ -11,17 +13,21 @@ import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

 import javax.annotation.Nullable;
+import java.util.function.Predicate;

+@Singleton
 public class DomainProber {
     private final Logger logger = LoggerFactory.getLogger(DomainProber.class);
-    private static IpBlockList ipBlockList;
+    private final Predicate<EdgeDomain> domainBlacklist;

-    static {
-        try {
-            ipBlockList = new IpBlockList(new GeoIpBlocklist());
-        } catch (Exception e) {
-            throw new RuntimeException(e);
-        }
+    @Inject
+    public DomainProber(IpBlockList ipBlockList) {
+        this.domainBlacklist = ipBlockList::isAllowed;
+    }
+
+    /** For testing */
+    public DomainProber(Predicate<EdgeDomain> domainBlacklist) {
+        this.domainBlacklist = domainBlacklist;
     }

     /** To detect problems early we do a probing request to the domain before we start crawling it properly.
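A note on the DomainProber change above: swapping the static IpBlockList initializer for injected state gives the crawler tests a cheap seam, since a plain predicate can stand in for the blocklist without loading any GeoIP data. This is exactly how the test hunks below construct it:

```java
// Production: Guice injects DomainProber(IpBlockList), which delegates to the GeoIP blocklist.
// Tests: a predicate that admits every domain stands in for the blocklist.
DomainProber prober = new DomainProber(d -> true);
```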
@@ -37,7 +43,7 @@ public class DomainProber {
             return new ProbeResultError(CrawlerDomainStatus.ERROR, "No known URLs");
         }

-        if (!ipBlockList.isAllowed(firstUrlInQueue.domain))
+        if (!domainBlacklist.test(firstUrlInQueue.domain))
             return new ProbeResultError(CrawlerDomainStatus.BLOCKED, "IP not allowed");

         var fetchResult = fetcher.probeDomain(firstUrlInQueue.withPathAndParam("/", null));

@@ -62,7 +68,7 @@ public class DomainProber {
     /** This domain redirects to another domain */
     public record ProbeResultRedirect(EdgeDomain domain) implements ProbeResult {}

-    /** If the retreivala of the probed url was successful, return the url as it was fetched
+    /** If the retrieval of the probed url was successful, return the url as it was fetched
     * (which may be different from the url we probed, if we attempted another URL schema).
     *
     * @param probedUrl The url we successfully probed

@@ -15,7 +15,6 @@ import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.crawl.retreival.logic.ContentTypeLogic;
 import nu.marginalia.crawl.retreival.logic.ContentTypeParser;
 import okhttp3.*;
-import org.apache.commons.collections4.queue.PredicatedQueue;
 import org.apache.commons.io.input.BOMInputStream;
 import org.apache.commons.lang3.StringUtils;
 import org.slf4j.Logger;

@@ -87,7 +86,10 @@ public class HttpFetcherImpl implements HttpFetcher {
     }

     @Inject
-    public HttpFetcherImpl(@Named("user-agent") String userAgent, Dispatcher dispatcher, ConnectionPool connectionPool) {
+    public HttpFetcherImpl(@Named("user-agent") String userAgent,
+                           Dispatcher dispatcher,
+                           ConnectionPool connectionPool)
+    {
         this.client = createClient(dispatcher, connectionPool);
         this.userAgent = userAgent;
     }

@@ -3,6 +3,7 @@ package nu.marginalia.crawling.retreival;
 import crawlercommons.robots.SimpleRobotRules;
 import lombok.SneakyThrows;
 import nu.marginalia.crawl.retreival.CrawlerRetreiver;
+import nu.marginalia.crawl.retreival.DomainProber;
 import nu.marginalia.crawl.retreival.fetcher.*;
 import nu.marginalia.crawling.model.CrawledDocument;
 import nu.marginalia.crawling.model.CrawlerDocumentStatus;

@@ -68,7 +69,7 @@ public class CrawlerMockFetcherTest {
         registerUrlClasspathData(new EdgeUrl("https://startrek.website/c/startrek"), "mock-crawl-data/lemmy/c_startrek.html");
         registerUrlClasspathData(new EdgeUrl("https://startrek.website/post/108995"), "mock-crawl-data/lemmy/108995.html");

-        new CrawlerRetreiver(fetcherMock, new CrawlSpecRecord("startrek.website", 10, new ArrayList<>()), out::add)
+        new CrawlerRetreiver(fetcherMock, new DomainProber(d -> true), new CrawlSpecRecord("startrek.website", 10, new ArrayList<>()), out::add)
                 .fetch();

         out.forEach(System.out::println);

@@ -80,7 +81,7 @@ public class CrawlerMockFetcherTest {

         registerUrlClasspathData(new EdgeUrl("https://en.wikipedia.org/"), "mock-crawl-data/mediawiki/index.html");

-        new CrawlerRetreiver(fetcherMock, new CrawlSpecRecord("en.wikipedia.org", 10, new ArrayList<>()), out::add)
+        new CrawlerRetreiver(fetcherMock, new DomainProber(d -> true), new CrawlSpecRecord("en.wikipedia.org", 10, new ArrayList<>()), out::add)
                 .fetch();

         out.forEach(System.out::println);

@@ -94,7 +95,7 @@ public class CrawlerMockFetcherTest {
         registerUrlClasspathData(new EdgeUrl("https://community.tt-rss.org/t/telegram-channel-to-idle-on/3501"), "mock-crawl-data/discourse/telegram.html");
         registerUrlClasspathData(new EdgeUrl("https://community.tt-rss.org/t/combined-mode-but-grid/4489"), "mock-crawl-data/discourse/grid.html");

-        new CrawlerRetreiver(fetcherMock, new CrawlSpecRecord("community.tt-rss.org", 100, new ArrayList<>()), out::add)
+        new CrawlerRetreiver(fetcherMock, new DomainProber(d -> true), new CrawlSpecRecord("community.tt-rss.org", 100, new ArrayList<>()), out::add)
                 .fetch();

         out.forEach(System.out::println);

@@ -5,6 +5,7 @@ import nu.marginalia.WmsaHome;
 import nu.marginalia.atags.model.DomainLinks;
 import nu.marginalia.crawl.retreival.CrawlDataReference;
 import nu.marginalia.crawl.retreival.CrawlerRetreiver;
+import nu.marginalia.crawl.retreival.DomainProber;
 import nu.marginalia.crawl.retreival.fetcher.HttpFetcher;
 import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl;
 import nu.marginalia.crawling.io.CrawledDomainReader;

@@ -53,7 +54,7 @@ class CrawlerRetreiverTest {

         List<SerializableCrawlData> data = new ArrayList<>();

-        new CrawlerRetreiver(httpFetcher, specs, data::add).fetch();
+        new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, data::add).fetch();

         var fetchedUrls =
                 data.stream().filter(CrawledDocument.class::isInstance)

@@ -82,7 +83,7 @@ class CrawlerRetreiverTest {

         List<SerializableCrawlData> data = new ArrayList<>();

-        new CrawlerRetreiver(httpFetcher, specs, data::add).fetch();
+        new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, data::add).fetch();

         data.stream().filter(CrawledDocument.class::isInstance)
                 .map(CrawledDocument.class::cast)

@@ -118,7 +119,7 @@ class CrawlerRetreiverTest {
         var writer = new CrawledDomainWriter(out, specs.domain, "idid");
         Map<Class<? extends SerializableCrawlData>, List<SerializableCrawlData>> data = new HashMap<>();

-        new CrawlerRetreiver(httpFetcher, specs, d -> {
+        new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, d -> {
             data.computeIfAbsent(d.getClass(), k->new ArrayList<>()).add(d);
             if (d instanceof CrawledDocument doc) {
                 System.out.println(doc.url + ": " + doc.recrawlState + "\t" + doc.httpStatus);

@@ -136,7 +137,7 @@ class CrawlerRetreiverTest {
         CrawledDomain domain = (CrawledDomain) data.get(CrawledDomain.class).get(0);
         domain.doc = data.get(CrawledDocument.class).stream().map(CrawledDocument.class::cast).collect(Collectors.toList());

-        new CrawlerRetreiver(httpFetcher, specs, d -> {
+        new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, d -> {
             if (d instanceof CrawledDocument doc) {
                 System.out.println(doc.url + ": " + doc.recrawlState + "\t" + doc.httpStatus);
             }

@@ -99,7 +99,6 @@
 </section>
 <section id="legal">
     <h1>Policies</h1>
-
     This website complies with the GDPR by <em>not collecting any personal
     information</em>, and with the EU Cookie Directive by <em>not using
     cookies</em>. <a href="https://memex.marginalia.nu/projects/edge/privacy.gmi">More Information</a>.

@@ -109,8 +108,13 @@
     <h1> Open Source </h1>
     The search engine is open source with an AGPL license. The sources can be perused at
     <tt><a href="https://git.marginalia.nu/">https://git.marginalia.nu/</a></tt>.
+    <h1>Data Sources</h1>
+    IP geolocation is sourced from the IP2Location LITE data available from
+    <a rel="external noopener nofollow" href="https://lite.ip2location.com/">https://lite.ip2location.com/</a>
+    under
+    <a rel="external noopener nofollow" href="https://creativecommons.org/licenses/by-sa/4.0/">CC-BY-SA 4.0</a>.
 </section>

 </footer>

 <script src="/tts.js"></script>

@@ -33,6 +33,7 @@ dependencies {

     implementation project(':code:features-search:screenshots')

+    implementation project(':code:libraries:geo-ip')
     implementation project(':code:libraries:language-processing')
     implementation project(':code:libraries:term-frequency-dict')

@@ -1,9 +1,8 @@
 package nu.marginalia.assistant.domains;

 import com.zaxxer.hikari.HikariDataSource;
-import lombok.SneakyThrows;
+import nu.marginalia.geoip.GeoIpDictionary;
 import nu.marginalia.model.EdgeDomain;
-import nu.marginalia.model.crawl.DomainIndexingState;
 import nu.marginalia.db.DbDomainQueries;
 import nu.marginalia.assistant.client.model.DomainInformation;
 import org.slf4j.Logger;

@@ -12,6 +12,7 @@ include 'code:services-application:dating-service'
 include 'code:services-application:explorer-service'

 include 'code:libraries:array'
+include 'code:libraries:geo-ip'
 include 'code:libraries:btree'
 include 'code:libraries:easy-lsh'
 include 'code:libraries:guarded-regex'