(*) Refactor GeoIP-related code
In this commit, GeoIP-related classes are refactored and relocated to a common library as they are shared across multiple services. The crawler is refactored to enable the GeoIpBlocklist to use the new GeoIpDictionary as the base of its decisions. The converter is modified to query this data to add a geoip:-keyword to documents to permit limiting a search to the country of the hosting server. The commit also adds due CC-BY-SA attribution in the search engine footer for the source of the IP geolocation data.
This commit is contained in:
parent
84b4158555
commit
f655ec5a5c
@ -15,6 +15,7 @@ dependencies {
|
||||
implementation project(':code:common:model')
|
||||
implementation project(':code:common:config')
|
||||
implementation project(':code:libraries:guarded-regex')
|
||||
implementation project(':code:libraries:geo-ip')
|
||||
|
||||
implementation libs.notnull
|
||||
|
||||
|
@ -1,73 +1,31 @@
|
||||
package nu.marginalia.ip_blocklist;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import com.opencsv.CSVReader;
|
||||
import com.opencsv.exceptions.CsvValidationException;
|
||||
import lombok.AllArgsConstructor;
|
||||
import nu.marginalia.WmsaHome;
|
||||
import nu.marginalia.geoip.GeoIpDictionary;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.FileReader;
|
||||
import java.io.IOException;
|
||||
import java.net.InetAddress;
|
||||
import java.util.Set;
|
||||
import java.util.TreeMap;
|
||||
|
||||
@Singleton
|
||||
public class GeoIpBlocklist {
|
||||
private final TreeMap<Long, GeoIpBlocklist.IpRange> ranges = new TreeMap<>();
|
||||
|
||||
/** These countries are extremely overrepresented among the problematic and spammy domains,
|
||||
* and blocking them is by far the most effective spam mitigation technique. Sucks we throw
|
||||
* babies out with the bathwater, but it's undeniably effective.
|
||||
*/
|
||||
private final Set<String> blacklist = Set.of("CN", "HK");
|
||||
private final Set<String> graylist = Set.of("RU", "TW", "IN", "ZA", "SG", "UA");
|
||||
|
||||
private static final Logger logger = LoggerFactory.getLogger(GeoIpBlocklist.class);
|
||||
|
||||
@AllArgsConstructor
|
||||
static class IpRange {
|
||||
public final long from;
|
||||
public final long to;
|
||||
public final String country;
|
||||
}
|
||||
private final GeoIpDictionary ipDictionary;
|
||||
|
||||
public GeoIpBlocklist() throws IOException, CsvValidationException {
|
||||
var resource = WmsaHome.getIPLocationDatabse();
|
||||
|
||||
try (var reader = new CSVReader(new FileReader(resource.toFile()))) {
|
||||
for (;;) {
|
||||
String[] vals = reader.readNext();
|
||||
if (vals == null) {
|
||||
break;
|
||||
}
|
||||
if (!(blacklist.contains(vals[2]) || graylist.contains(vals[2]))) {
|
||||
continue;
|
||||
}
|
||||
var range = new GeoIpBlocklist.IpRange(Long.parseLong(vals[0]),
|
||||
Long.parseLong(vals[1]),
|
||||
vals[2]);
|
||||
ranges.put(range.from, range);
|
||||
}
|
||||
}
|
||||
|
||||
logger.info("Loaded {} IP ranges", ranges.size());
|
||||
}
|
||||
|
||||
public String getCountry(InetAddress address) {
|
||||
byte[] bytes = address.getAddress();
|
||||
long ival = ((long)bytes[0]&0xFF) << 24 | ((long)bytes[1]&0xFF) << 16 | ((long)bytes[2]&0xFF)<< 8 | ((long)bytes[3]&0xFF);
|
||||
|
||||
Long key = ranges.floorKey(ival);
|
||||
if (null == key) {
|
||||
return "-";
|
||||
}
|
||||
|
||||
var range = ranges.get(key);
|
||||
if (ival >= key && ival < range.to) {
|
||||
return range.country;
|
||||
}
|
||||
|
||||
return "-";
|
||||
@Inject
|
||||
public GeoIpBlocklist(GeoIpDictionary ipDictionary) {
|
||||
this.ipDictionary = ipDictionary;
|
||||
ipDictionary.waitReady();
|
||||
}
|
||||
|
||||
public boolean isAllowed(EdgeDomain domain) {
|
||||
@ -85,7 +43,7 @@ public class GeoIpBlocklist {
|
||||
|
||||
public String getCountry(EdgeDomain domain) {
|
||||
try {
|
||||
return getCountry(InetAddressCache.getAddress(domain));
|
||||
return ipDictionary.getCountry(InetAddressCache.getAddress(domain));
|
||||
}
|
||||
catch (Throwable ex) {
|
||||
logger.debug("Failed to resolve {}", domain);
|
||||
|
@ -11,7 +11,7 @@ import java.util.concurrent.TimeUnit;
|
||||
// We don't want to torture the DNS by resolving the same links over and over and over again
|
||||
|
||||
public class InetAddressCache {
|
||||
private static final Cache<EdgeDomain, InetAddress> cache = CacheBuilder.newBuilder().maximumSize(10_000_000).expireAfterAccess(1, TimeUnit.HOURS).build();
|
||||
private static final Cache<EdgeDomain, InetAddress> cache = CacheBuilder.newBuilder().maximumSize(1_000_000).expireAfterAccess(1, TimeUnit.HOURS).build();
|
||||
public static InetAddress getAddress(EdgeDomain domain) throws Throwable {
|
||||
try {
|
||||
return cache.get(domain, ()-> InetAddress.getByName(domain.getAddress()));
|
||||
|
24
code/libraries/geo-ip/build.gradle
Normal file
24
code/libraries/geo-ip/build.gradle
Normal file
@ -0,0 +1,24 @@
|
||||
plugins {
|
||||
id 'java'
|
||||
}
|
||||
|
||||
java {
|
||||
toolchain {
|
||||
languageVersion.set(JavaLanguageVersion.of(21))
|
||||
}
|
||||
}
|
||||
|
||||
dependencies {
|
||||
implementation project(':code:common:config')
|
||||
|
||||
implementation libs.bundles.slf4j
|
||||
implementation libs.opencsv
|
||||
|
||||
testImplementation libs.bundles.slf4j.test
|
||||
testImplementation libs.bundles.junit
|
||||
testImplementation libs.mockito
|
||||
}
|
||||
|
||||
test {
|
||||
useJUnitPlatform()
|
||||
}
|
6
code/libraries/geo-ip/readme.md
Normal file
6
code/libraries/geo-ip/readme.md
Normal file
@ -0,0 +1,6 @@
|
||||
This micro library handles GeoIP lookups, mapping IP addresses
|
||||
to country codes.
|
||||
|
||||
It uses the free ip2location lite database, which is
|
||||
available from [https://lite.ip2location.com/database/ip-country](https://lite.ip2location.com/database/ip-country)
|
||||
under a CC-BY-SA 4.0 license.
|
@ -1,7 +1,6 @@
|
||||
package nu.marginalia.assistant.domains;
|
||||
package nu.marginalia.geoip;
|
||||
|
||||
import com.opencsv.CSVReader;
|
||||
import lombok.AllArgsConstructor;
|
||||
import nu.marginalia.WmsaHome;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
@ -14,12 +13,7 @@ public class GeoIpDictionary {
|
||||
private volatile TreeMap<Long, IpRange> ranges = null;
|
||||
private static final Logger logger = LoggerFactory.getLogger(GeoIpDictionary.class);
|
||||
|
||||
@AllArgsConstructor
|
||||
static class IpRange {
|
||||
public final long from;
|
||||
public final long to;
|
||||
public final String country;
|
||||
}
|
||||
record IpRange(long from, long to, String country) {}
|
||||
|
||||
public GeoIpDictionary() {
|
||||
Thread.ofPlatform().start(() -> {
|
||||
@ -39,10 +33,28 @@ public class GeoIpDictionary {
|
||||
ranges = dict;
|
||||
logger.info("Loaded {} IP ranges", ranges.size());
|
||||
} catch (Exception e) {
|
||||
ranges = new TreeMap<>();
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
finally {
|
||||
this.notifyAll();
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
public boolean isReady() {
|
||||
return null != ranges;
|
||||
}
|
||||
|
||||
public boolean waitReady() {
|
||||
while (null == ranges) {
|
||||
try {
|
||||
this.wait();
|
||||
} catch (InterruptedException e) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
public String getCountry(String ip) {
|
@ -8,7 +8,6 @@ import org.apache.parquet.schema.*;
|
||||
import org.jetbrains.annotations.NotNull;
|
||||
import org.jetbrains.annotations.Nullable;
|
||||
|
||||
import java.sql.Array;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
|
@ -1,5 +1,14 @@
|
||||
package nu.marginalia.model.processed;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.NoArgsConstructor;
|
||||
import lombok.ToString;
|
||||
|
||||
@AllArgsConstructor
|
||||
@NoArgsConstructor
|
||||
@EqualsAndHashCode
|
||||
@ToString
|
||||
public class DomainWithIp {
|
||||
public String domain;
|
||||
public String ip;
|
||||
|
@ -41,6 +41,7 @@ dependencies {
|
||||
|
||||
implementation project(':code:libraries:guarded-regex')
|
||||
implementation project(':code:libraries:easy-lsh')
|
||||
implementation project(':code:libraries:geo-ip')
|
||||
implementation project(':code:libraries:big-string')
|
||||
implementation project(':code:libraries:language-processing')
|
||||
|
||||
|
@ -10,6 +10,7 @@ import nu.marginalia.converting.model.ProcessedDocument;
|
||||
import nu.marginalia.converting.processor.logic.links.LinkGraph;
|
||||
import nu.marginalia.crawling.io.SerializableCrawlDataStream;
|
||||
import nu.marginalia.crawling.model.*;
|
||||
import nu.marginalia.geoip.GeoIpDictionary;
|
||||
import nu.marginalia.model.crawl.DomainIndexingState;
|
||||
import nu.marginalia.converting.model.ProcessedDomain;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
@ -30,6 +31,7 @@ public class DomainProcessor {
|
||||
private final AnchorTagsSource anchorTagsSource;
|
||||
private final AnchorTextKeywords anchorTextKeywords;
|
||||
private final LshDocumentDeduplicator documentDeduplicator;
|
||||
private final GeoIpDictionary geoIpDictionary;
|
||||
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
|
||||
@ -38,17 +40,21 @@ public class DomainProcessor {
|
||||
SiteWords siteWords,
|
||||
AnchorTagsSourceFactory anchorTagsSourceFactory,
|
||||
AnchorTextKeywords anchorTextKeywords,
|
||||
LshDocumentDeduplicator documentDeduplicator) throws SQLException
|
||||
LshDocumentDeduplicator documentDeduplicator, GeoIpDictionary geoIpDictionary) throws SQLException
|
||||
{
|
||||
this.documentProcessor = documentProcessor;
|
||||
this.siteWords = siteWords;
|
||||
this.anchorTextKeywords = anchorTextKeywords;
|
||||
this.documentDeduplicator = documentDeduplicator;
|
||||
this.anchorTagsSource = anchorTagsSourceFactory.create();
|
||||
this.geoIpDictionary = geoIpDictionary;
|
||||
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
public ProcessedDomain process(SerializableCrawlDataStream dataStream) {
|
||||
geoIpDictionary.waitReady();
|
||||
|
||||
var ret = new ProcessedDomain();
|
||||
List<ProcessedDocument> docs = new ArrayList<>();
|
||||
|
||||
@ -107,7 +113,14 @@ public class DomainProcessor {
|
||||
// Add late keywords and features from domain-level information
|
||||
|
||||
List<String> terms = new ArrayList<>();
|
||||
|
||||
terms.add("ip:"+ip);
|
||||
|
||||
String geoIp = geoIpDictionary.getCountry(ip);
|
||||
if (!geoIp.isBlank()) {
|
||||
terms.add("geoip:"+geoIp.toLowerCase());
|
||||
}
|
||||
|
||||
if (cookies) {
|
||||
terms.add(HtmlFeature.COOKIES.getKeyword());
|
||||
}
|
||||
|
@ -6,6 +6,7 @@ import lombok.SneakyThrows;
|
||||
import nu.marginalia.WmsaHome;
|
||||
import nu.marginalia.converting.processor.DomainProcessor;
|
||||
import nu.marginalia.crawl.retreival.CrawlerRetreiver;
|
||||
import nu.marginalia.crawl.retreival.DomainProber;
|
||||
import nu.marginalia.crawl.retreival.fetcher.HttpFetcher;
|
||||
import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl;
|
||||
import nu.marginalia.crawling.io.SerializableCrawlDataStream;
|
||||
@ -75,7 +76,7 @@ public class CrawlingThenConvertingIntegrationTest {
|
||||
private CrawledDomain crawl(CrawlSpecRecord specs) {
|
||||
List<SerializableCrawlData> data = new ArrayList<>();
|
||||
|
||||
new CrawlerRetreiver(httpFetcher, specs, data::add).fetch();
|
||||
new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, data::add).fetch();
|
||||
|
||||
CrawledDomain domain = data.stream().filter(CrawledDomain.class::isInstance).map(CrawledDomain.class::cast).findFirst().get();
|
||||
data.stream().filter(CrawledDocument.class::isInstance).map(CrawledDocument.class::cast).forEach(domain.doc::add);
|
||||
|
@ -11,6 +11,7 @@ import nu.marginalia.WmsaHome;
|
||||
import nu.marginalia.atags.source.AnchorTagsSource;
|
||||
import nu.marginalia.atags.source.AnchorTagsSourceFactory;
|
||||
import nu.marginalia.crawl.retreival.CrawlDataReference;
|
||||
import nu.marginalia.crawl.retreival.DomainProber;
|
||||
import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl;
|
||||
import nu.marginalia.crawl.spec.CrawlSpecProvider;
|
||||
import nu.marginalia.crawl.spec.DbCrawlSpecProvider;
|
||||
@ -56,6 +57,7 @@ public class CrawlerMain {
|
||||
|
||||
private final UserAgent userAgent;
|
||||
private final MessageQueueFactory messageQueueFactory;
|
||||
private final DomainProber domainProber;
|
||||
private final FileStorageService fileStorageService;
|
||||
private final DbCrawlSpecProvider dbCrawlSpecProvider;
|
||||
private final AnchorTagsSourceFactory anchorTagsSourceFactory;
|
||||
@ -75,7 +77,7 @@ public class CrawlerMain {
|
||||
@Inject
|
||||
public CrawlerMain(UserAgent userAgent,
|
||||
ProcessHeartbeatImpl heartbeat,
|
||||
MessageQueueFactory messageQueueFactory,
|
||||
MessageQueueFactory messageQueueFactory, DomainProber domainProber,
|
||||
FileStorageService fileStorageService,
|
||||
ProcessConfiguration processConfiguration,
|
||||
DbCrawlSpecProvider dbCrawlSpecProvider,
|
||||
@ -84,6 +86,7 @@ public class CrawlerMain {
|
||||
this.heartbeat = heartbeat;
|
||||
this.userAgent = userAgent;
|
||||
this.messageQueueFactory = messageQueueFactory;
|
||||
this.domainProber = domainProber;
|
||||
this.fileStorageService = fileStorageService;
|
||||
this.dbCrawlSpecProvider = dbCrawlSpecProvider;
|
||||
this.anchorTagsSourceFactory = anchorTagsSourceFactory;
|
||||
@ -219,7 +222,7 @@ public class CrawlerMain {
|
||||
|
||||
var domainLinks = anchorTagsSource.getAnchorTags(domain);
|
||||
|
||||
var retreiver = new CrawlerRetreiver(fetcher, specification, writer::accept);
|
||||
var retreiver = new CrawlerRetreiver(fetcher, domainProber, specification, writer::accept);
|
||||
int size = retreiver.fetch(domainLinks, reference);
|
||||
|
||||
workLog.setJobToFinished(domain, writer.getOutputFile().toString(), size);
|
||||
|
@ -42,7 +42,7 @@ public class CrawlerRetreiver {
|
||||
private static final UrlBlocklist urlBlocklist = new UrlBlocklist();
|
||||
private static final LinkFilterSelector linkFilterSelector = new LinkFilterSelector();
|
||||
|
||||
private static final DomainProber domainProber = new DomainProber();
|
||||
private final DomainProber domainProber;
|
||||
private final SitemapRetriever sitemapRetriever;
|
||||
private final DomainCrawlFrontier crawlFrontier;
|
||||
|
||||
@ -55,9 +55,11 @@ public class CrawlerRetreiver {
|
||||
private static final String documentWasSameTag = "SAME-BY-COMPARISON";
|
||||
|
||||
public CrawlerRetreiver(HttpFetcher fetcher,
|
||||
DomainProber domainProber,
|
||||
CrawlSpecRecord specs,
|
||||
Consumer<SerializableCrawlData> writer) {
|
||||
this.fetcher = fetcher;
|
||||
this.domainProber = domainProber;
|
||||
|
||||
domain = specs.domain;
|
||||
|
||||
|
@ -1,5 +1,7 @@
|
||||
package nu.marginalia.crawl.retreival;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import nu.marginalia.crawl.retreival.fetcher.FetchResultState;
|
||||
import nu.marginalia.crawl.retreival.fetcher.HttpFetcher;
|
||||
import nu.marginalia.crawling.model.CrawlerDomainStatus;
|
||||
@ -11,17 +13,21 @@ import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import javax.annotation.Nullable;
|
||||
import java.util.function.Predicate;
|
||||
|
||||
@Singleton
|
||||
public class DomainProber {
|
||||
private final Logger logger = LoggerFactory.getLogger(DomainProber.class);
|
||||
private static IpBlockList ipBlockList;
|
||||
private final Predicate<EdgeDomain> domainBlacklist;
|
||||
|
||||
static {
|
||||
try {
|
||||
ipBlockList = new IpBlockList(new GeoIpBlocklist());
|
||||
} catch (Exception e) {
|
||||
throw new RuntimeException(e);
|
||||
@Inject
|
||||
public DomainProber(IpBlockList ipBlockList) {
|
||||
this.domainBlacklist = ipBlockList::isAllowed;
|
||||
}
|
||||
|
||||
/** For testing */
|
||||
public DomainProber(Predicate<EdgeDomain> domainBlacklist) {
|
||||
this.domainBlacklist = domainBlacklist;
|
||||
}
|
||||
|
||||
/** To detect problems early we do a probing request to the domain before we start crawling it properly.
|
||||
@ -37,7 +43,7 @@ public class DomainProber {
|
||||
return new ProbeResultError(CrawlerDomainStatus.ERROR, "No known URLs");
|
||||
}
|
||||
|
||||
if (!ipBlockList.isAllowed(firstUrlInQueue.domain))
|
||||
if (!domainBlacklist.test(firstUrlInQueue.domain))
|
||||
return new ProbeResultError(CrawlerDomainStatus.BLOCKED, "IP not allowed");
|
||||
|
||||
var fetchResult = fetcher.probeDomain(firstUrlInQueue.withPathAndParam("/", null));
|
||||
@ -62,7 +68,7 @@ public class DomainProber {
|
||||
/** This domain redirects to another domain */
|
||||
public record ProbeResultRedirect(EdgeDomain domain) implements ProbeResult {}
|
||||
|
||||
/** If the retreivala of the probed url was successful, return the url as it was fetched
|
||||
/** If the retrieval of the probed url was successful, return the url as it was fetched
|
||||
* (which may be different from the url we probed, if we attempted another URL schema).
|
||||
*
|
||||
* @param probedUrl The url we successfully probed
|
||||
|
@ -15,7 +15,6 @@ import nu.marginalia.model.EdgeUrl;
|
||||
import nu.marginalia.crawl.retreival.logic.ContentTypeLogic;
|
||||
import nu.marginalia.crawl.retreival.logic.ContentTypeParser;
|
||||
import okhttp3.*;
|
||||
import org.apache.commons.collections4.queue.PredicatedQueue;
|
||||
import org.apache.commons.io.input.BOMInputStream;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.slf4j.Logger;
|
||||
@ -87,7 +86,10 @@ public class HttpFetcherImpl implements HttpFetcher {
|
||||
}
|
||||
|
||||
@Inject
|
||||
public HttpFetcherImpl(@Named("user-agent") String userAgent, Dispatcher dispatcher, ConnectionPool connectionPool) {
|
||||
public HttpFetcherImpl(@Named("user-agent") String userAgent,
|
||||
Dispatcher dispatcher,
|
||||
ConnectionPool connectionPool)
|
||||
{
|
||||
this.client = createClient(dispatcher, connectionPool);
|
||||
this.userAgent = userAgent;
|
||||
}
|
||||
|
@ -3,6 +3,7 @@ package nu.marginalia.crawling.retreival;
|
||||
import crawlercommons.robots.SimpleRobotRules;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.crawl.retreival.CrawlerRetreiver;
|
||||
import nu.marginalia.crawl.retreival.DomainProber;
|
||||
import nu.marginalia.crawl.retreival.fetcher.*;
|
||||
import nu.marginalia.crawling.model.CrawledDocument;
|
||||
import nu.marginalia.crawling.model.CrawlerDocumentStatus;
|
||||
@ -68,7 +69,7 @@ public class CrawlerMockFetcherTest {
|
||||
registerUrlClasspathData(new EdgeUrl("https://startrek.website/c/startrek"), "mock-crawl-data/lemmy/c_startrek.html");
|
||||
registerUrlClasspathData(new EdgeUrl("https://startrek.website/post/108995"), "mock-crawl-data/lemmy/108995.html");
|
||||
|
||||
new CrawlerRetreiver(fetcherMock, new CrawlSpecRecord("startrek.website", 10, new ArrayList<>()), out::add)
|
||||
new CrawlerRetreiver(fetcherMock, new DomainProber(d -> true), new CrawlSpecRecord("startrek.website", 10, new ArrayList<>()), out::add)
|
||||
.fetch();
|
||||
|
||||
out.forEach(System.out::println);
|
||||
@ -80,7 +81,7 @@ public class CrawlerMockFetcherTest {
|
||||
|
||||
registerUrlClasspathData(new EdgeUrl("https://en.wikipedia.org/"), "mock-crawl-data/mediawiki/index.html");
|
||||
|
||||
new CrawlerRetreiver(fetcherMock, new CrawlSpecRecord("en.wikipedia.org", 10, new ArrayList<>()), out::add)
|
||||
new CrawlerRetreiver(fetcherMock, new DomainProber(d -> true), new CrawlSpecRecord("en.wikipedia.org", 10, new ArrayList<>()), out::add)
|
||||
.fetch();
|
||||
|
||||
out.forEach(System.out::println);
|
||||
@ -94,7 +95,7 @@ public class CrawlerMockFetcherTest {
|
||||
registerUrlClasspathData(new EdgeUrl("https://community.tt-rss.org/t/telegram-channel-to-idle-on/3501"), "mock-crawl-data/discourse/telegram.html");
|
||||
registerUrlClasspathData(new EdgeUrl("https://community.tt-rss.org/t/combined-mode-but-grid/4489"), "mock-crawl-data/discourse/grid.html");
|
||||
|
||||
new CrawlerRetreiver(fetcherMock, new CrawlSpecRecord("community.tt-rss.org", 100, new ArrayList<>()), out::add)
|
||||
new CrawlerRetreiver(fetcherMock, new DomainProber(d -> true), new CrawlSpecRecord("community.tt-rss.org", 100, new ArrayList<>()), out::add)
|
||||
.fetch();
|
||||
|
||||
out.forEach(System.out::println);
|
||||
|
@ -5,6 +5,7 @@ import nu.marginalia.WmsaHome;
|
||||
import nu.marginalia.atags.model.DomainLinks;
|
||||
import nu.marginalia.crawl.retreival.CrawlDataReference;
|
||||
import nu.marginalia.crawl.retreival.CrawlerRetreiver;
|
||||
import nu.marginalia.crawl.retreival.DomainProber;
|
||||
import nu.marginalia.crawl.retreival.fetcher.HttpFetcher;
|
||||
import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl;
|
||||
import nu.marginalia.crawling.io.CrawledDomainReader;
|
||||
@ -53,7 +54,7 @@ class CrawlerRetreiverTest {
|
||||
|
||||
List<SerializableCrawlData> data = new ArrayList<>();
|
||||
|
||||
new CrawlerRetreiver(httpFetcher, specs, data::add).fetch();
|
||||
new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, data::add).fetch();
|
||||
|
||||
var fetchedUrls =
|
||||
data.stream().filter(CrawledDocument.class::isInstance)
|
||||
@ -82,7 +83,7 @@ class CrawlerRetreiverTest {
|
||||
|
||||
List<SerializableCrawlData> data = new ArrayList<>();
|
||||
|
||||
new CrawlerRetreiver(httpFetcher, specs, data::add).fetch();
|
||||
new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, data::add).fetch();
|
||||
|
||||
data.stream().filter(CrawledDocument.class::isInstance)
|
||||
.map(CrawledDocument.class::cast)
|
||||
@ -118,7 +119,7 @@ class CrawlerRetreiverTest {
|
||||
var writer = new CrawledDomainWriter(out, specs.domain, "idid");
|
||||
Map<Class<? extends SerializableCrawlData>, List<SerializableCrawlData>> data = new HashMap<>();
|
||||
|
||||
new CrawlerRetreiver(httpFetcher, specs, d -> {
|
||||
new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, d -> {
|
||||
data.computeIfAbsent(d.getClass(), k->new ArrayList<>()).add(d);
|
||||
if (d instanceof CrawledDocument doc) {
|
||||
System.out.println(doc.url + ": " + doc.recrawlState + "\t" + doc.httpStatus);
|
||||
@ -136,7 +137,7 @@ class CrawlerRetreiverTest {
|
||||
CrawledDomain domain = (CrawledDomain) data.get(CrawledDomain.class).get(0);
|
||||
domain.doc = data.get(CrawledDocument.class).stream().map(CrawledDocument.class::cast).collect(Collectors.toList());
|
||||
|
||||
new CrawlerRetreiver(httpFetcher, specs, d -> {
|
||||
new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, d -> {
|
||||
if (d instanceof CrawledDocument doc) {
|
||||
System.out.println(doc.url + ": " + doc.recrawlState + "\t" + doc.httpStatus);
|
||||
}
|
||||
|
@ -99,7 +99,6 @@
|
||||
</section>
|
||||
<section id="legal">
|
||||
<h1>Policies</h1>
|
||||
|
||||
This website complies with the GDPR by <em>not collecting any personal
|
||||
information</em>, and with the EU Cookie Directive by <em>not using
|
||||
cookies</em>. <a href="https://memex.marginalia.nu/projects/edge/privacy.gmi">More Information</a>.
|
||||
@ -109,8 +108,13 @@
|
||||
<h1> Open Source </h1>
|
||||
The search engine is open source with an AGPL license. The sources can be perused at
|
||||
<tt><a href="https://git.marginalia.nu/">https://git.marginalia.nu/</a></tt>.
|
||||
|
||||
<h1>Data Sources</h1>
|
||||
IP geolocation is sourced from the IP2Location LITE data available from
|
||||
<a rel="external noopener nofollow" href="https://lite.ip2location.com/">https://lite.ip2location.com/</a>
|
||||
under
|
||||
<a rel="external noopener nofollow" href="https://creativecommons.org/licenses/by-sa/4.0/">CC-BY-SA 4.0</a>.
|
||||
</section>
|
||||
|
||||
</footer>
|
||||
|
||||
<script src="/tts.js"></script>
|
||||
|
@ -33,6 +33,7 @@ dependencies {
|
||||
|
||||
implementation project(':code:features-search:screenshots')
|
||||
|
||||
implementation project(':code:libraries:geo-ip')
|
||||
implementation project(':code:libraries:language-processing')
|
||||
implementation project(':code:libraries:term-frequency-dict')
|
||||
|
||||
|
@ -1,9 +1,8 @@
|
||||
package nu.marginalia.assistant.domains;
|
||||
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.geoip.GeoIpDictionary;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.model.crawl.DomainIndexingState;
|
||||
import nu.marginalia.db.DbDomainQueries;
|
||||
import nu.marginalia.assistant.client.model.DomainInformation;
|
||||
import org.slf4j.Logger;
|
||||
|
@ -12,6 +12,7 @@ include 'code:services-application:dating-service'
|
||||
include 'code:services-application:explorer-service'
|
||||
|
||||
include 'code:libraries:array'
|
||||
include 'code:libraries:geo-ip'
|
||||
include 'code:libraries:btree'
|
||||
include 'code:libraries:easy-lsh'
|
||||
include 'code:libraries:guarded-regex'
|
||||
|
Loading…
Reference in New Issue
Block a user