diff --git a/code/features-crawl/crawl-blocklist/build.gradle b/code/features-crawl/crawl-blocklist/build.gradle index c131e97b..8288aa0c 100644 --- a/code/features-crawl/crawl-blocklist/build.gradle +++ b/code/features-crawl/crawl-blocklist/build.gradle @@ -15,6 +15,7 @@ dependencies { implementation project(':code:common:model') implementation project(':code:common:config') implementation project(':code:libraries:guarded-regex') + implementation project(':code:libraries:geo-ip') implementation libs.notnull diff --git a/code/features-crawl/crawl-blocklist/src/main/java/nu/marginalia/ip_blocklist/GeoIpBlocklist.java b/code/features-crawl/crawl-blocklist/src/main/java/nu/marginalia/ip_blocklist/GeoIpBlocklist.java index ba896317..79ca6847 100644 --- a/code/features-crawl/crawl-blocklist/src/main/java/nu/marginalia/ip_blocklist/GeoIpBlocklist.java +++ b/code/features-crawl/crawl-blocklist/src/main/java/nu/marginalia/ip_blocklist/GeoIpBlocklist.java @@ -1,73 +1,31 @@ package nu.marginalia.ip_blocklist; +import com.google.inject.Inject; import com.google.inject.Singleton; -import com.opencsv.CSVReader; -import com.opencsv.exceptions.CsvValidationException; -import lombok.AllArgsConstructor; -import nu.marginalia.WmsaHome; +import nu.marginalia.geoip.GeoIpDictionary; import nu.marginalia.model.EdgeDomain; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.io.FileReader; -import java.io.IOException; -import java.net.InetAddress; import java.util.Set; -import java.util.TreeMap; @Singleton public class GeoIpBlocklist { - private final TreeMap ranges = new TreeMap<>(); - + /** These countries are extremely overrepresented among the problematic and spammy domains, + * and blocking them is by far the most effective spam mitigation technique. Sucks we throw + * babies out with the bathwater, but it's undeniably effective. 
+ */ private final Set blacklist = Set.of("CN", "HK"); private final Set graylist = Set.of("RU", "TW", "IN", "ZA", "SG", "UA"); private static final Logger logger = LoggerFactory.getLogger(GeoIpBlocklist.class); - @AllArgsConstructor - static class IpRange { - public final long from; - public final long to; - public final String country; - } + private final GeoIpDictionary ipDictionary; - public GeoIpBlocklist() throws IOException, CsvValidationException { - var resource = WmsaHome.getIPLocationDatabse(); - - try (var reader = new CSVReader(new FileReader(resource.toFile()))) { - for (;;) { - String[] vals = reader.readNext(); - if (vals == null) { - break; - } - if (!(blacklist.contains(vals[2]) || graylist.contains(vals[2]))) { - continue; - } - var range = new GeoIpBlocklist.IpRange(Long.parseLong(vals[0]), - Long.parseLong(vals[1]), - vals[2]); - ranges.put(range.from, range); - } - } - - logger.info("Loaded {} IP ranges", ranges.size()); - } - - public String getCountry(InetAddress address) { - byte[] bytes = address.getAddress(); - long ival = ((long)bytes[0]&0xFF) << 24 | ((long)bytes[1]&0xFF) << 16 | ((long)bytes[2]&0xFF)<< 8 | ((long)bytes[3]&0xFF); - - Long key = ranges.floorKey(ival); - if (null == key) { - return "-"; - } - - var range = ranges.get(key); - if (ival >= key && ival < range.to) { - return range.country; - } - - return "-"; + @Inject + public GeoIpBlocklist(GeoIpDictionary ipDictionary) { + this.ipDictionary = ipDictionary; + ipDictionary.waitReady(); } public boolean isAllowed(EdgeDomain domain) { @@ -85,7 +43,7 @@ public class GeoIpBlocklist { public String getCountry(EdgeDomain domain) { try { - return getCountry(InetAddressCache.getAddress(domain)); + return ipDictionary.getCountry(InetAddressCache.getAddress(domain)); } catch (Throwable ex) { logger.debug("Failed to resolve {}", domain); diff --git a/code/features-crawl/crawl-blocklist/src/main/java/nu/marginalia/ip_blocklist/InetAddressCache.java 
b/code/features-crawl/crawl-blocklist/src/main/java/nu/marginalia/ip_blocklist/InetAddressCache.java index 728a1f65..ba9a7948 100644 --- a/code/features-crawl/crawl-blocklist/src/main/java/nu/marginalia/ip_blocklist/InetAddressCache.java +++ b/code/features-crawl/crawl-blocklist/src/main/java/nu/marginalia/ip_blocklist/InetAddressCache.java @@ -11,7 +11,7 @@ import java.util.concurrent.TimeUnit; // We don't want to torture the DNS by resolving the same links over and over and over again public class InetAddressCache { - private static final Cache cache = CacheBuilder.newBuilder().maximumSize(10_000_000).expireAfterAccess(1, TimeUnit.HOURS).build(); + private static final Cache cache = CacheBuilder.newBuilder().maximumSize(1_000_000).expireAfterAccess(1, TimeUnit.HOURS).build(); public static InetAddress getAddress(EdgeDomain domain) throws Throwable { try { return cache.get(domain, ()-> InetAddress.getByName(domain.getAddress())); diff --git a/code/libraries/geo-ip/build.gradle b/code/libraries/geo-ip/build.gradle new file mode 100644 index 00000000..b0180ef8 --- /dev/null +++ b/code/libraries/geo-ip/build.gradle @@ -0,0 +1,24 @@ +plugins { + id 'java' +} + +java { + toolchain { + languageVersion.set(JavaLanguageVersion.of(21)) + } +} + +dependencies { + implementation project(':code:common:config') + + implementation libs.bundles.slf4j + implementation libs.opencsv + + testImplementation libs.bundles.slf4j.test + testImplementation libs.bundles.junit + testImplementation libs.mockito +} + +test { + useJUnitPlatform() +} diff --git a/code/libraries/geo-ip/readme.md b/code/libraries/geo-ip/readme.md new file mode 100644 index 00000000..2a81b04b --- /dev/null +++ b/code/libraries/geo-ip/readme.md @@ -0,0 +1,6 @@ +This micro library handles the GeoIP lookups, mappings from IP addresses +to country codes. 
+ +It uses the free ip2location lite database, which is +available from [https://lite.ip2location.com/database/ip-country](https://lite.ip2location.com/database/ip-country) +under a CC-BY-SA 4.0 license. \ No newline at end of file diff --git a/code/services-core/assistant-service/src/main/java/nu/marginalia/assistant/domains/GeoIpDictionary.java b/code/libraries/geo-ip/src/main/java/nu/marginalia/geoip/GeoIpDictionary.java similarity index 80% rename from code/services-core/assistant-service/src/main/java/nu/marginalia/assistant/domains/GeoIpDictionary.java rename to code/libraries/geo-ip/src/main/java/nu/marginalia/geoip/GeoIpDictionary.java index e250761e..83789905 100644 --- a/code/services-core/assistant-service/src/main/java/nu/marginalia/assistant/domains/GeoIpDictionary.java +++ b/code/libraries/geo-ip/src/main/java/nu/marginalia/geoip/GeoIpDictionary.java @@ -1,7 +1,6 @@ -package nu.marginalia.assistant.domains; +package nu.marginalia.geoip; import com.opencsv.CSVReader; -import lombok.AllArgsConstructor; import nu.marginalia.WmsaHome; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -14,12 +13,7 @@ public class GeoIpDictionary { private volatile TreeMap ranges = null; private static final Logger logger = LoggerFactory.getLogger(GeoIpDictionary.class); - @AllArgsConstructor - static class IpRange { - public final long from; - public final long to; - public final String country; - } + record IpRange(long from, long to, String country) {} public GeoIpDictionary() { Thread.ofPlatform().start(() -> { @@ -39,10 +33,35 @@ public class GeoIpDictionary { ranges = dict; logger.info("Loaded {} IP ranges", ranges.size()); } catch (Exception e) { + ranges = new TreeMap<>(); throw new RuntimeException(e); } + finally { + // notifyAll() may only be called while holding this object's monitor + synchronized (this) { this.notifyAll(); } + } }); + } + /** Returns true once loading has finished; after a failed load the dictionary is ready but empty. */ + public boolean isReady() { + return null != ranges; + } + + /** Blocks the calling thread until the background loader has finished. + * + * @return true once the dictionary is ready, false if the wait was interrupted + */ + public synchronized boolean waitReady() { + while (null == ranges) { + try { + this.wait(); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + return false; + } + } + return
true; } public String getCountry(String ip) { diff --git a/code/process-models/processed-data/src/main/java/nu/marginalia/model/processed/DomainRecord.java b/code/process-models/processed-data/src/main/java/nu/marginalia/model/processed/DomainRecord.java index 6b3491bf..b696829f 100644 --- a/code/process-models/processed-data/src/main/java/nu/marginalia/model/processed/DomainRecord.java +++ b/code/process-models/processed-data/src/main/java/nu/marginalia/model/processed/DomainRecord.java @@ -8,7 +8,6 @@ import org.apache.parquet.schema.*; import org.jetbrains.annotations.NotNull; import org.jetbrains.annotations.Nullable; -import java.sql.Array; import java.util.ArrayList; import java.util.List; diff --git a/code/process-models/processed-data/src/main/java/nu/marginalia/model/processed/DomainWithIp.java b/code/process-models/processed-data/src/main/java/nu/marginalia/model/processed/DomainWithIp.java index bedae4d5..3782b1b2 100644 --- a/code/process-models/processed-data/src/main/java/nu/marginalia/model/processed/DomainWithIp.java +++ b/code/process-models/processed-data/src/main/java/nu/marginalia/model/processed/DomainWithIp.java @@ -1,5 +1,14 @@ package nu.marginalia.model.processed; +import lombok.AllArgsConstructor; +import lombok.EqualsAndHashCode; +import lombok.NoArgsConstructor; +import lombok.ToString; + +@AllArgsConstructor +@NoArgsConstructor +@EqualsAndHashCode +@ToString public class DomainWithIp { public String domain; public String ip; diff --git a/code/processes/converting-process/build.gradle b/code/processes/converting-process/build.gradle index faa952fb..979260df 100644 --- a/code/processes/converting-process/build.gradle +++ b/code/processes/converting-process/build.gradle @@ -41,6 +41,7 @@ dependencies { implementation project(':code:libraries:guarded-regex') implementation project(':code:libraries:easy-lsh') + implementation project(':code:libraries:geo-ip') implementation project(':code:libraries:big-string') implementation
project(':code:libraries:language-processing') diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java index 00a05257..df682d77 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java @@ -10,6 +10,7 @@ import nu.marginalia.converting.model.ProcessedDocument; import nu.marginalia.converting.processor.logic.links.LinkGraph; import nu.marginalia.crawling.io.SerializableCrawlDataStream; import nu.marginalia.crawling.model.*; +import nu.marginalia.geoip.GeoIpDictionary; import nu.marginalia.model.crawl.DomainIndexingState; import nu.marginalia.converting.model.ProcessedDomain; import nu.marginalia.model.EdgeDomain; @@ -30,6 +31,7 @@ public class DomainProcessor { private final AnchorTagsSource anchorTagsSource; private final AnchorTextKeywords anchorTextKeywords; private final LshDocumentDeduplicator documentDeduplicator; + private final GeoIpDictionary geoIpDictionary; private final Logger logger = LoggerFactory.getLogger(getClass()); @@ -38,17 +40,21 @@ public class DomainProcessor { SiteWords siteWords, AnchorTagsSourceFactory anchorTagsSourceFactory, AnchorTextKeywords anchorTextKeywords, - LshDocumentDeduplicator documentDeduplicator) throws SQLException + LshDocumentDeduplicator documentDeduplicator, GeoIpDictionary geoIpDictionary) throws SQLException { this.documentProcessor = documentProcessor; this.siteWords = siteWords; this.anchorTextKeywords = anchorTextKeywords; this.documentDeduplicator = documentDeduplicator; this.anchorTagsSource = anchorTagsSourceFactory.create(); + this.geoIpDictionary = geoIpDictionary; + } @SneakyThrows public ProcessedDomain process(SerializableCrawlDataStream dataStream) { + 
geoIpDictionary.waitReady(); + var ret = new ProcessedDomain(); List docs = new ArrayList<>(); @@ -107,7 +113,14 @@ public class DomainProcessor { // Add late keywords and features from domain-level information List terms = new ArrayList<>(); + terms.add("ip:"+ip); + + String geoIp = geoIpDictionary.getCountry(ip); + if (!geoIp.isBlank()) { + terms.add("geoip:"+geoIp.toLowerCase()); + } + if (cookies) { terms.add(HtmlFeature.COOKIES.getKeyword()); } diff --git a/code/processes/converting-process/src/test/java/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java b/code/processes/converting-process/src/test/java/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java index 58d8a486..7ef056d2 100644 --- a/code/processes/converting-process/src/test/java/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java +++ b/code/processes/converting-process/src/test/java/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java @@ -6,6 +6,7 @@ import lombok.SneakyThrows; import nu.marginalia.WmsaHome; import nu.marginalia.converting.processor.DomainProcessor; import nu.marginalia.crawl.retreival.CrawlerRetreiver; +import nu.marginalia.crawl.retreival.DomainProber; import nu.marginalia.crawl.retreival.fetcher.HttpFetcher; import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl; import nu.marginalia.crawling.io.SerializableCrawlDataStream; @@ -75,7 +76,7 @@ public class CrawlingThenConvertingIntegrationTest { private CrawledDomain crawl(CrawlSpecRecord specs) { List data = new ArrayList<>(); - new CrawlerRetreiver(httpFetcher, specs, data::add).fetch(); + new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, data::add).fetch(); CrawledDomain domain = data.stream().filter(CrawledDomain.class::isInstance).map(CrawledDomain.class::cast).findFirst().get(); data.stream().filter(CrawledDocument.class::isInstance).map(CrawledDocument.class::cast).forEach(domain.doc::add); diff --git 
a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java index dc76abde..f824d815 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java @@ -11,6 +11,7 @@ import nu.marginalia.WmsaHome; import nu.marginalia.atags.source.AnchorTagsSource; import nu.marginalia.atags.source.AnchorTagsSourceFactory; import nu.marginalia.crawl.retreival.CrawlDataReference; +import nu.marginalia.crawl.retreival.DomainProber; import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl; import nu.marginalia.crawl.spec.CrawlSpecProvider; import nu.marginalia.crawl.spec.DbCrawlSpecProvider; @@ -56,6 +57,7 @@ public class CrawlerMain { private final UserAgent userAgent; private final MessageQueueFactory messageQueueFactory; + private final DomainProber domainProber; private final FileStorageService fileStorageService; private final DbCrawlSpecProvider dbCrawlSpecProvider; private final AnchorTagsSourceFactory anchorTagsSourceFactory; @@ -75,7 +77,7 @@ public class CrawlerMain { @Inject public CrawlerMain(UserAgent userAgent, ProcessHeartbeatImpl heartbeat, - MessageQueueFactory messageQueueFactory, + MessageQueueFactory messageQueueFactory, DomainProber domainProber, FileStorageService fileStorageService, ProcessConfiguration processConfiguration, DbCrawlSpecProvider dbCrawlSpecProvider, @@ -84,6 +86,7 @@ public class CrawlerMain { this.heartbeat = heartbeat; this.userAgent = userAgent; this.messageQueueFactory = messageQueueFactory; + this.domainProber = domainProber; this.fileStorageService = fileStorageService; this.dbCrawlSpecProvider = dbCrawlSpecProvider; this.anchorTagsSourceFactory = anchorTagsSourceFactory; @@ -219,7 +222,7 @@ public class CrawlerMain { var domainLinks = anchorTagsSource.getAnchorTags(domain); - var retreiver = new 
CrawlerRetreiver(fetcher, specification, writer::accept); + var retreiver = new CrawlerRetreiver(fetcher, domainProber, specification, writer::accept); int size = retreiver.fetch(domainLinks, reference); workLog.setJobToFinished(domain, writer.getOutputFile().toString(), size); diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java index ce5ecb89..b32e0b6c 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java @@ -42,7 +42,7 @@ public class CrawlerRetreiver { private static final UrlBlocklist urlBlocklist = new UrlBlocklist(); private static final LinkFilterSelector linkFilterSelector = new LinkFilterSelector(); - private static final DomainProber domainProber = new DomainProber(); + private final DomainProber domainProber; private final SitemapRetriever sitemapRetriever; private final DomainCrawlFrontier crawlFrontier; @@ -55,9 +55,11 @@ public class CrawlerRetreiver { private static final String documentWasSameTag = "SAME-BY-COMPARISON"; public CrawlerRetreiver(HttpFetcher fetcher, + DomainProber domainProber, CrawlSpecRecord specs, Consumer writer) { this.fetcher = fetcher; + this.domainProber = domainProber; domain = specs.domain; diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/DomainProber.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/DomainProber.java index 67f006d4..fcc005a8 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/DomainProber.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/DomainProber.java @@ -1,5 +1,7 @@ package nu.marginalia.crawl.retreival; +import com.google.inject.Inject; 
+import com.google.inject.Singleton; import nu.marginalia.crawl.retreival.fetcher.FetchResultState; import nu.marginalia.crawl.retreival.fetcher.HttpFetcher; import nu.marginalia.crawling.model.CrawlerDomainStatus; @@ -11,17 +13,21 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import javax.annotation.Nullable; +import java.util.function.Predicate; +@Singleton public class DomainProber { private final Logger logger = LoggerFactory.getLogger(DomainProber.class); - private static IpBlockList ipBlockList; + private final Predicate domainBlacklist; - static { - try { - ipBlockList = new IpBlockList(new GeoIpBlocklist()); - } catch (Exception e) { - throw new RuntimeException(e); - } + @Inject + public DomainProber(IpBlockList ipBlockList) { + this.domainBlacklist = ipBlockList::isAllowed; + } + + /** For testing */ + public DomainProber(Predicate domainBlacklist) { + this.domainBlacklist = domainBlacklist; } /** To detect problems early we do a probing request to the domain before we start crawling it properly. @@ -37,7 +43,7 @@ public class DomainProber { return new ProbeResultError(CrawlerDomainStatus.ERROR, "No known URLs"); } - if (!ipBlockList.isAllowed(firstUrlInQueue.domain)) + if (!domainBlacklist.test(firstUrlInQueue.domain)) return new ProbeResultError(CrawlerDomainStatus.BLOCKED, "IP not allowed"); var fetchResult = fetcher.probeDomain(firstUrlInQueue.withPathAndParam("/", null)); @@ -62,7 +68,7 @@ public class DomainProber { /** This domain redirects to another domain */ public record ProbeResultRedirect(EdgeDomain domain) implements ProbeResult {} - /** If the retreivala of the probed url was successful, return the url as it was fetched + /** If the retrieval of the probed url was successful, return the url as it was fetched * (which may be different from the url we probed, if we attempted another URL schema). 
* * @param probedUrl The url we successfully probed diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java index 041ae08d..5720ef34 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java @@ -15,7 +15,6 @@ import nu.marginalia.model.EdgeUrl; import nu.marginalia.crawl.retreival.logic.ContentTypeLogic; import nu.marginalia.crawl.retreival.logic.ContentTypeParser; import okhttp3.*; -import org.apache.commons.collections4.queue.PredicatedQueue; import org.apache.commons.io.input.BOMInputStream; import org.apache.commons.lang3.StringUtils; import org.slf4j.Logger; @@ -87,7 +86,10 @@ public class HttpFetcherImpl implements HttpFetcher { } @Inject - public HttpFetcherImpl(@Named("user-agent") String userAgent, Dispatcher dispatcher, ConnectionPool connectionPool) { + public HttpFetcherImpl(@Named("user-agent") String userAgent, + Dispatcher dispatcher, + ConnectionPool connectionPool) + { this.client = createClient(dispatcher, connectionPool); this.userAgent = userAgent; } diff --git a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java index c0df397f..b65e5ae6 100644 --- a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java +++ b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java @@ -3,6 +3,7 @@ package nu.marginalia.crawling.retreival; import crawlercommons.robots.SimpleRobotRules; import lombok.SneakyThrows; import nu.marginalia.crawl.retreival.CrawlerRetreiver; 
+import nu.marginalia.crawl.retreival.DomainProber; import nu.marginalia.crawl.retreival.fetcher.*; import nu.marginalia.crawling.model.CrawledDocument; import nu.marginalia.crawling.model.CrawlerDocumentStatus; @@ -68,7 +69,7 @@ public class CrawlerMockFetcherTest { registerUrlClasspathData(new EdgeUrl("https://startrek.website/c/startrek"), "mock-crawl-data/lemmy/c_startrek.html"); registerUrlClasspathData(new EdgeUrl("https://startrek.website/post/108995"), "mock-crawl-data/lemmy/108995.html"); - new CrawlerRetreiver(fetcherMock, new CrawlSpecRecord("startrek.website", 10, new ArrayList<>()), out::add) + new CrawlerRetreiver(fetcherMock, new DomainProber(d -> true), new CrawlSpecRecord("startrek.website", 10, new ArrayList<>()), out::add) .fetch(); out.forEach(System.out::println); @@ -80,7 +81,7 @@ public class CrawlerMockFetcherTest { registerUrlClasspathData(new EdgeUrl("https://en.wikipedia.org/"), "mock-crawl-data/mediawiki/index.html"); - new CrawlerRetreiver(fetcherMock, new CrawlSpecRecord("en.wikipedia.org", 10, new ArrayList<>()), out::add) + new CrawlerRetreiver(fetcherMock, new DomainProber(d -> true), new CrawlSpecRecord("en.wikipedia.org", 10, new ArrayList<>()), out::add) .fetch(); out.forEach(System.out::println); @@ -94,7 +95,7 @@ public class CrawlerMockFetcherTest { registerUrlClasspathData(new EdgeUrl("https://community.tt-rss.org/t/telegram-channel-to-idle-on/3501"), "mock-crawl-data/discourse/telegram.html"); registerUrlClasspathData(new EdgeUrl("https://community.tt-rss.org/t/combined-mode-but-grid/4489"), "mock-crawl-data/discourse/grid.html"); - new CrawlerRetreiver(fetcherMock, new CrawlSpecRecord("community.tt-rss.org", 100, new ArrayList<>()), out::add) + new CrawlerRetreiver(fetcherMock, new DomainProber(d -> true), new CrawlSpecRecord("community.tt-rss.org", 100, new ArrayList<>()), out::add) .fetch(); out.forEach(System.out::println); diff --git 
a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java index 147aca68..e7742445 100644 --- a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java +++ b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java @@ -5,6 +5,7 @@ import nu.marginalia.WmsaHome; import nu.marginalia.atags.model.DomainLinks; import nu.marginalia.crawl.retreival.CrawlDataReference; import nu.marginalia.crawl.retreival.CrawlerRetreiver; +import nu.marginalia.crawl.retreival.DomainProber; import nu.marginalia.crawl.retreival.fetcher.HttpFetcher; import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl; import nu.marginalia.crawling.io.CrawledDomainReader; @@ -53,7 +54,7 @@ class CrawlerRetreiverTest { List data = new ArrayList<>(); - new CrawlerRetreiver(httpFetcher, specs, data::add).fetch(); + new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, data::add).fetch(); var fetchedUrls = data.stream().filter(CrawledDocument.class::isInstance) @@ -82,7 +83,7 @@ class CrawlerRetreiverTest { List data = new ArrayList<>(); - new CrawlerRetreiver(httpFetcher, specs, data::add).fetch(); + new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, data::add).fetch(); data.stream().filter(CrawledDocument.class::isInstance) .map(CrawledDocument.class::cast) @@ -118,7 +119,7 @@ class CrawlerRetreiverTest { var writer = new CrawledDomainWriter(out, specs.domain, "idid"); Map, List> data = new HashMap<>(); - new CrawlerRetreiver(httpFetcher, specs, d -> { + new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, d -> { data.computeIfAbsent(d.getClass(), k->new ArrayList<>()).add(d); if (d instanceof CrawledDocument doc) { System.out.println(doc.url + ": " + doc.recrawlState + "\t" + doc.httpStatus); @@ 
-136,7 +137,7 @@ class CrawlerRetreiverTest { CrawledDomain domain = (CrawledDomain) data.get(CrawledDomain.class).get(0); domain.doc = data.get(CrawledDocument.class).stream().map(CrawledDocument.class::cast).collect(Collectors.toList()); - new CrawlerRetreiver(httpFetcher, specs, d -> { + new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, d -> { if (d instanceof CrawledDocument doc) { System.out.println(doc.url + ": " + doc.recrawlState + "\t" + doc.httpStatus); } diff --git a/code/services-application/search-service/src/main/resources/templates/search/parts/search-footer.hdb b/code/services-application/search-service/src/main/resources/templates/search/parts/search-footer.hdb index f911d3db..88b6ad84 100644 --- a/code/services-application/search-service/src/main/resources/templates/search/parts/search-footer.hdb +++ b/code/services-application/search-service/src/main/resources/templates/search/parts/search-footer.hdb @@ -99,7 +99,6 @@ + diff --git a/code/services-core/assistant-service/build.gradle b/code/services-core/assistant-service/build.gradle index e2c792fb..8609903d 100644 --- a/code/services-core/assistant-service/build.gradle +++ b/code/services-core/assistant-service/build.gradle @@ -33,6 +33,7 @@ dependencies { implementation project(':code:features-search:screenshots') + implementation project(':code:libraries:geo-ip') implementation project(':code:libraries:language-processing') implementation project(':code:libraries:term-frequency-dict') diff --git a/code/services-core/assistant-service/src/main/java/nu/marginalia/assistant/domains/DomainInformationService.java b/code/services-core/assistant-service/src/main/java/nu/marginalia/assistant/domains/DomainInformationService.java index 4da309dc..690509db 100644 --- a/code/services-core/assistant-service/src/main/java/nu/marginalia/assistant/domains/DomainInformationService.java +++ 
b/code/services-core/assistant-service/src/main/java/nu/marginalia/assistant/domains/DomainInformationService.java @@ -1,9 +1,8 @@ package nu.marginalia.assistant.domains; import com.zaxxer.hikari.HikariDataSource; -import lombok.SneakyThrows; +import nu.marginalia.geoip.GeoIpDictionary; import nu.marginalia.model.EdgeDomain; -import nu.marginalia.model.crawl.DomainIndexingState; import nu.marginalia.db.DbDomainQueries; import nu.marginalia.assistant.client.model.DomainInformation; import org.slf4j.Logger; diff --git a/settings.gradle b/settings.gradle index 952acd9c..342107de 100644 --- a/settings.gradle +++ b/settings.gradle @@ -12,6 +12,7 @@ include 'code:services-application:dating-service' include 'code:services-application:explorer-service' include 'code:libraries:array' +include 'code:libraries:geo-ip' include 'code:libraries:btree' include 'code:libraries:easy-lsh' include 'code:libraries:guarded-regex'