diff --git a/code/common/model/src/main/java/nu/marginalia/model/EdgeDomain.java b/code/common/model/src/main/java/nu/marginalia/model/EdgeDomain.java index 54038b6a..d60f3571 100644 --- a/code/common/model/src/main/java/nu/marginalia/model/EdgeDomain.java +++ b/code/common/model/src/main/java/nu/marginalia/model/EdgeDomain.java @@ -15,7 +15,7 @@ public class EdgeDomain implements Serializable { @Nonnull public final String subDomain; @Nonnull - public final String domain; + public final String topDomain; @SneakyThrows public EdgeDomain(String host) { @@ -27,13 +27,13 @@ public class EdgeDomain implements Serializable { if (dot < 0 || looksLikeAnIp(host)) { // IPV6 >.> subDomain = ""; - domain = host; + topDomain = host; } else { int dot2 = host.substring(0, dot).lastIndexOf('.'); if (dot2 < 0) { subDomain = ""; - domain = host; + topDomain = host; } else { if (looksLikeGovTld(host)) @@ -42,16 +42,16 @@ public class EdgeDomain implements Serializable { if (dot3 >= 0) { dot2 = dot3; subDomain = host.substring(0, dot2); - domain = host.substring(dot2 + 1); + topDomain = host.substring(dot2 + 1); } else { subDomain = ""; - domain = host; + topDomain = host; } } else { subDomain = host.substring(0, dot2); - domain = host.substring(dot2 + 1); + topDomain = host.substring(dot2 + 1); } } } @@ -97,28 +97,28 @@ public class EdgeDomain implements Serializable { public String getAddress() { if (!subDomain.isEmpty()) { - return subDomain + "." + domain; + return subDomain + "." + topDomain; } - return domain; + return topDomain; } public String getDomainKey() { - int cutPoint = domain.indexOf('.'); + int cutPoint = topDomain.indexOf('.'); if (cutPoint < 0) { - return domain; + return topDomain; } - return domain.substring(0, cutPoint).toLowerCase(); + return topDomain.substring(0, cutPoint).toLowerCase(); } public String getLongDomainKey() { StringBuilder ret = new StringBuilder(); - int cutPoint = domain.indexOf('.'); + int cutPoint = topDomain.indexOf('.'); if (cutPoint < 0) { - ret.append(domain); + ret.append(topDomain); } else { - ret.append(domain, 0, cutPoint); + ret.append(topDomain, 0, cutPoint); } if (!"".equals(subDomain) && !"www".equals(subDomain)) { @@ -133,30 +133,30 @@ public class EdgeDomain implements Serializable { public boolean hasSameTopDomain(EdgeDomain other) { if (other == null) return false; - return domain.equalsIgnoreCase(other.domain); + return topDomain.equalsIgnoreCase(other.topDomain); } public String getTld() { int dot = -1; - int length = domain.length(); + int length = topDomain.length(); - if (ipPatternTest.test(domain)) { + if (ipPatternTest.test(topDomain)) { return "IP"; } - if (govListTest.test(domain)) { - dot = domain.indexOf('.', Math.max(0, length - ".edu.uk".length())); + if (govListTest.test(topDomain)) { + dot = topDomain.indexOf('.', Math.max(0, length - ".edu.uk".length())); } else { - dot = domain.lastIndexOf('.'); + dot = topDomain.lastIndexOf('.'); } - if (dot < 0 || dot == domain.length() - 1) { + if (dot < 0 || dot == topDomain.length() - 1) { return "-"; } else { - return domain.substring(dot + 1); + return topDomain.substring(dot + 1); } } @@ -166,8 +166,8 @@ public class EdgeDomain implements Serializable { final String this$subDomain = this.getSubDomain(); final String other$subDomain = other.getSubDomain(); if (!Objects.equals(this$subDomain,other$subDomain)) return false; - final String this$domain = this.getDomain(); - final String other$domain = other.getDomain(); + final String this$domain = this.getTopDomain(); + final String other$domain = other.getTopDomain(); if (!Objects.equals(this$domain,other$domain)) return false; return true; } @@ -177,7 +177,7 @@ public class EdgeDomain implements Serializable { int result = 1; final Object $subDomain = this.getSubDomain().toLowerCase(); result = result * PRIME + $subDomain.hashCode(); - final Object $domain = this.getDomain().toLowerCase(); + final Object $domain = this.getTopDomain().toLowerCase(); result = result * PRIME + $domain.hashCode(); return result; } diff --git a/code/common/model/src/test/java/nu/marginalia/model/EdgeDomainTest.java b/code/common/model/src/test/java/nu/marginalia/model/EdgeDomainTest.java index 9fbf6890..ad41b884 100644 --- a/code/common/model/src/test/java/nu/marginalia/model/EdgeDomainTest.java +++ b/code/common/model/src/test/java/nu/marginalia/model/EdgeDomainTest.java @@ -1,6 +1,5 @@ package nu.marginalia.model; -import nu.marginalia.model.EdgeUrl; import org.junit.jupiter.api.Test; import java.net.URISyntaxException; @@ -22,7 +21,7 @@ class EdgeDomainTest { var domain = new EdgeUrl("http://l7072i3.l7c.net"); assertEquals("http", domain.proto); assertEquals("l7072i3", domain.domain.subDomain); - assertEquals("l7c.net", domain.domain.domain); + assertEquals("l7c.net", domain.domain.topDomain); assertEquals("net", domain.domain.getTld()); } @@ -31,7 +30,7 @@ class EdgeDomainTest { var domain = new EdgeUrl("http://endless.horse/"); assertEquals("http", domain.proto); assertEquals("", domain.domain.subDomain); - assertEquals("endless.horse", domain.domain.domain); + assertEquals("endless.horse", domain.domain.topDomain); assertEquals("horse", domain.domain.getTld()); } @@ -40,7 +39,7 @@ class EdgeDomainTest { var domain = new EdgeUrl("http://uj.edu.pl"); assertEquals("http", domain.proto); assertEquals("", domain.domain.subDomain); - assertEquals("uj.edu.pl", domain.domain.domain); + assertEquals("uj.edu.pl", domain.domain.topDomain); assertEquals("edu.pl", domain.domain.getTld()); } @@ -50,7 +49,7 @@ class EdgeDomainTest { var domain = new EdgeUrl("http://www.marginalia.nu"); assertEquals("http", domain.proto); assertEquals("www", domain.domain.subDomain); - assertEquals("marginalia.nu", domain.domain.domain); + assertEquals("marginalia.nu", domain.domain.topDomain); assertEquals("http://www.marginalia.nu/", domain.toString()); assertEquals("nu", domain.domain.getTld()); } @@ -58,7 +57,7 @@ class EdgeDomainTest { @Test public void testUkDomain2() throws URISyntaxException { var domain = new EdgeUrl("http://marginalia.co.uk"); - assertEquals("marginalia.co.uk", domain.domain.domain); + assertEquals("marginalia.co.uk", domain.domain.topDomain); assertEquals("http", domain.proto); assertEquals("", domain.domain.subDomain); assertEquals("http://marginalia.co.uk/", domain.toString()); @@ -68,7 +67,7 @@ class EdgeDomainTest { @Test public void testUkDomain3() throws URISyntaxException { var domain = new EdgeUrl("http://withcandour.co.uk"); - assertEquals("withcandour.co.uk", domain.domain.domain); + assertEquals("withcandour.co.uk", domain.domain.topDomain); assertEquals("http", domain.proto); assertEquals("", domain.domain.subDomain); assertEquals("http://withcandour.co.uk/", domain.toString()); @@ -80,7 +79,7 @@ class EdgeDomainTest { var domain = new EdgeUrl("http://www.marginalia.co.uk"); assertEquals("http", domain.proto); assertEquals("www", domain.domain.subDomain); - assertEquals("marginalia.co.uk", domain.domain.domain); + assertEquals("marginalia.co.uk", domain.domain.topDomain); assertEquals("http://www.marginalia.co.uk/", domain.toString()); } @@ -88,7 +87,7 @@ class EdgeDomainTest { public void testThreeLetterDomain() throws URISyntaxException { var domain = new EdgeUrl("http://www.marginalia.abcf.de"); assertEquals("http", domain.proto); - assertEquals("abcf.de", domain.domain.domain); + assertEquals("abcf.de", domain.domain.topDomain); assertEquals("www.marginalia", domain.domain.subDomain); assertEquals("de", domain.domain.getTld()); } @@ -98,7 +97,7 @@ class EdgeDomainTest { var domain = new EdgeUrl("http://marginalia.nu"); assertEquals("http", domain.proto); assertEquals("", domain.domain.subDomain); - assertEquals("marginalia.nu", domain.domain.domain); + assertEquals("marginalia.nu", domain.domain.topDomain); assertEquals("http://marginalia.nu/", domain.toString()); assertEquals("nu", domain.domain.getTld()); } @@ -108,7 +107,7 @@ class EdgeDomainTest { var domain = new EdgeUrl("https://127.0.0.1:8080"); assertEquals("https", domain.proto); assertEquals("", domain.domain.subDomain); - assertEquals("127.0.0.1", domain.domain.domain); + assertEquals("127.0.0.1", domain.domain.topDomain); assertEquals("https://127.0.0.1:8080/", domain.toString()); assertEquals("IP", domain.domain.getTld()); } @@ -118,7 +117,7 @@ class EdgeDomainTest { var domain = new EdgeUrl("https://192.168.1.32"); assertEquals("https", domain.proto); assertEquals("", domain.domain.subDomain); - assertEquals("192.168.1.32", domain.domain.domain); + assertEquals("192.168.1.32", domain.domain.topDomain); assertEquals("https://192.168.1.32/", domain.toString()); assertEquals("IP", domain.domain.getTld()); } diff --git a/code/features-crawl/crawl-blocklist/src/main/java/nu/marginalia/ip_blocklist/IpBlockList.java b/code/features-crawl/crawl-blocklist/src/main/java/nu/marginalia/ip_blocklist/IpBlockList.java index a339b1d4..e1b5beee 100644 --- a/code/features-crawl/crawl-blocklist/src/main/java/nu/marginalia/ip_blocklist/IpBlockList.java +++ b/code/features-crawl/crawl-blocklist/src/main/java/nu/marginalia/ip_blocklist/IpBlockList.java @@ -62,7 +62,7 @@ public class IpBlockList { if (blocklistDisabled) return true; - if (domain.domain.endsWith(".cn")) { + if (domain.topDomain.endsWith(".cn")) { logger.debug("Blocking {} on .cn-end", domain); return false; } diff --git a/code/features-crawl/crawl-blocklist/src/main/java/nu/marginalia/ip_blocklist/UrlBlocklist.java b/code/features-crawl/crawl-blocklist/src/main/java/nu/marginalia/ip_blocklist/UrlBlocklist.java index f3574b87..dbd95d61 100644 --- a/code/features-crawl/crawl-blocklist/src/main/java/nu/marginalia/ip_blocklist/UrlBlocklist.java +++ b/code/features-crawl/crawl-blocklist/src/main/java/nu/marginalia/ip_blocklist/UrlBlocklist.java @@ -67,7 +67,7 @@ public class UrlBlocklist { public boolean isUrlBlocked(EdgeUrl url) { try { - if (badDomains.contains(url.domain.domain)) { + if (badDomains.contains(url.domain.topDomain)) { return true; } @@ -76,7 +76,7 @@ public class UrlBlocklist { return true; } - if ("github.com".equals(url.domain.domain)) { + if ("github.com".equals(url.domain.topDomain)) { return url.path.chars().filter(c -> c == '/').count() > 2; } diff --git a/code/features-search/random-websites/src/main/java/nu/marginalia/browse/model/BrowseResult.java b/code/features-search/random-websites/src/main/java/nu/marginalia/browse/model/BrowseResult.java index e4f5460b..bfca3fb5 100644 --- a/code/features-search/random-websites/src/main/java/nu/marginalia/browse/model/BrowseResult.java +++ b/code/features-search/random-websites/src/main/java/nu/marginalia/browse/model/BrowseResult.java @@ -10,7 +10,7 @@ public record BrowseResult (EdgeUrl url, public String domainHash() { var domain = url.domain; if ("www".equals(domain.subDomain)) { - return domain.domain; + return domain.topDomain; } return domain.toString(); } @@ -19,7 +19,7 @@ public record BrowseResult (EdgeUrl url, String ret; var domain = url.domain; if ("www".equals(domain.subDomain)) { - ret = domain.domain; + ret = domain.topDomain; } else { ret = domain.toString(); diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java index e9794aad..2f838cb5 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java @@ -14,7 +14,6 @@ import nu.marginalia.geoip.GeoIpDictionary; import nu.marginalia.model.crawl.DomainIndexingState; import nu.marginalia.converting.model.ProcessedDomain; import nu.marginalia.model.EdgeDomain; -import nu.marginalia.model.EdgeUrl; import nu.marginalia.converting.processor.logic.links.TopKeywords; import nu.marginalia.converting.processor.logic.LshDocumentDeduplicator; import nu.marginalia.model.crawl.HtmlFeature; @@ -161,10 +160,10 @@ public class DomainProcessor { private static final Pattern academicPattern = Pattern.compile(".*\\.(ac|edu)\\.[a-z]{2}$"); private boolean isAcademicDomain(EdgeDomain domain) { - if (domain.domain.endsWith(".edu")) + if (domain.topDomain.endsWith(".edu")) return true; - if (academicPattern.matcher(domain.domain).matches()) + if (academicPattern.matcher(domain.topDomain).matches()) return true; return false; diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/AbstractDocumentProcessorPlugin.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/AbstractDocumentProcessorPlugin.java index 0007eeb6..a92a4af7 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/AbstractDocumentProcessorPlugin.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/AbstractDocumentProcessorPlugin.java @@ -50,7 +50,7 @@ public abstract class AbstractDocumentProcessorPlugin { public MetaTagsBuilder addUrl(EdgeUrl url) { add("proto", url.proto); add("site", url.domain); - add("site", url.domain.domain); + add("site", url.domain.topDomain); add("tld", url.domain.getTld()); if (url.path.startsWith("/~")) { diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java index 4017778e..7d973909 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java @@ -291,7 +291,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin for (var fd : lp.getForeignDomains()) { linkTerms.add("links:"+fd.toString().toLowerCase()); - linkTerms.add("links:"+fd.getDomain().toLowerCase()); + linkTerms.add("links:"+fd.getTopDomain().toLowerCase()); } return linkTerms; diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/specialization/HtmlProcessorSpecializations.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/specialization/HtmlProcessorSpecializations.java index c587ce4b..80b063e8 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/specialization/HtmlProcessorSpecializations.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/specialization/HtmlProcessorSpecializations.java @@ -54,7 +54,7 @@ public class HtmlProcessorSpecializations { return blogSpecialization; } - if (url.domain.getDomain().equals("mariadb.com") + if (url.domain.getTopDomain().equals("mariadb.com") && url.path.startsWith("/kb")) { return mariadbKbSpecialization; } diff --git a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/RssCrawlerTest.java b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/RssCrawlerTest.java index 05de76dc..a4d0ee92 100644 --- a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/RssCrawlerTest.java +++ b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/RssCrawlerTest.java @@ -33,7 +33,7 @@ class RssCrawlerTest { var href = element.attr("href"); if (href != null && !href.isBlank()) { lp.parseLink(base, href) - .filter(u -> Objects.equals(u.domain.domain, base.domain.domain)) + .filter(u -> Objects.equals(u.domain.topDomain, base.domain.topDomain)) .ifPresent(urls::add); } }); @@ -42,7 +42,7 @@ class RssCrawlerTest { var href = element.text(); if (href != null && !href.isBlank()) { lp.parseLink(base, href) - .filter(u -> Objects.equals(u.domain.domain, base.domain.domain)) + .filter(u -> Objects.equals(u.domain.topDomain, base.domain.topDomain)) .ifPresent(urls::add); } }); @@ -51,7 +51,7 @@ class RssCrawlerTest { var href = element.text(); if (href != null && !href.isBlank()) { lp.parseLink(base, href) - .filter(u -> Objects.equals(u.domain.domain, base.domain.domain)) + .filter(u -> Objects.equals(u.domain.topDomain, base.domain.topDomain)) .ifPresent(urls::add); } }); diff --git a/code/processes/loading-process/src/main/java/nu/marginalia/loading/domains/DomainLoaderService.java b/code/processes/loading-process/src/main/java/nu/marginalia/loading/domains/DomainLoaderService.java index 911c976d..7125eb30 100644 --- a/code/processes/loading-process/src/main/java/nu/marginalia/loading/domains/DomainLoaderService.java +++ b/code/processes/loading-process/src/main/java/nu/marginalia/loading/domains/DomainLoaderService.java @@ -147,7 +147,7 @@ public class DomainLoaderService { public void accept(EdgeDomain domain) throws SQLException { statement.setString(1, domain.toString()); - statement.setString(2, domain.domain); + statement.setString(2, domain.topDomain); statement.setInt(3, nodeAffinity); statement.addBatch(); diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/app/svc/ControlBlacklistService.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/app/svc/ControlBlacklistService.java index 9181f325..3a1ad44d 100644 --- a/code/services-core/control-service/src/main/java/nu/marginalia/control/app/svc/ControlBlacklistService.java +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/app/svc/ControlBlacklistService.java @@ -81,7 +81,7 @@ public class ControlBlacklistService { """)) { stmt.setString(1, domain.toString()); stmt.addBatch(); - stmt.setString(1, domain.domain); + stmt.setString(1, domain.topDomain); stmt.addBatch(); stmt.executeBatch(); } diff --git a/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ExportAtagsActor.java b/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ExportAtagsActor.java index 353ef965..35ddde89 100644 --- a/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ExportAtagsActor.java +++ b/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ExportAtagsActor.java @@ -157,7 +157,7 @@ public class ExportAtagsActor extends RecordActorPrototype { return false; // This is an artifact of the link parser typically - if ("example.com".equals(url.domain.domain)) + if ("example.com".equals(url.domain.topDomain)) return false; if (linkText.contains(url.domain.toString())) diff --git a/code/services-core/executor-service/src/main/java/nu/marginalia/svc/DomainListRefreshService.java b/code/services-core/executor-service/src/main/java/nu/marginalia/svc/DomainListRefreshService.java index fe600fc0..a25bb92f 100644 --- a/code/services-core/executor-service/src/main/java/nu/marginalia/svc/DomainListRefreshService.java +++ b/code/services-core/executor-service/src/main/java/nu/marginalia/svc/DomainListRefreshService.java @@ -61,7 +61,7 @@ public class DomainListRefreshService { for (var domain : domainsAll) { var parsed = new EdgeDomain(domain); insert.setString(1, domain.toLowerCase()); - insert.setString(2, parsed.domain); + insert.setString(2, parsed.topDomain); insert.setInt(3, nodeId); insert.addBatch(); }