(*) Rename EdgeDomain$domain into topDomain

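This variable had a very confusing name and was dangerously easy to use in the wrong place: it holds only the top domain (e.g. "marginalia.nu" for "www.marginalia.nu"), so code that treats it as the full host name only works as expected when there is no subdomain, i.e. roughly half the time.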

Ideally this class needs an overhaul; the assumptions it makes about domain names aren't great.
This commit is contained in:
Viktor Lofgren 2023-12-17 14:00:07 +01:00
parent edf9aa2c23
commit bf44805e69
14 changed files with 53 additions and 55 deletions

View File

@ -15,7 +15,7 @@ public class EdgeDomain implements Serializable {
@Nonnull @Nonnull
public final String subDomain; public final String subDomain;
@Nonnull @Nonnull
public final String domain; public final String topDomain;
@SneakyThrows @SneakyThrows
public EdgeDomain(String host) { public EdgeDomain(String host) {
@ -27,13 +27,13 @@ public class EdgeDomain implements Serializable {
if (dot < 0 || looksLikeAnIp(host)) { // IPV6 >.> if (dot < 0 || looksLikeAnIp(host)) { // IPV6 >.>
subDomain = ""; subDomain = "";
domain = host; topDomain = host;
} }
else { else {
int dot2 = host.substring(0, dot).lastIndexOf('.'); int dot2 = host.substring(0, dot).lastIndexOf('.');
if (dot2 < 0) { if (dot2 < 0) {
subDomain = ""; subDomain = "";
domain = host; topDomain = host;
} }
else { else {
if (looksLikeGovTld(host)) if (looksLikeGovTld(host))
@ -42,16 +42,16 @@ public class EdgeDomain implements Serializable {
if (dot3 >= 0) { if (dot3 >= 0) {
dot2 = dot3; dot2 = dot3;
subDomain = host.substring(0, dot2); subDomain = host.substring(0, dot2);
domain = host.substring(dot2 + 1); topDomain = host.substring(dot2 + 1);
} }
else { else {
subDomain = ""; subDomain = "";
domain = host; topDomain = host;
} }
} }
else { else {
subDomain = host.substring(0, dot2); subDomain = host.substring(0, dot2);
domain = host.substring(dot2 + 1); topDomain = host.substring(dot2 + 1);
} }
} }
} }
@ -97,28 +97,28 @@ public class EdgeDomain implements Serializable {
public String getAddress() { public String getAddress() {
if (!subDomain.isEmpty()) { if (!subDomain.isEmpty()) {
return subDomain + "." + domain; return subDomain + "." + topDomain;
} }
return domain; return topDomain;
} }
public String getDomainKey() { public String getDomainKey() {
int cutPoint = domain.indexOf('.'); int cutPoint = topDomain.indexOf('.');
if (cutPoint < 0) { if (cutPoint < 0) {
return domain; return topDomain;
} }
return domain.substring(0, cutPoint).toLowerCase(); return topDomain.substring(0, cutPoint).toLowerCase();
} }
public String getLongDomainKey() { public String getLongDomainKey() {
StringBuilder ret = new StringBuilder(); StringBuilder ret = new StringBuilder();
int cutPoint = domain.indexOf('.'); int cutPoint = topDomain.indexOf('.');
if (cutPoint < 0) { if (cutPoint < 0) {
ret.append(domain); ret.append(topDomain);
} }
else { else {
ret.append(domain, 0, cutPoint); ret.append(topDomain, 0, cutPoint);
} }
if (!"".equals(subDomain) && !"www".equals(subDomain)) { if (!"".equals(subDomain) && !"www".equals(subDomain)) {
@ -133,30 +133,30 @@ public class EdgeDomain implements Serializable {
public boolean hasSameTopDomain(EdgeDomain other) { public boolean hasSameTopDomain(EdgeDomain other) {
if (other == null) return false; if (other == null) return false;
return domain.equalsIgnoreCase(other.domain); return topDomain.equalsIgnoreCase(other.topDomain);
} }
public String getTld() { public String getTld() {
int dot = -1; int dot = -1;
int length = domain.length(); int length = topDomain.length();
if (ipPatternTest.test(domain)) { if (ipPatternTest.test(topDomain)) {
return "IP"; return "IP";
} }
if (govListTest.test(domain)) { if (govListTest.test(topDomain)) {
dot = domain.indexOf('.', Math.max(0, length - ".edu.uk".length())); dot = topDomain.indexOf('.', Math.max(0, length - ".edu.uk".length()));
} }
else { else {
dot = domain.lastIndexOf('.'); dot = topDomain.lastIndexOf('.');
} }
if (dot < 0 || dot == domain.length() - 1) { if (dot < 0 || dot == topDomain.length() - 1) {
return "-"; return "-";
} }
else { else {
return domain.substring(dot + 1); return topDomain.substring(dot + 1);
} }
} }
@ -166,8 +166,8 @@ public class EdgeDomain implements Serializable {
final String this$subDomain = this.getSubDomain(); final String this$subDomain = this.getSubDomain();
final String other$subDomain = other.getSubDomain(); final String other$subDomain = other.getSubDomain();
if (!Objects.equals(this$subDomain,other$subDomain)) return false; if (!Objects.equals(this$subDomain,other$subDomain)) return false;
final String this$domain = this.getDomain(); final String this$domain = this.getTopDomain();
final String other$domain = other.getDomain(); final String other$domain = other.getTopDomain();
if (!Objects.equals(this$domain,other$domain)) return false; if (!Objects.equals(this$domain,other$domain)) return false;
return true; return true;
} }
@ -177,7 +177,7 @@ public class EdgeDomain implements Serializable {
int result = 1; int result = 1;
final Object $subDomain = this.getSubDomain().toLowerCase(); final Object $subDomain = this.getSubDomain().toLowerCase();
result = result * PRIME + $subDomain.hashCode(); result = result * PRIME + $subDomain.hashCode();
final Object $domain = this.getDomain().toLowerCase(); final Object $domain = this.getTopDomain().toLowerCase();
result = result * PRIME + $domain.hashCode(); result = result * PRIME + $domain.hashCode();
return result; return result;
} }

View File

@ -1,6 +1,5 @@
package nu.marginalia.model; package nu.marginalia.model;
import nu.marginalia.model.EdgeUrl;
import org.junit.jupiter.api.Test; import org.junit.jupiter.api.Test;
import java.net.URISyntaxException; import java.net.URISyntaxException;
@ -22,7 +21,7 @@ class EdgeDomainTest {
var domain = new EdgeUrl("http://l7072i3.l7c.net"); var domain = new EdgeUrl("http://l7072i3.l7c.net");
assertEquals("http", domain.proto); assertEquals("http", domain.proto);
assertEquals("l7072i3", domain.domain.subDomain); assertEquals("l7072i3", domain.domain.subDomain);
assertEquals("l7c.net", domain.domain.domain); assertEquals("l7c.net", domain.domain.topDomain);
assertEquals("net", domain.domain.getTld()); assertEquals("net", domain.domain.getTld());
} }
@ -31,7 +30,7 @@ class EdgeDomainTest {
var domain = new EdgeUrl("http://endless.horse/"); var domain = new EdgeUrl("http://endless.horse/");
assertEquals("http", domain.proto); assertEquals("http", domain.proto);
assertEquals("", domain.domain.subDomain); assertEquals("", domain.domain.subDomain);
assertEquals("endless.horse", domain.domain.domain); assertEquals("endless.horse", domain.domain.topDomain);
assertEquals("horse", domain.domain.getTld()); assertEquals("horse", domain.domain.getTld());
} }
@ -40,7 +39,7 @@ class EdgeDomainTest {
var domain = new EdgeUrl("http://uj.edu.pl"); var domain = new EdgeUrl("http://uj.edu.pl");
assertEquals("http", domain.proto); assertEquals("http", domain.proto);
assertEquals("", domain.domain.subDomain); assertEquals("", domain.domain.subDomain);
assertEquals("uj.edu.pl", domain.domain.domain); assertEquals("uj.edu.pl", domain.domain.topDomain);
assertEquals("edu.pl", domain.domain.getTld()); assertEquals("edu.pl", domain.domain.getTld());
} }
@ -50,7 +49,7 @@ class EdgeDomainTest {
var domain = new EdgeUrl("http://www.marginalia.nu"); var domain = new EdgeUrl("http://www.marginalia.nu");
assertEquals("http", domain.proto); assertEquals("http", domain.proto);
assertEquals("www", domain.domain.subDomain); assertEquals("www", domain.domain.subDomain);
assertEquals("marginalia.nu", domain.domain.domain); assertEquals("marginalia.nu", domain.domain.topDomain);
assertEquals("http://www.marginalia.nu/", domain.toString()); assertEquals("http://www.marginalia.nu/", domain.toString());
assertEquals("nu", domain.domain.getTld()); assertEquals("nu", domain.domain.getTld());
} }
@ -58,7 +57,7 @@ class EdgeDomainTest {
@Test @Test
public void testUkDomain2() throws URISyntaxException { public void testUkDomain2() throws URISyntaxException {
var domain = new EdgeUrl("http://marginalia.co.uk"); var domain = new EdgeUrl("http://marginalia.co.uk");
assertEquals("marginalia.co.uk", domain.domain.domain); assertEquals("marginalia.co.uk", domain.domain.topDomain);
assertEquals("http", domain.proto); assertEquals("http", domain.proto);
assertEquals("", domain.domain.subDomain); assertEquals("", domain.domain.subDomain);
assertEquals("http://marginalia.co.uk/", domain.toString()); assertEquals("http://marginalia.co.uk/", domain.toString());
@ -68,7 +67,7 @@ class EdgeDomainTest {
@Test @Test
public void testUkDomain3() throws URISyntaxException { public void testUkDomain3() throws URISyntaxException {
var domain = new EdgeUrl("http://withcandour.co.uk"); var domain = new EdgeUrl("http://withcandour.co.uk");
assertEquals("withcandour.co.uk", domain.domain.domain); assertEquals("withcandour.co.uk", domain.domain.topDomain);
assertEquals("http", domain.proto); assertEquals("http", domain.proto);
assertEquals("", domain.domain.subDomain); assertEquals("", domain.domain.subDomain);
assertEquals("http://withcandour.co.uk/", domain.toString()); assertEquals("http://withcandour.co.uk/", domain.toString());
@ -80,7 +79,7 @@ class EdgeDomainTest {
var domain = new EdgeUrl("http://www.marginalia.co.uk"); var domain = new EdgeUrl("http://www.marginalia.co.uk");
assertEquals("http", domain.proto); assertEquals("http", domain.proto);
assertEquals("www", domain.domain.subDomain); assertEquals("www", domain.domain.subDomain);
assertEquals("marginalia.co.uk", domain.domain.domain); assertEquals("marginalia.co.uk", domain.domain.topDomain);
assertEquals("http://www.marginalia.co.uk/", domain.toString()); assertEquals("http://www.marginalia.co.uk/", domain.toString());
} }
@ -88,7 +87,7 @@ class EdgeDomainTest {
public void testThreeLetterDomain() throws URISyntaxException { public void testThreeLetterDomain() throws URISyntaxException {
var domain = new EdgeUrl("http://www.marginalia.abcf.de"); var domain = new EdgeUrl("http://www.marginalia.abcf.de");
assertEquals("http", domain.proto); assertEquals("http", domain.proto);
assertEquals("abcf.de", domain.domain.domain); assertEquals("abcf.de", domain.domain.topDomain);
assertEquals("www.marginalia", domain.domain.subDomain); assertEquals("www.marginalia", domain.domain.subDomain);
assertEquals("de", domain.domain.getTld()); assertEquals("de", domain.domain.getTld());
} }
@ -98,7 +97,7 @@ class EdgeDomainTest {
var domain = new EdgeUrl("http://marginalia.nu"); var domain = new EdgeUrl("http://marginalia.nu");
assertEquals("http", domain.proto); assertEquals("http", domain.proto);
assertEquals("", domain.domain.subDomain); assertEquals("", domain.domain.subDomain);
assertEquals("marginalia.nu", domain.domain.domain); assertEquals("marginalia.nu", domain.domain.topDomain);
assertEquals("http://marginalia.nu/", domain.toString()); assertEquals("http://marginalia.nu/", domain.toString());
assertEquals("nu", domain.domain.getTld()); assertEquals("nu", domain.domain.getTld());
} }
@ -108,7 +107,7 @@ class EdgeDomainTest {
var domain = new EdgeUrl("https://127.0.0.1:8080"); var domain = new EdgeUrl("https://127.0.0.1:8080");
assertEquals("https", domain.proto); assertEquals("https", domain.proto);
assertEquals("", domain.domain.subDomain); assertEquals("", domain.domain.subDomain);
assertEquals("127.0.0.1", domain.domain.domain); assertEquals("127.0.0.1", domain.domain.topDomain);
assertEquals("https://127.0.0.1:8080/", domain.toString()); assertEquals("https://127.0.0.1:8080/", domain.toString());
assertEquals("IP", domain.domain.getTld()); assertEquals("IP", domain.domain.getTld());
} }
@ -118,7 +117,7 @@ class EdgeDomainTest {
var domain = new EdgeUrl("https://192.168.1.32"); var domain = new EdgeUrl("https://192.168.1.32");
assertEquals("https", domain.proto); assertEquals("https", domain.proto);
assertEquals("", domain.domain.subDomain); assertEquals("", domain.domain.subDomain);
assertEquals("192.168.1.32", domain.domain.domain); assertEquals("192.168.1.32", domain.domain.topDomain);
assertEquals("https://192.168.1.32/", domain.toString()); assertEquals("https://192.168.1.32/", domain.toString());
assertEquals("IP", domain.domain.getTld()); assertEquals("IP", domain.domain.getTld());
} }

View File

@ -62,7 +62,7 @@ public class IpBlockList {
if (blocklistDisabled) if (blocklistDisabled)
return true; return true;
if (domain.domain.endsWith(".cn")) { if (domain.topDomain.endsWith(".cn")) {
logger.debug("Blocking {} on .cn-end", domain); logger.debug("Blocking {} on .cn-end", domain);
return false; return false;
} }

View File

@ -67,7 +67,7 @@ public class UrlBlocklist {
public boolean isUrlBlocked(EdgeUrl url) { public boolean isUrlBlocked(EdgeUrl url) {
try { try {
if (badDomains.contains(url.domain.domain)) { if (badDomains.contains(url.domain.topDomain)) {
return true; return true;
} }
@ -76,7 +76,7 @@ public class UrlBlocklist {
return true; return true;
} }
if ("github.com".equals(url.domain.domain)) { if ("github.com".equals(url.domain.topDomain)) {
return url.path.chars().filter(c -> c == '/').count() > 2; return url.path.chars().filter(c -> c == '/').count() > 2;
} }

View File

@ -10,7 +10,7 @@ public record BrowseResult (EdgeUrl url,
public String domainHash() { public String domainHash() {
var domain = url.domain; var domain = url.domain;
if ("www".equals(domain.subDomain)) { if ("www".equals(domain.subDomain)) {
return domain.domain; return domain.topDomain;
} }
return domain.toString(); return domain.toString();
} }
@ -19,7 +19,7 @@ public record BrowseResult (EdgeUrl url,
String ret; String ret;
var domain = url.domain; var domain = url.domain;
if ("www".equals(domain.subDomain)) { if ("www".equals(domain.subDomain)) {
ret = domain.domain; ret = domain.topDomain;
} }
else { else {
ret = domain.toString(); ret = domain.toString();

View File

@ -14,7 +14,6 @@ import nu.marginalia.geoip.GeoIpDictionary;
import nu.marginalia.model.crawl.DomainIndexingState; import nu.marginalia.model.crawl.DomainIndexingState;
import nu.marginalia.converting.model.ProcessedDomain; import nu.marginalia.converting.model.ProcessedDomain;
import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.converting.processor.logic.links.TopKeywords; import nu.marginalia.converting.processor.logic.links.TopKeywords;
import nu.marginalia.converting.processor.logic.LshDocumentDeduplicator; import nu.marginalia.converting.processor.logic.LshDocumentDeduplicator;
import nu.marginalia.model.crawl.HtmlFeature; import nu.marginalia.model.crawl.HtmlFeature;
@ -161,10 +160,10 @@ public class DomainProcessor {
private static final Pattern academicPattern = Pattern.compile(".*\\.(ac|edu)\\.[a-z]{2}$"); private static final Pattern academicPattern = Pattern.compile(".*\\.(ac|edu)\\.[a-z]{2}$");
private boolean isAcademicDomain(EdgeDomain domain) { private boolean isAcademicDomain(EdgeDomain domain) {
if (domain.domain.endsWith(".edu")) if (domain.topDomain.endsWith(".edu"))
return true; return true;
if (academicPattern.matcher(domain.domain).matches()) if (academicPattern.matcher(domain.topDomain).matches())
return true; return true;
return false; return false;

View File

@ -50,7 +50,7 @@ public abstract class AbstractDocumentProcessorPlugin {
public MetaTagsBuilder addUrl(EdgeUrl url) { public MetaTagsBuilder addUrl(EdgeUrl url) {
add("proto", url.proto); add("proto", url.proto);
add("site", url.domain); add("site", url.domain);
add("site", url.domain.domain); add("site", url.domain.topDomain);
add("tld", url.domain.getTld()); add("tld", url.domain.getTld());
if (url.path.startsWith("/~")) { if (url.path.startsWith("/~")) {

View File

@ -291,7 +291,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
for (var fd : lp.getForeignDomains()) { for (var fd : lp.getForeignDomains()) {
linkTerms.add("links:"+fd.toString().toLowerCase()); linkTerms.add("links:"+fd.toString().toLowerCase());
linkTerms.add("links:"+fd.getDomain().toLowerCase()); linkTerms.add("links:"+fd.getTopDomain().toLowerCase());
} }
return linkTerms; return linkTerms;

View File

@ -54,7 +54,7 @@ public class HtmlProcessorSpecializations {
return blogSpecialization; return blogSpecialization;
} }
if (url.domain.getDomain().equals("mariadb.com") if (url.domain.getTopDomain().equals("mariadb.com")
&& url.path.startsWith("/kb")) { && url.path.startsWith("/kb")) {
return mariadbKbSpecialization; return mariadbKbSpecialization;
} }

View File

@ -33,7 +33,7 @@ class RssCrawlerTest {
var href = element.attr("href"); var href = element.attr("href");
if (href != null && !href.isBlank()) { if (href != null && !href.isBlank()) {
lp.parseLink(base, href) lp.parseLink(base, href)
.filter(u -> Objects.equals(u.domain.domain, base.domain.domain)) .filter(u -> Objects.equals(u.domain.topDomain, base.domain.topDomain))
.ifPresent(urls::add); .ifPresent(urls::add);
} }
}); });
@ -42,7 +42,7 @@ class RssCrawlerTest {
var href = element.text(); var href = element.text();
if (href != null && !href.isBlank()) { if (href != null && !href.isBlank()) {
lp.parseLink(base, href) lp.parseLink(base, href)
.filter(u -> Objects.equals(u.domain.domain, base.domain.domain)) .filter(u -> Objects.equals(u.domain.topDomain, base.domain.topDomain))
.ifPresent(urls::add); .ifPresent(urls::add);
} }
}); });
@ -51,7 +51,7 @@ class RssCrawlerTest {
var href = element.text(); var href = element.text();
if (href != null && !href.isBlank()) { if (href != null && !href.isBlank()) {
lp.parseLink(base, href) lp.parseLink(base, href)
.filter(u -> Objects.equals(u.domain.domain, base.domain.domain)) .filter(u -> Objects.equals(u.domain.topDomain, base.domain.topDomain))
.ifPresent(urls::add); .ifPresent(urls::add);
} }
}); });

View File

@ -147,7 +147,7 @@ public class DomainLoaderService {
public void accept(EdgeDomain domain) throws SQLException { public void accept(EdgeDomain domain) throws SQLException {
statement.setString(1, domain.toString()); statement.setString(1, domain.toString());
statement.setString(2, domain.domain); statement.setString(2, domain.topDomain);
statement.setInt(3, nodeAffinity); statement.setInt(3, nodeAffinity);
statement.addBatch(); statement.addBatch();

View File

@ -81,7 +81,7 @@ public class ControlBlacklistService {
""")) { """)) {
stmt.setString(1, domain.toString()); stmt.setString(1, domain.toString());
stmt.addBatch(); stmt.addBatch();
stmt.setString(1, domain.domain); stmt.setString(1, domain.topDomain);
stmt.addBatch(); stmt.addBatch();
stmt.executeBatch(); stmt.executeBatch();
} }

View File

@ -157,7 +157,7 @@ public class ExportAtagsActor extends RecordActorPrototype {
return false; return false;
// This is an artifact of the link parser typically // This is an artifact of the link parser typically
if ("example.com".equals(url.domain.domain)) if ("example.com".equals(url.domain.topDomain))
return false; return false;
if (linkText.contains(url.domain.toString())) if (linkText.contains(url.domain.toString()))

View File

@ -61,7 +61,7 @@ public class DomainListRefreshService {
for (var domain : domainsAll) { for (var domain : domainsAll) {
var parsed = new EdgeDomain(domain); var parsed = new EdgeDomain(domain);
insert.setString(1, domain.toLowerCase()); insert.setString(1, domain.toLowerCase());
insert.setString(2, parsed.domain); insert.setString(2, parsed.topDomain);
insert.setInt(3, nodeId); insert.setInt(3, nodeId);
insert.addBatch(); insert.addBatch();
} }