(*) Rename EdgeDomain$domain into topDomain

This variable had a very confusing name and was dangerously easy to use in the wrong place, resulting in code that only works as expected half the time.

Ideally, this class needs an overhaul; the assumptions it makes about domain names aren't great.
This commit is contained in:
Viktor Lofgren 2023-12-17 14:00:07 +01:00
parent edf9aa2c23
commit bf44805e69
14 changed files with 53 additions and 55 deletions

View File

@ -15,7 +15,7 @@ public class EdgeDomain implements Serializable {
@Nonnull
public final String subDomain;
@Nonnull
public final String domain;
public final String topDomain;
@SneakyThrows
public EdgeDomain(String host) {
@ -27,13 +27,13 @@ public class EdgeDomain implements Serializable {
if (dot < 0 || looksLikeAnIp(host)) { // IPV6 >.>
subDomain = "";
domain = host;
topDomain = host;
}
else {
int dot2 = host.substring(0, dot).lastIndexOf('.');
if (dot2 < 0) {
subDomain = "";
domain = host;
topDomain = host;
}
else {
if (looksLikeGovTld(host))
@ -42,16 +42,16 @@ public class EdgeDomain implements Serializable {
if (dot3 >= 0) {
dot2 = dot3;
subDomain = host.substring(0, dot2);
domain = host.substring(dot2 + 1);
topDomain = host.substring(dot2 + 1);
}
else {
subDomain = "";
domain = host;
topDomain = host;
}
}
else {
subDomain = host.substring(0, dot2);
domain = host.substring(dot2 + 1);
topDomain = host.substring(dot2 + 1);
}
}
}
@ -97,28 +97,28 @@ public class EdgeDomain implements Serializable {
public String getAddress() {
if (!subDomain.isEmpty()) {
return subDomain + "." + domain;
return subDomain + "." + topDomain;
}
return domain;
return topDomain;
}
public String getDomainKey() {
int cutPoint = domain.indexOf('.');
int cutPoint = topDomain.indexOf('.');
if (cutPoint < 0) {
return domain;
return topDomain;
}
return domain.substring(0, cutPoint).toLowerCase();
return topDomain.substring(0, cutPoint).toLowerCase();
}
public String getLongDomainKey() {
StringBuilder ret = new StringBuilder();
int cutPoint = domain.indexOf('.');
int cutPoint = topDomain.indexOf('.');
if (cutPoint < 0) {
ret.append(domain);
ret.append(topDomain);
}
else {
ret.append(domain, 0, cutPoint);
ret.append(topDomain, 0, cutPoint);
}
if (!"".equals(subDomain) && !"www".equals(subDomain)) {
@ -133,30 +133,30 @@ public class EdgeDomain implements Serializable {
public boolean hasSameTopDomain(EdgeDomain other) {
if (other == null) return false;
return domain.equalsIgnoreCase(other.domain);
return topDomain.equalsIgnoreCase(other.topDomain);
}
public String getTld() {
int dot = -1;
int length = domain.length();
int length = topDomain.length();
if (ipPatternTest.test(domain)) {
if (ipPatternTest.test(topDomain)) {
return "IP";
}
if (govListTest.test(domain)) {
dot = domain.indexOf('.', Math.max(0, length - ".edu.uk".length()));
if (govListTest.test(topDomain)) {
dot = topDomain.indexOf('.', Math.max(0, length - ".edu.uk".length()));
}
else {
dot = domain.lastIndexOf('.');
dot = topDomain.lastIndexOf('.');
}
if (dot < 0 || dot == domain.length() - 1) {
if (dot < 0 || dot == topDomain.length() - 1) {
return "-";
}
else {
return domain.substring(dot + 1);
return topDomain.substring(dot + 1);
}
}
@ -166,8 +166,8 @@ public class EdgeDomain implements Serializable {
final String this$subDomain = this.getSubDomain();
final String other$subDomain = other.getSubDomain();
if (!Objects.equals(this$subDomain,other$subDomain)) return false;
final String this$domain = this.getDomain();
final String other$domain = other.getDomain();
final String this$domain = this.getTopDomain();
final String other$domain = other.getTopDomain();
if (!Objects.equals(this$domain,other$domain)) return false;
return true;
}
@ -177,7 +177,7 @@ public class EdgeDomain implements Serializable {
int result = 1;
final Object $subDomain = this.getSubDomain().toLowerCase();
result = result * PRIME + $subDomain.hashCode();
final Object $domain = this.getDomain().toLowerCase();
final Object $domain = this.getTopDomain().toLowerCase();
result = result * PRIME + $domain.hashCode();
return result;
}

View File

@ -1,6 +1,5 @@
package nu.marginalia.model;
import nu.marginalia.model.EdgeUrl;
import org.junit.jupiter.api.Test;
import java.net.URISyntaxException;
@ -22,7 +21,7 @@ class EdgeDomainTest {
var domain = new EdgeUrl("http://l7072i3.l7c.net");
assertEquals("http", domain.proto);
assertEquals("l7072i3", domain.domain.subDomain);
assertEquals("l7c.net", domain.domain.domain);
assertEquals("l7c.net", domain.domain.topDomain);
assertEquals("net", domain.domain.getTld());
}
@ -31,7 +30,7 @@ class EdgeDomainTest {
var domain = new EdgeUrl("http://endless.horse/");
assertEquals("http", domain.proto);
assertEquals("", domain.domain.subDomain);
assertEquals("endless.horse", domain.domain.domain);
assertEquals("endless.horse", domain.domain.topDomain);
assertEquals("horse", domain.domain.getTld());
}
@ -40,7 +39,7 @@ class EdgeDomainTest {
var domain = new EdgeUrl("http://uj.edu.pl");
assertEquals("http", domain.proto);
assertEquals("", domain.domain.subDomain);
assertEquals("uj.edu.pl", domain.domain.domain);
assertEquals("uj.edu.pl", domain.domain.topDomain);
assertEquals("edu.pl", domain.domain.getTld());
}
@ -50,7 +49,7 @@ class EdgeDomainTest {
var domain = new EdgeUrl("http://www.marginalia.nu");
assertEquals("http", domain.proto);
assertEquals("www", domain.domain.subDomain);
assertEquals("marginalia.nu", domain.domain.domain);
assertEquals("marginalia.nu", domain.domain.topDomain);
assertEquals("http://www.marginalia.nu/", domain.toString());
assertEquals("nu", domain.domain.getTld());
}
@ -58,7 +57,7 @@ class EdgeDomainTest {
@Test
public void testUkDomain2() throws URISyntaxException {
var domain = new EdgeUrl("http://marginalia.co.uk");
assertEquals("marginalia.co.uk", domain.domain.domain);
assertEquals("marginalia.co.uk", domain.domain.topDomain);
assertEquals("http", domain.proto);
assertEquals("", domain.domain.subDomain);
assertEquals("http://marginalia.co.uk/", domain.toString());
@ -68,7 +67,7 @@ class EdgeDomainTest {
@Test
public void testUkDomain3() throws URISyntaxException {
var domain = new EdgeUrl("http://withcandour.co.uk");
assertEquals("withcandour.co.uk", domain.domain.domain);
assertEquals("withcandour.co.uk", domain.domain.topDomain);
assertEquals("http", domain.proto);
assertEquals("", domain.domain.subDomain);
assertEquals("http://withcandour.co.uk/", domain.toString());
@ -80,7 +79,7 @@ class EdgeDomainTest {
var domain = new EdgeUrl("http://www.marginalia.co.uk");
assertEquals("http", domain.proto);
assertEquals("www", domain.domain.subDomain);
assertEquals("marginalia.co.uk", domain.domain.domain);
assertEquals("marginalia.co.uk", domain.domain.topDomain);
assertEquals("http://www.marginalia.co.uk/", domain.toString());
}
@ -88,7 +87,7 @@ class EdgeDomainTest {
public void testThreeLetterDomain() throws URISyntaxException {
var domain = new EdgeUrl("http://www.marginalia.abcf.de");
assertEquals("http", domain.proto);
assertEquals("abcf.de", domain.domain.domain);
assertEquals("abcf.de", domain.domain.topDomain);
assertEquals("www.marginalia", domain.domain.subDomain);
assertEquals("de", domain.domain.getTld());
}
@ -98,7 +97,7 @@ class EdgeDomainTest {
var domain = new EdgeUrl("http://marginalia.nu");
assertEquals("http", domain.proto);
assertEquals("", domain.domain.subDomain);
assertEquals("marginalia.nu", domain.domain.domain);
assertEquals("marginalia.nu", domain.domain.topDomain);
assertEquals("http://marginalia.nu/", domain.toString());
assertEquals("nu", domain.domain.getTld());
}
@ -108,7 +107,7 @@ class EdgeDomainTest {
var domain = new EdgeUrl("https://127.0.0.1:8080");
assertEquals("https", domain.proto);
assertEquals("", domain.domain.subDomain);
assertEquals("127.0.0.1", domain.domain.domain);
assertEquals("127.0.0.1", domain.domain.topDomain);
assertEquals("https://127.0.0.1:8080/", domain.toString());
assertEquals("IP", domain.domain.getTld());
}
@ -118,7 +117,7 @@ class EdgeDomainTest {
var domain = new EdgeUrl("https://192.168.1.32");
assertEquals("https", domain.proto);
assertEquals("", domain.domain.subDomain);
assertEquals("192.168.1.32", domain.domain.domain);
assertEquals("192.168.1.32", domain.domain.topDomain);
assertEquals("https://192.168.1.32/", domain.toString());
assertEquals("IP", domain.domain.getTld());
}

View File

@ -62,7 +62,7 @@ public class IpBlockList {
if (blocklistDisabled)
return true;
if (domain.domain.endsWith(".cn")) {
if (domain.topDomain.endsWith(".cn")) {
logger.debug("Blocking {} on .cn-end", domain);
return false;
}

View File

@ -67,7 +67,7 @@ public class UrlBlocklist {
public boolean isUrlBlocked(EdgeUrl url) {
try {
if (badDomains.contains(url.domain.domain)) {
if (badDomains.contains(url.domain.topDomain)) {
return true;
}
@ -76,7 +76,7 @@ public class UrlBlocklist {
return true;
}
if ("github.com".equals(url.domain.domain)) {
if ("github.com".equals(url.domain.topDomain)) {
return url.path.chars().filter(c -> c == '/').count() > 2;
}

View File

@ -10,7 +10,7 @@ public record BrowseResult (EdgeUrl url,
public String domainHash() {
var domain = url.domain;
if ("www".equals(domain.subDomain)) {
return domain.domain;
return domain.topDomain;
}
return domain.toString();
}
@ -19,7 +19,7 @@ public record BrowseResult (EdgeUrl url,
String ret;
var domain = url.domain;
if ("www".equals(domain.subDomain)) {
ret = domain.domain;
ret = domain.topDomain;
}
else {
ret = domain.toString();

View File

@ -14,7 +14,6 @@ import nu.marginalia.geoip.GeoIpDictionary;
import nu.marginalia.model.crawl.DomainIndexingState;
import nu.marginalia.converting.model.ProcessedDomain;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.converting.processor.logic.links.TopKeywords;
import nu.marginalia.converting.processor.logic.LshDocumentDeduplicator;
import nu.marginalia.model.crawl.HtmlFeature;
@ -161,10 +160,10 @@ public class DomainProcessor {
private static final Pattern academicPattern = Pattern.compile(".*\\.(ac|edu)\\.[a-z]{2}$");
private boolean isAcademicDomain(EdgeDomain domain) {
if (domain.domain.endsWith(".edu"))
if (domain.topDomain.endsWith(".edu"))
return true;
if (academicPattern.matcher(domain.domain).matches())
if (academicPattern.matcher(domain.topDomain).matches())
return true;
return false;

View File

@ -50,7 +50,7 @@ public abstract class AbstractDocumentProcessorPlugin {
public MetaTagsBuilder addUrl(EdgeUrl url) {
add("proto", url.proto);
add("site", url.domain);
add("site", url.domain.domain);
add("site", url.domain.topDomain);
add("tld", url.domain.getTld());
if (url.path.startsWith("/~")) {

View File

@ -291,7 +291,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
for (var fd : lp.getForeignDomains()) {
linkTerms.add("links:"+fd.toString().toLowerCase());
linkTerms.add("links:"+fd.getDomain().toLowerCase());
linkTerms.add("links:"+fd.getTopDomain().toLowerCase());
}
return linkTerms;

View File

@ -54,7 +54,7 @@ public class HtmlProcessorSpecializations {
return blogSpecialization;
}
if (url.domain.getDomain().equals("mariadb.com")
if (url.domain.getTopDomain().equals("mariadb.com")
&& url.path.startsWith("/kb")) {
return mariadbKbSpecialization;
}

View File

@ -33,7 +33,7 @@ class RssCrawlerTest {
var href = element.attr("href");
if (href != null && !href.isBlank()) {
lp.parseLink(base, href)
.filter(u -> Objects.equals(u.domain.domain, base.domain.domain))
.filter(u -> Objects.equals(u.domain.topDomain, base.domain.topDomain))
.ifPresent(urls::add);
}
});
@ -42,7 +42,7 @@ class RssCrawlerTest {
var href = element.text();
if (href != null && !href.isBlank()) {
lp.parseLink(base, href)
.filter(u -> Objects.equals(u.domain.domain, base.domain.domain))
.filter(u -> Objects.equals(u.domain.topDomain, base.domain.topDomain))
.ifPresent(urls::add);
}
});
@ -51,7 +51,7 @@ class RssCrawlerTest {
var href = element.text();
if (href != null && !href.isBlank()) {
lp.parseLink(base, href)
.filter(u -> Objects.equals(u.domain.domain, base.domain.domain))
.filter(u -> Objects.equals(u.domain.topDomain, base.domain.topDomain))
.ifPresent(urls::add);
}
});

View File

@ -147,7 +147,7 @@ public class DomainLoaderService {
public void accept(EdgeDomain domain) throws SQLException {
statement.setString(1, domain.toString());
statement.setString(2, domain.domain);
statement.setString(2, domain.topDomain);
statement.setInt(3, nodeAffinity);
statement.addBatch();

View File

@ -81,7 +81,7 @@ public class ControlBlacklistService {
""")) {
stmt.setString(1, domain.toString());
stmt.addBatch();
stmt.setString(1, domain.domain);
stmt.setString(1, domain.topDomain);
stmt.addBatch();
stmt.executeBatch();
}

View File

@ -157,7 +157,7 @@ public class ExportAtagsActor extends RecordActorPrototype {
return false;
// This is an artifact of the link parser typically
if ("example.com".equals(url.domain.domain))
if ("example.com".equals(url.domain.topDomain))
return false;
if (linkText.contains(url.domain.toString()))

View File

@ -61,7 +61,7 @@ public class DomainListRefreshService {
for (var domain : domainsAll) {
var parsed = new EdgeDomain(domain);
insert.setString(1, domain.toLowerCase());
insert.setString(2, parsed.domain);
insert.setString(2, parsed.topDomain);
insert.setInt(3, nodeId);
insert.addBatch();
}