(*) Rename EdgeDomain$domain to topDomain
This variable had a very confusing name and was dangerously easy to use in the wrong place, yielding code that only works as expected half the time. Ideally this class needs an overhaul; the assumptions it makes about domain names aren't great.
This commit is contained in:
parent
edf9aa2c23
commit
bf44805e69
@ -15,7 +15,7 @@ public class EdgeDomain implements Serializable {
|
||||
@Nonnull
|
||||
public final String subDomain;
|
||||
@Nonnull
|
||||
public final String domain;
|
||||
public final String topDomain;
|
||||
|
||||
@SneakyThrows
|
||||
public EdgeDomain(String host) {
|
||||
@ -27,13 +27,13 @@ public class EdgeDomain implements Serializable {
|
||||
|
||||
if (dot < 0 || looksLikeAnIp(host)) { // IPV6 >.>
|
||||
subDomain = "";
|
||||
domain = host;
|
||||
topDomain = host;
|
||||
}
|
||||
else {
|
||||
int dot2 = host.substring(0, dot).lastIndexOf('.');
|
||||
if (dot2 < 0) {
|
||||
subDomain = "";
|
||||
domain = host;
|
||||
topDomain = host;
|
||||
}
|
||||
else {
|
||||
if (looksLikeGovTld(host))
|
||||
@ -42,16 +42,16 @@ public class EdgeDomain implements Serializable {
|
||||
if (dot3 >= 0) {
|
||||
dot2 = dot3;
|
||||
subDomain = host.substring(0, dot2);
|
||||
domain = host.substring(dot2 + 1);
|
||||
topDomain = host.substring(dot2 + 1);
|
||||
}
|
||||
else {
|
||||
subDomain = "";
|
||||
domain = host;
|
||||
topDomain = host;
|
||||
}
|
||||
}
|
||||
else {
|
||||
subDomain = host.substring(0, dot2);
|
||||
domain = host.substring(dot2 + 1);
|
||||
topDomain = host.substring(dot2 + 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -97,28 +97,28 @@ public class EdgeDomain implements Serializable {
|
||||
|
||||
public String getAddress() {
|
||||
if (!subDomain.isEmpty()) {
|
||||
return subDomain + "." + domain;
|
||||
return subDomain + "." + topDomain;
|
||||
}
|
||||
return domain;
|
||||
return topDomain;
|
||||
}
|
||||
|
||||
public String getDomainKey() {
|
||||
int cutPoint = domain.indexOf('.');
|
||||
int cutPoint = topDomain.indexOf('.');
|
||||
if (cutPoint < 0) {
|
||||
return domain;
|
||||
return topDomain;
|
||||
}
|
||||
return domain.substring(0, cutPoint).toLowerCase();
|
||||
return topDomain.substring(0, cutPoint).toLowerCase();
|
||||
}
|
||||
|
||||
public String getLongDomainKey() {
|
||||
StringBuilder ret = new StringBuilder();
|
||||
|
||||
int cutPoint = domain.indexOf('.');
|
||||
int cutPoint = topDomain.indexOf('.');
|
||||
if (cutPoint < 0) {
|
||||
ret.append(domain);
|
||||
ret.append(topDomain);
|
||||
}
|
||||
else {
|
||||
ret.append(domain, 0, cutPoint);
|
||||
ret.append(topDomain, 0, cutPoint);
|
||||
}
|
||||
|
||||
if (!"".equals(subDomain) && !"www".equals(subDomain)) {
|
||||
@ -133,30 +133,30 @@ public class EdgeDomain implements Serializable {
|
||||
public boolean hasSameTopDomain(EdgeDomain other) {
|
||||
if (other == null) return false;
|
||||
|
||||
return domain.equalsIgnoreCase(other.domain);
|
||||
return topDomain.equalsIgnoreCase(other.topDomain);
|
||||
}
|
||||
|
||||
public String getTld() {
|
||||
int dot = -1;
|
||||
int length = domain.length();
|
||||
int length = topDomain.length();
|
||||
|
||||
if (ipPatternTest.test(domain)) {
|
||||
if (ipPatternTest.test(topDomain)) {
|
||||
return "IP";
|
||||
}
|
||||
|
||||
if (govListTest.test(domain)) {
|
||||
dot = domain.indexOf('.', Math.max(0, length - ".edu.uk".length()));
|
||||
if (govListTest.test(topDomain)) {
|
||||
dot = topDomain.indexOf('.', Math.max(0, length - ".edu.uk".length()));
|
||||
}
|
||||
else {
|
||||
dot = domain.lastIndexOf('.');
|
||||
dot = topDomain.lastIndexOf('.');
|
||||
}
|
||||
|
||||
|
||||
if (dot < 0 || dot == domain.length() - 1) {
|
||||
if (dot < 0 || dot == topDomain.length() - 1) {
|
||||
return "-";
|
||||
}
|
||||
else {
|
||||
return domain.substring(dot + 1);
|
||||
return topDomain.substring(dot + 1);
|
||||
}
|
||||
}
|
||||
|
||||
@ -166,8 +166,8 @@ public class EdgeDomain implements Serializable {
|
||||
final String this$subDomain = this.getSubDomain();
|
||||
final String other$subDomain = other.getSubDomain();
|
||||
if (!Objects.equals(this$subDomain,other$subDomain)) return false;
|
||||
final String this$domain = this.getDomain();
|
||||
final String other$domain = other.getDomain();
|
||||
final String this$domain = this.getTopDomain();
|
||||
final String other$domain = other.getTopDomain();
|
||||
if (!Objects.equals(this$domain,other$domain)) return false;
|
||||
return true;
|
||||
}
|
||||
@ -177,7 +177,7 @@ public class EdgeDomain implements Serializable {
|
||||
int result = 1;
|
||||
final Object $subDomain = this.getSubDomain().toLowerCase();
|
||||
result = result * PRIME + $subDomain.hashCode();
|
||||
final Object $domain = this.getDomain().toLowerCase();
|
||||
final Object $domain = this.getTopDomain().toLowerCase();
|
||||
result = result * PRIME + $domain.hashCode();
|
||||
return result;
|
||||
}
|
||||
|
@ -1,6 +1,5 @@
|
||||
package nu.marginalia.model;
|
||||
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import java.net.URISyntaxException;
|
||||
@ -22,7 +21,7 @@ class EdgeDomainTest {
|
||||
var domain = new EdgeUrl("http://l7072i3.l7c.net");
|
||||
assertEquals("http", domain.proto);
|
||||
assertEquals("l7072i3", domain.domain.subDomain);
|
||||
assertEquals("l7c.net", domain.domain.domain);
|
||||
assertEquals("l7c.net", domain.domain.topDomain);
|
||||
assertEquals("net", domain.domain.getTld());
|
||||
}
|
||||
|
||||
@ -31,7 +30,7 @@ class EdgeDomainTest {
|
||||
var domain = new EdgeUrl("http://endless.horse/");
|
||||
assertEquals("http", domain.proto);
|
||||
assertEquals("", domain.domain.subDomain);
|
||||
assertEquals("endless.horse", domain.domain.domain);
|
||||
assertEquals("endless.horse", domain.domain.topDomain);
|
||||
assertEquals("horse", domain.domain.getTld());
|
||||
}
|
||||
|
||||
@ -40,7 +39,7 @@ class EdgeDomainTest {
|
||||
var domain = new EdgeUrl("http://uj.edu.pl");
|
||||
assertEquals("http", domain.proto);
|
||||
assertEquals("", domain.domain.subDomain);
|
||||
assertEquals("uj.edu.pl", domain.domain.domain);
|
||||
assertEquals("uj.edu.pl", domain.domain.topDomain);
|
||||
assertEquals("edu.pl", domain.domain.getTld());
|
||||
}
|
||||
|
||||
@ -50,7 +49,7 @@ class EdgeDomainTest {
|
||||
var domain = new EdgeUrl("http://www.marginalia.nu");
|
||||
assertEquals("http", domain.proto);
|
||||
assertEquals("www", domain.domain.subDomain);
|
||||
assertEquals("marginalia.nu", domain.domain.domain);
|
||||
assertEquals("marginalia.nu", domain.domain.topDomain);
|
||||
assertEquals("http://www.marginalia.nu/", domain.toString());
|
||||
assertEquals("nu", domain.domain.getTld());
|
||||
}
|
||||
@ -58,7 +57,7 @@ class EdgeDomainTest {
|
||||
@Test
|
||||
public void testUkDomain2() throws URISyntaxException {
|
||||
var domain = new EdgeUrl("http://marginalia.co.uk");
|
||||
assertEquals("marginalia.co.uk", domain.domain.domain);
|
||||
assertEquals("marginalia.co.uk", domain.domain.topDomain);
|
||||
assertEquals("http", domain.proto);
|
||||
assertEquals("", domain.domain.subDomain);
|
||||
assertEquals("http://marginalia.co.uk/", domain.toString());
|
||||
@ -68,7 +67,7 @@ class EdgeDomainTest {
|
||||
@Test
|
||||
public void testUkDomain3() throws URISyntaxException {
|
||||
var domain = new EdgeUrl("http://withcandour.co.uk");
|
||||
assertEquals("withcandour.co.uk", domain.domain.domain);
|
||||
assertEquals("withcandour.co.uk", domain.domain.topDomain);
|
||||
assertEquals("http", domain.proto);
|
||||
assertEquals("", domain.domain.subDomain);
|
||||
assertEquals("http://withcandour.co.uk/", domain.toString());
|
||||
@ -80,7 +79,7 @@ class EdgeDomainTest {
|
||||
var domain = new EdgeUrl("http://www.marginalia.co.uk");
|
||||
assertEquals("http", domain.proto);
|
||||
assertEquals("www", domain.domain.subDomain);
|
||||
assertEquals("marginalia.co.uk", domain.domain.domain);
|
||||
assertEquals("marginalia.co.uk", domain.domain.topDomain);
|
||||
assertEquals("http://www.marginalia.co.uk/", domain.toString());
|
||||
}
|
||||
|
||||
@ -88,7 +87,7 @@ class EdgeDomainTest {
|
||||
public void testThreeLetterDomain() throws URISyntaxException {
|
||||
var domain = new EdgeUrl("http://www.marginalia.abcf.de");
|
||||
assertEquals("http", domain.proto);
|
||||
assertEquals("abcf.de", domain.domain.domain);
|
||||
assertEquals("abcf.de", domain.domain.topDomain);
|
||||
assertEquals("www.marginalia", domain.domain.subDomain);
|
||||
assertEquals("de", domain.domain.getTld());
|
||||
}
|
||||
@ -98,7 +97,7 @@ class EdgeDomainTest {
|
||||
var domain = new EdgeUrl("http://marginalia.nu");
|
||||
assertEquals("http", domain.proto);
|
||||
assertEquals("", domain.domain.subDomain);
|
||||
assertEquals("marginalia.nu", domain.domain.domain);
|
||||
assertEquals("marginalia.nu", domain.domain.topDomain);
|
||||
assertEquals("http://marginalia.nu/", domain.toString());
|
||||
assertEquals("nu", domain.domain.getTld());
|
||||
}
|
||||
@ -108,7 +107,7 @@ class EdgeDomainTest {
|
||||
var domain = new EdgeUrl("https://127.0.0.1:8080");
|
||||
assertEquals("https", domain.proto);
|
||||
assertEquals("", domain.domain.subDomain);
|
||||
assertEquals("127.0.0.1", domain.domain.domain);
|
||||
assertEquals("127.0.0.1", domain.domain.topDomain);
|
||||
assertEquals("https://127.0.0.1:8080/", domain.toString());
|
||||
assertEquals("IP", domain.domain.getTld());
|
||||
}
|
||||
@ -118,7 +117,7 @@ class EdgeDomainTest {
|
||||
var domain = new EdgeUrl("https://192.168.1.32");
|
||||
assertEquals("https", domain.proto);
|
||||
assertEquals("", domain.domain.subDomain);
|
||||
assertEquals("192.168.1.32", domain.domain.domain);
|
||||
assertEquals("192.168.1.32", domain.domain.topDomain);
|
||||
assertEquals("https://192.168.1.32/", domain.toString());
|
||||
assertEquals("IP", domain.domain.getTld());
|
||||
}
|
||||
|
@ -62,7 +62,7 @@ public class IpBlockList {
|
||||
if (blocklistDisabled)
|
||||
return true;
|
||||
|
||||
if (domain.domain.endsWith(".cn")) {
|
||||
if (domain.topDomain.endsWith(".cn")) {
|
||||
logger.debug("Blocking {} on .cn-end", domain);
|
||||
return false;
|
||||
}
|
||||
|
@ -67,7 +67,7 @@ public class UrlBlocklist {
|
||||
|
||||
public boolean isUrlBlocked(EdgeUrl url) {
|
||||
try {
|
||||
if (badDomains.contains(url.domain.domain)) {
|
||||
if (badDomains.contains(url.domain.topDomain)) {
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -76,7 +76,7 @@ public class UrlBlocklist {
|
||||
return true;
|
||||
}
|
||||
|
||||
if ("github.com".equals(url.domain.domain)) {
|
||||
if ("github.com".equals(url.domain.topDomain)) {
|
||||
return url.path.chars().filter(c -> c == '/').count() > 2;
|
||||
}
|
||||
|
||||
|
@ -10,7 +10,7 @@ public record BrowseResult (EdgeUrl url,
|
||||
public String domainHash() {
|
||||
var domain = url.domain;
|
||||
if ("www".equals(domain.subDomain)) {
|
||||
return domain.domain;
|
||||
return domain.topDomain;
|
||||
}
|
||||
return domain.toString();
|
||||
}
|
||||
@ -19,7 +19,7 @@ public record BrowseResult (EdgeUrl url,
|
||||
String ret;
|
||||
var domain = url.domain;
|
||||
if ("www".equals(domain.subDomain)) {
|
||||
ret = domain.domain;
|
||||
ret = domain.topDomain;
|
||||
}
|
||||
else {
|
||||
ret = domain.toString();
|
||||
|
@ -14,7 +14,6 @@ import nu.marginalia.geoip.GeoIpDictionary;
|
||||
import nu.marginalia.model.crawl.DomainIndexingState;
|
||||
import nu.marginalia.converting.model.ProcessedDomain;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import nu.marginalia.converting.processor.logic.links.TopKeywords;
|
||||
import nu.marginalia.converting.processor.logic.LshDocumentDeduplicator;
|
||||
import nu.marginalia.model.crawl.HtmlFeature;
|
||||
@ -161,10 +160,10 @@ public class DomainProcessor {
|
||||
private static final Pattern academicPattern = Pattern.compile(".*\\.(ac|edu)\\.[a-z]{2}$");
|
||||
private boolean isAcademicDomain(EdgeDomain domain) {
|
||||
|
||||
if (domain.domain.endsWith(".edu"))
|
||||
if (domain.topDomain.endsWith(".edu"))
|
||||
return true;
|
||||
|
||||
if (academicPattern.matcher(domain.domain).matches())
|
||||
if (academicPattern.matcher(domain.topDomain).matches())
|
||||
return true;
|
||||
|
||||
return false;
|
||||
|
@ -50,7 +50,7 @@ public abstract class AbstractDocumentProcessorPlugin {
|
||||
public MetaTagsBuilder addUrl(EdgeUrl url) {
|
||||
add("proto", url.proto);
|
||||
add("site", url.domain);
|
||||
add("site", url.domain.domain);
|
||||
add("site", url.domain.topDomain);
|
||||
add("tld", url.domain.getTld());
|
||||
|
||||
if (url.path.startsWith("/~")) {
|
||||
|
@ -291,7 +291,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
|
||||
|
||||
for (var fd : lp.getForeignDomains()) {
|
||||
linkTerms.add("links:"+fd.toString().toLowerCase());
|
||||
linkTerms.add("links:"+fd.getDomain().toLowerCase());
|
||||
linkTerms.add("links:"+fd.getTopDomain().toLowerCase());
|
||||
}
|
||||
|
||||
return linkTerms;
|
||||
|
@ -54,7 +54,7 @@ public class HtmlProcessorSpecializations {
|
||||
return blogSpecialization;
|
||||
}
|
||||
|
||||
if (url.domain.getDomain().equals("mariadb.com")
|
||||
if (url.domain.getTopDomain().equals("mariadb.com")
|
||||
&& url.path.startsWith("/kb")) {
|
||||
return mariadbKbSpecialization;
|
||||
}
|
||||
|
@ -33,7 +33,7 @@ class RssCrawlerTest {
|
||||
var href = element.attr("href");
|
||||
if (href != null && !href.isBlank()) {
|
||||
lp.parseLink(base, href)
|
||||
.filter(u -> Objects.equals(u.domain.domain, base.domain.domain))
|
||||
.filter(u -> Objects.equals(u.domain.topDomain, base.domain.topDomain))
|
||||
.ifPresent(urls::add);
|
||||
}
|
||||
});
|
||||
@ -42,7 +42,7 @@ class RssCrawlerTest {
|
||||
var href = element.text();
|
||||
if (href != null && !href.isBlank()) {
|
||||
lp.parseLink(base, href)
|
||||
.filter(u -> Objects.equals(u.domain.domain, base.domain.domain))
|
||||
.filter(u -> Objects.equals(u.domain.topDomain, base.domain.topDomain))
|
||||
.ifPresent(urls::add);
|
||||
}
|
||||
});
|
||||
@ -51,7 +51,7 @@ class RssCrawlerTest {
|
||||
var href = element.text();
|
||||
if (href != null && !href.isBlank()) {
|
||||
lp.parseLink(base, href)
|
||||
.filter(u -> Objects.equals(u.domain.domain, base.domain.domain))
|
||||
.filter(u -> Objects.equals(u.domain.topDomain, base.domain.topDomain))
|
||||
.ifPresent(urls::add);
|
||||
}
|
||||
});
|
||||
|
@ -147,7 +147,7 @@ public class DomainLoaderService {
|
||||
|
||||
public void accept(EdgeDomain domain) throws SQLException {
|
||||
statement.setString(1, domain.toString());
|
||||
statement.setString(2, domain.domain);
|
||||
statement.setString(2, domain.topDomain);
|
||||
statement.setInt(3, nodeAffinity);
|
||||
statement.addBatch();
|
||||
|
||||
|
@ -81,7 +81,7 @@ public class ControlBlacklistService {
|
||||
""")) {
|
||||
stmt.setString(1, domain.toString());
|
||||
stmt.addBatch();
|
||||
stmt.setString(1, domain.domain);
|
||||
stmt.setString(1, domain.topDomain);
|
||||
stmt.addBatch();
|
||||
stmt.executeBatch();
|
||||
}
|
||||
|
@ -157,7 +157,7 @@ public class ExportAtagsActor extends RecordActorPrototype {
|
||||
return false;
|
||||
|
||||
// This is an artifact of the link parser typically
|
||||
if ("example.com".equals(url.domain.domain))
|
||||
if ("example.com".equals(url.domain.topDomain))
|
||||
return false;
|
||||
|
||||
if (linkText.contains(url.domain.toString()))
|
||||
|
@ -61,7 +61,7 @@ public class DomainListRefreshService {
|
||||
for (var domain : domainsAll) {
|
||||
var parsed = new EdgeDomain(domain);
|
||||
insert.setString(1, domain.toLowerCase());
|
||||
insert.setString(2, parsed.domain);
|
||||
insert.setString(2, parsed.topDomain);
|
||||
insert.setInt(3, nodeId);
|
||||
insert.addBatch();
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user