Merge branch 'master' into asn-info
This commit is contained in:
commit
bde68ba48b
@ -15,7 +15,7 @@ public class EdgeDomain implements Serializable {
|
||||
@Nonnull
|
||||
public final String subDomain;
|
||||
@Nonnull
|
||||
public final String domain;
|
||||
public final String topDomain;
|
||||
|
||||
@SneakyThrows
|
||||
public EdgeDomain(String host) {
|
||||
@ -27,13 +27,13 @@ public class EdgeDomain implements Serializable {
|
||||
|
||||
if (dot < 0 || looksLikeAnIp(host)) { // IPV6 >.>
|
||||
subDomain = "";
|
||||
domain = host;
|
||||
topDomain = host;
|
||||
}
|
||||
else {
|
||||
int dot2 = host.substring(0, dot).lastIndexOf('.');
|
||||
if (dot2 < 0) {
|
||||
subDomain = "";
|
||||
domain = host;
|
||||
topDomain = host;
|
||||
}
|
||||
else {
|
||||
if (looksLikeGovTld(host))
|
||||
@ -42,16 +42,16 @@ public class EdgeDomain implements Serializable {
|
||||
if (dot3 >= 0) {
|
||||
dot2 = dot3;
|
||||
subDomain = host.substring(0, dot2);
|
||||
domain = host.substring(dot2 + 1);
|
||||
topDomain = host.substring(dot2 + 1);
|
||||
}
|
||||
else {
|
||||
subDomain = "";
|
||||
domain = host;
|
||||
topDomain = host;
|
||||
}
|
||||
}
|
||||
else {
|
||||
subDomain = host.substring(0, dot2);
|
||||
domain = host.substring(dot2 + 1);
|
||||
topDomain = host.substring(dot2 + 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -97,28 +97,28 @@ public class EdgeDomain implements Serializable {
|
||||
|
||||
public String getAddress() {
|
||||
if (!subDomain.isEmpty()) {
|
||||
return subDomain + "." + domain;
|
||||
return subDomain + "." + topDomain;
|
||||
}
|
||||
return domain;
|
||||
return topDomain;
|
||||
}
|
||||
|
||||
public String getDomainKey() {
|
||||
int cutPoint = domain.indexOf('.');
|
||||
int cutPoint = topDomain.indexOf('.');
|
||||
if (cutPoint < 0) {
|
||||
return domain;
|
||||
return topDomain;
|
||||
}
|
||||
return domain.substring(0, cutPoint).toLowerCase();
|
||||
return topDomain.substring(0, cutPoint).toLowerCase();
|
||||
}
|
||||
|
||||
public String getLongDomainKey() {
|
||||
StringBuilder ret = new StringBuilder();
|
||||
|
||||
int cutPoint = domain.indexOf('.');
|
||||
int cutPoint = topDomain.indexOf('.');
|
||||
if (cutPoint < 0) {
|
||||
ret.append(domain);
|
||||
ret.append(topDomain);
|
||||
}
|
||||
else {
|
||||
ret.append(domain, 0, cutPoint);
|
||||
ret.append(topDomain, 0, cutPoint);
|
||||
}
|
||||
|
||||
if (!"".equals(subDomain) && !"www".equals(subDomain)) {
|
||||
@ -133,30 +133,30 @@ public class EdgeDomain implements Serializable {
|
||||
public boolean hasSameTopDomain(EdgeDomain other) {
|
||||
if (other == null) return false;
|
||||
|
||||
return domain.equalsIgnoreCase(other.domain);
|
||||
return topDomain.equalsIgnoreCase(other.topDomain);
|
||||
}
|
||||
|
||||
public String getTld() {
|
||||
int dot = -1;
|
||||
int length = domain.length();
|
||||
int length = topDomain.length();
|
||||
|
||||
if (ipPatternTest.test(domain)) {
|
||||
if (ipPatternTest.test(topDomain)) {
|
||||
return "IP";
|
||||
}
|
||||
|
||||
if (govListTest.test(domain)) {
|
||||
dot = domain.indexOf('.', Math.max(0, length - ".edu.uk".length()));
|
||||
if (govListTest.test(topDomain)) {
|
||||
dot = topDomain.indexOf('.', Math.max(0, length - ".edu.uk".length()));
|
||||
}
|
||||
else {
|
||||
dot = domain.lastIndexOf('.');
|
||||
dot = topDomain.lastIndexOf('.');
|
||||
}
|
||||
|
||||
|
||||
if (dot < 0 || dot == domain.length() - 1) {
|
||||
if (dot < 0 || dot == topDomain.length() - 1) {
|
||||
return "-";
|
||||
}
|
||||
else {
|
||||
return domain.substring(dot + 1);
|
||||
return topDomain.substring(dot + 1);
|
||||
}
|
||||
}
|
||||
|
||||
@ -166,8 +166,8 @@ public class EdgeDomain implements Serializable {
|
||||
final String this$subDomain = this.getSubDomain();
|
||||
final String other$subDomain = other.getSubDomain();
|
||||
if (!Objects.equals(this$subDomain,other$subDomain)) return false;
|
||||
final String this$domain = this.getDomain();
|
||||
final String other$domain = other.getDomain();
|
||||
final String this$domain = this.getTopDomain();
|
||||
final String other$domain = other.getTopDomain();
|
||||
if (!Objects.equals(this$domain,other$domain)) return false;
|
||||
return true;
|
||||
}
|
||||
@ -177,7 +177,7 @@ public class EdgeDomain implements Serializable {
|
||||
int result = 1;
|
||||
final Object $subDomain = this.getSubDomain().toLowerCase();
|
||||
result = result * PRIME + $subDomain.hashCode();
|
||||
final Object $domain = this.getDomain().toLowerCase();
|
||||
final Object $domain = this.getTopDomain().toLowerCase();
|
||||
result = result * PRIME + $domain.hashCode();
|
||||
return result;
|
||||
}
|
||||
|
@ -8,7 +8,7 @@ public enum DocumentFlags {
|
||||
GeneratorDocs,
|
||||
GeneratorForum,
|
||||
GeneratorWiki,
|
||||
Unused6,
|
||||
Sideloaded,
|
||||
Unused7,
|
||||
Unused8,
|
||||
;
|
||||
|
@ -1,6 +1,5 @@
|
||||
package nu.marginalia.model;
|
||||
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import java.net.URISyntaxException;
|
||||
@ -22,7 +21,7 @@ class EdgeDomainTest {
|
||||
var domain = new EdgeUrl("http://l7072i3.l7c.net");
|
||||
assertEquals("http", domain.proto);
|
||||
assertEquals("l7072i3", domain.domain.subDomain);
|
||||
assertEquals("l7c.net", domain.domain.domain);
|
||||
assertEquals("l7c.net", domain.domain.topDomain);
|
||||
assertEquals("net", domain.domain.getTld());
|
||||
}
|
||||
|
||||
@ -31,7 +30,7 @@ class EdgeDomainTest {
|
||||
var domain = new EdgeUrl("http://endless.horse/");
|
||||
assertEquals("http", domain.proto);
|
||||
assertEquals("", domain.domain.subDomain);
|
||||
assertEquals("endless.horse", domain.domain.domain);
|
||||
assertEquals("endless.horse", domain.domain.topDomain);
|
||||
assertEquals("horse", domain.domain.getTld());
|
||||
}
|
||||
|
||||
@ -40,7 +39,7 @@ class EdgeDomainTest {
|
||||
var domain = new EdgeUrl("http://uj.edu.pl");
|
||||
assertEquals("http", domain.proto);
|
||||
assertEquals("", domain.domain.subDomain);
|
||||
assertEquals("uj.edu.pl", domain.domain.domain);
|
||||
assertEquals("uj.edu.pl", domain.domain.topDomain);
|
||||
assertEquals("edu.pl", domain.domain.getTld());
|
||||
}
|
||||
|
||||
@ -50,7 +49,7 @@ class EdgeDomainTest {
|
||||
var domain = new EdgeUrl("http://www.marginalia.nu");
|
||||
assertEquals("http", domain.proto);
|
||||
assertEquals("www", domain.domain.subDomain);
|
||||
assertEquals("marginalia.nu", domain.domain.domain);
|
||||
assertEquals("marginalia.nu", domain.domain.topDomain);
|
||||
assertEquals("http://www.marginalia.nu/", domain.toString());
|
||||
assertEquals("nu", domain.domain.getTld());
|
||||
}
|
||||
@ -58,7 +57,7 @@ class EdgeDomainTest {
|
||||
@Test
|
||||
public void testUkDomain2() throws URISyntaxException {
|
||||
var domain = new EdgeUrl("http://marginalia.co.uk");
|
||||
assertEquals("marginalia.co.uk", domain.domain.domain);
|
||||
assertEquals("marginalia.co.uk", domain.domain.topDomain);
|
||||
assertEquals("http", domain.proto);
|
||||
assertEquals("", domain.domain.subDomain);
|
||||
assertEquals("http://marginalia.co.uk/", domain.toString());
|
||||
@ -68,7 +67,7 @@ class EdgeDomainTest {
|
||||
@Test
|
||||
public void testUkDomain3() throws URISyntaxException {
|
||||
var domain = new EdgeUrl("http://withcandour.co.uk");
|
||||
assertEquals("withcandour.co.uk", domain.domain.domain);
|
||||
assertEquals("withcandour.co.uk", domain.domain.topDomain);
|
||||
assertEquals("http", domain.proto);
|
||||
assertEquals("", domain.domain.subDomain);
|
||||
assertEquals("http://withcandour.co.uk/", domain.toString());
|
||||
@ -80,7 +79,7 @@ class EdgeDomainTest {
|
||||
var domain = new EdgeUrl("http://www.marginalia.co.uk");
|
||||
assertEquals("http", domain.proto);
|
||||
assertEquals("www", domain.domain.subDomain);
|
||||
assertEquals("marginalia.co.uk", domain.domain.domain);
|
||||
assertEquals("marginalia.co.uk", domain.domain.topDomain);
|
||||
assertEquals("http://www.marginalia.co.uk/", domain.toString());
|
||||
}
|
||||
|
||||
@ -88,7 +87,7 @@ class EdgeDomainTest {
|
||||
public void testThreeLetterDomain() throws URISyntaxException {
|
||||
var domain = new EdgeUrl("http://www.marginalia.abcf.de");
|
||||
assertEquals("http", domain.proto);
|
||||
assertEquals("abcf.de", domain.domain.domain);
|
||||
assertEquals("abcf.de", domain.domain.topDomain);
|
||||
assertEquals("www.marginalia", domain.domain.subDomain);
|
||||
assertEquals("de", domain.domain.getTld());
|
||||
}
|
||||
@ -98,7 +97,7 @@ class EdgeDomainTest {
|
||||
var domain = new EdgeUrl("http://marginalia.nu");
|
||||
assertEquals("http", domain.proto);
|
||||
assertEquals("", domain.domain.subDomain);
|
||||
assertEquals("marginalia.nu", domain.domain.domain);
|
||||
assertEquals("marginalia.nu", domain.domain.topDomain);
|
||||
assertEquals("http://marginalia.nu/", domain.toString());
|
||||
assertEquals("nu", domain.domain.getTld());
|
||||
}
|
||||
@ -108,7 +107,7 @@ class EdgeDomainTest {
|
||||
var domain = new EdgeUrl("https://127.0.0.1:8080");
|
||||
assertEquals("https", domain.proto);
|
||||
assertEquals("", domain.domain.subDomain);
|
||||
assertEquals("127.0.0.1", domain.domain.domain);
|
||||
assertEquals("127.0.0.1", domain.domain.topDomain);
|
||||
assertEquals("https://127.0.0.1:8080/", domain.toString());
|
||||
assertEquals("IP", domain.domain.getTld());
|
||||
}
|
||||
@ -118,7 +117,7 @@ class EdgeDomainTest {
|
||||
var domain = new EdgeUrl("https://192.168.1.32");
|
||||
assertEquals("https", domain.proto);
|
||||
assertEquals("", domain.domain.subDomain);
|
||||
assertEquals("192.168.1.32", domain.domain.domain);
|
||||
assertEquals("192.168.1.32", domain.domain.topDomain);
|
||||
assertEquals("https://192.168.1.32/", domain.toString());
|
||||
assertEquals("IP", domain.domain.getTld());
|
||||
}
|
||||
|
@ -62,7 +62,7 @@ public class IpBlockList {
|
||||
if (blocklistDisabled)
|
||||
return true;
|
||||
|
||||
if (domain.domain.endsWith(".cn")) {
|
||||
if (domain.topDomain.endsWith(".cn")) {
|
||||
logger.debug("Blocking {} on .cn-end", domain);
|
||||
return false;
|
||||
}
|
||||
|
@ -67,7 +67,7 @@ public class UrlBlocklist {
|
||||
|
||||
public boolean isUrlBlocked(EdgeUrl url) {
|
||||
try {
|
||||
if (badDomains.contains(url.domain.domain)) {
|
||||
if (badDomains.contains(url.domain.topDomain)) {
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -76,7 +76,7 @@ public class UrlBlocklist {
|
||||
return true;
|
||||
}
|
||||
|
||||
if ("github.com".equals(url.domain.domain)) {
|
||||
if ("github.com".equals(url.domain.topDomain)) {
|
||||
return url.path.chars().filter(c -> c == '/').count() > 2;
|
||||
}
|
||||
|
||||
|
@ -10,7 +10,7 @@ public record BrowseResult (EdgeUrl url,
|
||||
public String domainHash() {
|
||||
var domain = url.domain;
|
||||
if ("www".equals(domain.subDomain)) {
|
||||
return domain.domain;
|
||||
return domain.topDomain;
|
||||
}
|
||||
return domain.toString();
|
||||
}
|
||||
@ -19,7 +19,7 @@ public record BrowseResult (EdgeUrl url,
|
||||
String ret;
|
||||
var domain = url.domain;
|
||||
if ("www".equals(domain.subDomain)) {
|
||||
ret = domain.domain;
|
||||
ret = domain.topDomain;
|
||||
}
|
||||
else {
|
||||
ret = domain.toString();
|
||||
|
@ -52,7 +52,7 @@ public class CrawledDocument implements SerializableCrawlData {
|
||||
return EdgeUrl
|
||||
.parse(url)
|
||||
.map(EdgeUrl::getDomain)
|
||||
.map(d -> d.domain)
|
||||
.map(Object::toString)
|
||||
.orElse(null);
|
||||
}
|
||||
|
||||
|
@ -14,7 +14,6 @@ import nu.marginalia.geoip.GeoIpDictionary;
|
||||
import nu.marginalia.model.crawl.DomainIndexingState;
|
||||
import nu.marginalia.converting.model.ProcessedDomain;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import nu.marginalia.converting.processor.logic.links.TopKeywords;
|
||||
import nu.marginalia.converting.processor.logic.LshDocumentDeduplicator;
|
||||
import nu.marginalia.model.crawl.HtmlFeature;
|
||||
@ -161,10 +160,10 @@ public class DomainProcessor {
|
||||
private static final Pattern academicPattern = Pattern.compile(".*\\.(ac|edu)\\.[a-z]{2}$");
|
||||
private boolean isAcademicDomain(EdgeDomain domain) {
|
||||
|
||||
if (domain.domain.endsWith(".edu"))
|
||||
if (domain.topDomain.endsWith(".edu"))
|
||||
return true;
|
||||
|
||||
if (academicPattern.matcher(domain.domain).matches())
|
||||
if (academicPattern.matcher(domain.topDomain).matches())
|
||||
return true;
|
||||
|
||||
return false;
|
||||
|
@ -50,7 +50,7 @@ public abstract class AbstractDocumentProcessorPlugin {
|
||||
public MetaTagsBuilder addUrl(EdgeUrl url) {
|
||||
add("proto", url.proto);
|
||||
add("site", url.domain);
|
||||
add("site", url.domain.domain);
|
||||
add("site", url.domain.topDomain);
|
||||
add("tld", url.domain.getTld());
|
||||
|
||||
if (url.path.startsWith("/~")) {
|
||||
|
@ -291,7 +291,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
|
||||
|
||||
for (var fd : lp.getForeignDomains()) {
|
||||
linkTerms.add("links:"+fd.toString().toLowerCase());
|
||||
linkTerms.add("links:"+fd.getDomain().toLowerCase());
|
||||
linkTerms.add("links:"+fd.getTopDomain().toLowerCase());
|
||||
}
|
||||
|
||||
return linkTerms;
|
||||
|
@ -54,7 +54,7 @@ public class HtmlProcessorSpecializations {
|
||||
return blogSpecialization;
|
||||
}
|
||||
|
||||
if (url.domain.getDomain().equals("mariadb.com")
|
||||
if (url.domain.getTopDomain().equals("mariadb.com")
|
||||
&& url.path.startsWith("/kb")) {
|
||||
return mariadbKbSpecialization;
|
||||
}
|
||||
|
@ -63,6 +63,11 @@ public class SideloaderProcessing {
|
||||
for (String keyword : extraKeywords)
|
||||
ret.words.add(keyword, WordFlags.Subjects.asBit());
|
||||
|
||||
if (type == GeneratorType.WIKI)
|
||||
ret.words.add("generator:wiki", WordFlags.Subjects.asBit());
|
||||
else if (type == GeneratorType.DOCS)
|
||||
ret.words.add("generator:docs", WordFlags.Subjects.asBit());
|
||||
|
||||
ret.details = details.details();
|
||||
|
||||
// Add a few things that we know about the document
|
||||
@ -80,8 +85,8 @@ public class SideloaderProcessing {
|
||||
PubDate.toYearByte(ret.details.pubYear),
|
||||
(int) -ret.details.quality,
|
||||
switch (type) {
|
||||
case WIKI -> EnumSet.of(DocumentFlags.GeneratorWiki);
|
||||
case DOCS -> EnumSet.of(DocumentFlags.GeneratorDocs);
|
||||
case WIKI -> EnumSet.of(DocumentFlags.GeneratorWiki, DocumentFlags.Sideloaded);
|
||||
case DOCS -> EnumSet.of(DocumentFlags.GeneratorDocs, DocumentFlags.Sideloaded);
|
||||
default -> EnumSet.noneOf(DocumentFlags.class);
|
||||
});
|
||||
|
||||
|
@ -110,13 +110,18 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoC
|
||||
String fullUrl = baseUrl.toString() + url;
|
||||
|
||||
StringBuilder fullHtml = new StringBuilder();
|
||||
fullHtml.append("<!DOCTYPE html><html><head><title>").append(title).append("</title></head><body>");
|
||||
fullHtml
|
||||
.append("<!DOCTYPE html><html><head><title>")
|
||||
.append(title)
|
||||
.append("</title></head><body>")
|
||||
.append("<div class=\"mw-content-text\">");
|
||||
|
||||
for (String part : parts) {
|
||||
fullHtml.append("<p>");
|
||||
fullHtml.append(part);
|
||||
fullHtml.append("</p>");
|
||||
}
|
||||
fullHtml.append("</body></html>");
|
||||
fullHtml.append("</div></body></html>");
|
||||
|
||||
var doc = sideloaderProcessing
|
||||
.processDocument(fullUrl,
|
||||
|
@ -115,8 +115,9 @@ public class StackexchangeSideloader implements SideloadSource {
|
||||
ret.words = keywordExtractor.extractKeywords(dld, url);
|
||||
ret.words.addAllSyntheticTerms(List.of(
|
||||
"site:" + domainName,
|
||||
"site:" + url.domain.domain,
|
||||
url.domain.domain
|
||||
"site:" + url.domain.topDomain,
|
||||
url.domain.topDomain,
|
||||
domainName
|
||||
));
|
||||
|
||||
if (!post.tags().isBlank()) {
|
||||
|
@ -33,7 +33,7 @@ class RssCrawlerTest {
|
||||
var href = element.attr("href");
|
||||
if (href != null && !href.isBlank()) {
|
||||
lp.parseLink(base, href)
|
||||
.filter(u -> Objects.equals(u.domain.domain, base.domain.domain))
|
||||
.filter(u -> Objects.equals(u.domain.topDomain, base.domain.topDomain))
|
||||
.ifPresent(urls::add);
|
||||
}
|
||||
});
|
||||
@ -42,7 +42,7 @@ class RssCrawlerTest {
|
||||
var href = element.text();
|
||||
if (href != null && !href.isBlank()) {
|
||||
lp.parseLink(base, href)
|
||||
.filter(u -> Objects.equals(u.domain.domain, base.domain.domain))
|
||||
.filter(u -> Objects.equals(u.domain.topDomain, base.domain.topDomain))
|
||||
.ifPresent(urls::add);
|
||||
}
|
||||
});
|
||||
@ -51,7 +51,7 @@ class RssCrawlerTest {
|
||||
var href = element.text();
|
||||
if (href != null && !href.isBlank()) {
|
||||
lp.parseLink(base, href)
|
||||
.filter(u -> Objects.equals(u.domain.domain, base.domain.domain))
|
||||
.filter(u -> Objects.equals(u.domain.topDomain, base.domain.topDomain))
|
||||
.ifPresent(urls::add);
|
||||
}
|
||||
});
|
||||
|
@ -147,7 +147,7 @@ public class DomainLoaderService {
|
||||
|
||||
public void accept(EdgeDomain domain) throws SQLException {
|
||||
statement.setString(1, domain.toString());
|
||||
statement.setString(2, domain.domain);
|
||||
statement.setString(2, domain.topDomain);
|
||||
statement.setInt(3, nodeAffinity);
|
||||
statement.addBatch();
|
||||
|
||||
|
@ -30,7 +30,7 @@ public class SearchQueryParamFactory {
|
||||
profile.getSizeLimit(),
|
||||
SpecificationLimit.none(),
|
||||
List.of(),
|
||||
new QueryLimits(2, 100, 200, 8192),
|
||||
new QueryLimits(1, 100, 200, 8192),
|
||||
profile.searchSetIdentifier
|
||||
);
|
||||
|
||||
|
@ -81,7 +81,7 @@ public class ControlBlacklistService {
|
||||
""")) {
|
||||
stmt.setString(1, domain.toString());
|
||||
stmt.addBatch();
|
||||
stmt.setString(1, domain.domain);
|
||||
stmt.setString(1, domain.topDomain);
|
||||
stmt.addBatch();
|
||||
stmt.executeBatch();
|
||||
}
|
||||
|
@ -157,7 +157,7 @@ public class ExportAtagsActor extends RecordActorPrototype {
|
||||
return false;
|
||||
|
||||
// This is an artifact of the link parser typically
|
||||
if ("example.com".equals(url.domain.domain))
|
||||
if ("example.com".equals(url.domain.topDomain))
|
||||
return false;
|
||||
|
||||
if (linkText.contains(url.domain.toString()))
|
||||
|
@ -61,7 +61,7 @@ public class DomainListRefreshService {
|
||||
for (var domain : domainsAll) {
|
||||
var parsed = new EdgeDomain(domain);
|
||||
insert.setString(1, domain.toLowerCase());
|
||||
insert.setString(2, parsed.domain);
|
||||
insert.setString(2, parsed.topDomain);
|
||||
insert.setInt(3, nodeId);
|
||||
insert.addBatch();
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user