Merge branch 'master' into asn-info
This commit is contained in:
commit
bde68ba48b
@ -15,7 +15,7 @@ public class EdgeDomain implements Serializable {
|
|||||||
@Nonnull
|
@Nonnull
|
||||||
public final String subDomain;
|
public final String subDomain;
|
||||||
@Nonnull
|
@Nonnull
|
||||||
public final String domain;
|
public final String topDomain;
|
||||||
|
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
public EdgeDomain(String host) {
|
public EdgeDomain(String host) {
|
||||||
@ -27,13 +27,13 @@ public class EdgeDomain implements Serializable {
|
|||||||
|
|
||||||
if (dot < 0 || looksLikeAnIp(host)) { // IPV6 >.>
|
if (dot < 0 || looksLikeAnIp(host)) { // IPV6 >.>
|
||||||
subDomain = "";
|
subDomain = "";
|
||||||
domain = host;
|
topDomain = host;
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
int dot2 = host.substring(0, dot).lastIndexOf('.');
|
int dot2 = host.substring(0, dot).lastIndexOf('.');
|
||||||
if (dot2 < 0) {
|
if (dot2 < 0) {
|
||||||
subDomain = "";
|
subDomain = "";
|
||||||
domain = host;
|
topDomain = host;
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
if (looksLikeGovTld(host))
|
if (looksLikeGovTld(host))
|
||||||
@ -42,16 +42,16 @@ public class EdgeDomain implements Serializable {
|
|||||||
if (dot3 >= 0) {
|
if (dot3 >= 0) {
|
||||||
dot2 = dot3;
|
dot2 = dot3;
|
||||||
subDomain = host.substring(0, dot2);
|
subDomain = host.substring(0, dot2);
|
||||||
domain = host.substring(dot2 + 1);
|
topDomain = host.substring(dot2 + 1);
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
subDomain = "";
|
subDomain = "";
|
||||||
domain = host;
|
topDomain = host;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
subDomain = host.substring(0, dot2);
|
subDomain = host.substring(0, dot2);
|
||||||
domain = host.substring(dot2 + 1);
|
topDomain = host.substring(dot2 + 1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -97,28 +97,28 @@ public class EdgeDomain implements Serializable {
|
|||||||
|
|
||||||
public String getAddress() {
|
public String getAddress() {
|
||||||
if (!subDomain.isEmpty()) {
|
if (!subDomain.isEmpty()) {
|
||||||
return subDomain + "." + domain;
|
return subDomain + "." + topDomain;
|
||||||
}
|
}
|
||||||
return domain;
|
return topDomain;
|
||||||
}
|
}
|
||||||
|
|
||||||
public String getDomainKey() {
|
public String getDomainKey() {
|
||||||
int cutPoint = domain.indexOf('.');
|
int cutPoint = topDomain.indexOf('.');
|
||||||
if (cutPoint < 0) {
|
if (cutPoint < 0) {
|
||||||
return domain;
|
return topDomain;
|
||||||
}
|
}
|
||||||
return domain.substring(0, cutPoint).toLowerCase();
|
return topDomain.substring(0, cutPoint).toLowerCase();
|
||||||
}
|
}
|
||||||
|
|
||||||
public String getLongDomainKey() {
|
public String getLongDomainKey() {
|
||||||
StringBuilder ret = new StringBuilder();
|
StringBuilder ret = new StringBuilder();
|
||||||
|
|
||||||
int cutPoint = domain.indexOf('.');
|
int cutPoint = topDomain.indexOf('.');
|
||||||
if (cutPoint < 0) {
|
if (cutPoint < 0) {
|
||||||
ret.append(domain);
|
ret.append(topDomain);
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
ret.append(domain, 0, cutPoint);
|
ret.append(topDomain, 0, cutPoint);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!"".equals(subDomain) && !"www".equals(subDomain)) {
|
if (!"".equals(subDomain) && !"www".equals(subDomain)) {
|
||||||
@ -133,30 +133,30 @@ public class EdgeDomain implements Serializable {
|
|||||||
public boolean hasSameTopDomain(EdgeDomain other) {
|
public boolean hasSameTopDomain(EdgeDomain other) {
|
||||||
if (other == null) return false;
|
if (other == null) return false;
|
||||||
|
|
||||||
return domain.equalsIgnoreCase(other.domain);
|
return topDomain.equalsIgnoreCase(other.topDomain);
|
||||||
}
|
}
|
||||||
|
|
||||||
public String getTld() {
|
public String getTld() {
|
||||||
int dot = -1;
|
int dot = -1;
|
||||||
int length = domain.length();
|
int length = topDomain.length();
|
||||||
|
|
||||||
if (ipPatternTest.test(domain)) {
|
if (ipPatternTest.test(topDomain)) {
|
||||||
return "IP";
|
return "IP";
|
||||||
}
|
}
|
||||||
|
|
||||||
if (govListTest.test(domain)) {
|
if (govListTest.test(topDomain)) {
|
||||||
dot = domain.indexOf('.', Math.max(0, length - ".edu.uk".length()));
|
dot = topDomain.indexOf('.', Math.max(0, length - ".edu.uk".length()));
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
dot = domain.lastIndexOf('.');
|
dot = topDomain.lastIndexOf('.');
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
if (dot < 0 || dot == domain.length() - 1) {
|
if (dot < 0 || dot == topDomain.length() - 1) {
|
||||||
return "-";
|
return "-";
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
return domain.substring(dot + 1);
|
return topDomain.substring(dot + 1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -166,8 +166,8 @@ public class EdgeDomain implements Serializable {
|
|||||||
final String this$subDomain = this.getSubDomain();
|
final String this$subDomain = this.getSubDomain();
|
||||||
final String other$subDomain = other.getSubDomain();
|
final String other$subDomain = other.getSubDomain();
|
||||||
if (!Objects.equals(this$subDomain,other$subDomain)) return false;
|
if (!Objects.equals(this$subDomain,other$subDomain)) return false;
|
||||||
final String this$domain = this.getDomain();
|
final String this$domain = this.getTopDomain();
|
||||||
final String other$domain = other.getDomain();
|
final String other$domain = other.getTopDomain();
|
||||||
if (!Objects.equals(this$domain,other$domain)) return false;
|
if (!Objects.equals(this$domain,other$domain)) return false;
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
@ -177,7 +177,7 @@ public class EdgeDomain implements Serializable {
|
|||||||
int result = 1;
|
int result = 1;
|
||||||
final Object $subDomain = this.getSubDomain().toLowerCase();
|
final Object $subDomain = this.getSubDomain().toLowerCase();
|
||||||
result = result * PRIME + $subDomain.hashCode();
|
result = result * PRIME + $subDomain.hashCode();
|
||||||
final Object $domain = this.getDomain().toLowerCase();
|
final Object $domain = this.getTopDomain().toLowerCase();
|
||||||
result = result * PRIME + $domain.hashCode();
|
result = result * PRIME + $domain.hashCode();
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
@ -8,7 +8,7 @@ public enum DocumentFlags {
|
|||||||
GeneratorDocs,
|
GeneratorDocs,
|
||||||
GeneratorForum,
|
GeneratorForum,
|
||||||
GeneratorWiki,
|
GeneratorWiki,
|
||||||
Unused6,
|
Sideloaded,
|
||||||
Unused7,
|
Unused7,
|
||||||
Unused8,
|
Unused8,
|
||||||
;
|
;
|
||||||
|
@ -1,6 +1,5 @@
|
|||||||
package nu.marginalia.model;
|
package nu.marginalia.model;
|
||||||
|
|
||||||
import nu.marginalia.model.EdgeUrl;
|
|
||||||
import org.junit.jupiter.api.Test;
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
import java.net.URISyntaxException;
|
import java.net.URISyntaxException;
|
||||||
@ -22,7 +21,7 @@ class EdgeDomainTest {
|
|||||||
var domain = new EdgeUrl("http://l7072i3.l7c.net");
|
var domain = new EdgeUrl("http://l7072i3.l7c.net");
|
||||||
assertEquals("http", domain.proto);
|
assertEquals("http", domain.proto);
|
||||||
assertEquals("l7072i3", domain.domain.subDomain);
|
assertEquals("l7072i3", domain.domain.subDomain);
|
||||||
assertEquals("l7c.net", domain.domain.domain);
|
assertEquals("l7c.net", domain.domain.topDomain);
|
||||||
assertEquals("net", domain.domain.getTld());
|
assertEquals("net", domain.domain.getTld());
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -31,7 +30,7 @@ class EdgeDomainTest {
|
|||||||
var domain = new EdgeUrl("http://endless.horse/");
|
var domain = new EdgeUrl("http://endless.horse/");
|
||||||
assertEquals("http", domain.proto);
|
assertEquals("http", domain.proto);
|
||||||
assertEquals("", domain.domain.subDomain);
|
assertEquals("", domain.domain.subDomain);
|
||||||
assertEquals("endless.horse", domain.domain.domain);
|
assertEquals("endless.horse", domain.domain.topDomain);
|
||||||
assertEquals("horse", domain.domain.getTld());
|
assertEquals("horse", domain.domain.getTld());
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -40,7 +39,7 @@ class EdgeDomainTest {
|
|||||||
var domain = new EdgeUrl("http://uj.edu.pl");
|
var domain = new EdgeUrl("http://uj.edu.pl");
|
||||||
assertEquals("http", domain.proto);
|
assertEquals("http", domain.proto);
|
||||||
assertEquals("", domain.domain.subDomain);
|
assertEquals("", domain.domain.subDomain);
|
||||||
assertEquals("uj.edu.pl", domain.domain.domain);
|
assertEquals("uj.edu.pl", domain.domain.topDomain);
|
||||||
assertEquals("edu.pl", domain.domain.getTld());
|
assertEquals("edu.pl", domain.domain.getTld());
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -50,7 +49,7 @@ class EdgeDomainTest {
|
|||||||
var domain = new EdgeUrl("http://www.marginalia.nu");
|
var domain = new EdgeUrl("http://www.marginalia.nu");
|
||||||
assertEquals("http", domain.proto);
|
assertEquals("http", domain.proto);
|
||||||
assertEquals("www", domain.domain.subDomain);
|
assertEquals("www", domain.domain.subDomain);
|
||||||
assertEquals("marginalia.nu", domain.domain.domain);
|
assertEquals("marginalia.nu", domain.domain.topDomain);
|
||||||
assertEquals("http://www.marginalia.nu/", domain.toString());
|
assertEquals("http://www.marginalia.nu/", domain.toString());
|
||||||
assertEquals("nu", domain.domain.getTld());
|
assertEquals("nu", domain.domain.getTld());
|
||||||
}
|
}
|
||||||
@ -58,7 +57,7 @@ class EdgeDomainTest {
|
|||||||
@Test
|
@Test
|
||||||
public void testUkDomain2() throws URISyntaxException {
|
public void testUkDomain2() throws URISyntaxException {
|
||||||
var domain = new EdgeUrl("http://marginalia.co.uk");
|
var domain = new EdgeUrl("http://marginalia.co.uk");
|
||||||
assertEquals("marginalia.co.uk", domain.domain.domain);
|
assertEquals("marginalia.co.uk", domain.domain.topDomain);
|
||||||
assertEquals("http", domain.proto);
|
assertEquals("http", domain.proto);
|
||||||
assertEquals("", domain.domain.subDomain);
|
assertEquals("", domain.domain.subDomain);
|
||||||
assertEquals("http://marginalia.co.uk/", domain.toString());
|
assertEquals("http://marginalia.co.uk/", domain.toString());
|
||||||
@ -68,7 +67,7 @@ class EdgeDomainTest {
|
|||||||
@Test
|
@Test
|
||||||
public void testUkDomain3() throws URISyntaxException {
|
public void testUkDomain3() throws URISyntaxException {
|
||||||
var domain = new EdgeUrl("http://withcandour.co.uk");
|
var domain = new EdgeUrl("http://withcandour.co.uk");
|
||||||
assertEquals("withcandour.co.uk", domain.domain.domain);
|
assertEquals("withcandour.co.uk", domain.domain.topDomain);
|
||||||
assertEquals("http", domain.proto);
|
assertEquals("http", domain.proto);
|
||||||
assertEquals("", domain.domain.subDomain);
|
assertEquals("", domain.domain.subDomain);
|
||||||
assertEquals("http://withcandour.co.uk/", domain.toString());
|
assertEquals("http://withcandour.co.uk/", domain.toString());
|
||||||
@ -80,7 +79,7 @@ class EdgeDomainTest {
|
|||||||
var domain = new EdgeUrl("http://www.marginalia.co.uk");
|
var domain = new EdgeUrl("http://www.marginalia.co.uk");
|
||||||
assertEquals("http", domain.proto);
|
assertEquals("http", domain.proto);
|
||||||
assertEquals("www", domain.domain.subDomain);
|
assertEquals("www", domain.domain.subDomain);
|
||||||
assertEquals("marginalia.co.uk", domain.domain.domain);
|
assertEquals("marginalia.co.uk", domain.domain.topDomain);
|
||||||
assertEquals("http://www.marginalia.co.uk/", domain.toString());
|
assertEquals("http://www.marginalia.co.uk/", domain.toString());
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -88,7 +87,7 @@ class EdgeDomainTest {
|
|||||||
public void testThreeLetterDomain() throws URISyntaxException {
|
public void testThreeLetterDomain() throws URISyntaxException {
|
||||||
var domain = new EdgeUrl("http://www.marginalia.abcf.de");
|
var domain = new EdgeUrl("http://www.marginalia.abcf.de");
|
||||||
assertEquals("http", domain.proto);
|
assertEquals("http", domain.proto);
|
||||||
assertEquals("abcf.de", domain.domain.domain);
|
assertEquals("abcf.de", domain.domain.topDomain);
|
||||||
assertEquals("www.marginalia", domain.domain.subDomain);
|
assertEquals("www.marginalia", domain.domain.subDomain);
|
||||||
assertEquals("de", domain.domain.getTld());
|
assertEquals("de", domain.domain.getTld());
|
||||||
}
|
}
|
||||||
@ -98,7 +97,7 @@ class EdgeDomainTest {
|
|||||||
var domain = new EdgeUrl("http://marginalia.nu");
|
var domain = new EdgeUrl("http://marginalia.nu");
|
||||||
assertEquals("http", domain.proto);
|
assertEquals("http", domain.proto);
|
||||||
assertEquals("", domain.domain.subDomain);
|
assertEquals("", domain.domain.subDomain);
|
||||||
assertEquals("marginalia.nu", domain.domain.domain);
|
assertEquals("marginalia.nu", domain.domain.topDomain);
|
||||||
assertEquals("http://marginalia.nu/", domain.toString());
|
assertEquals("http://marginalia.nu/", domain.toString());
|
||||||
assertEquals("nu", domain.domain.getTld());
|
assertEquals("nu", domain.domain.getTld());
|
||||||
}
|
}
|
||||||
@ -108,7 +107,7 @@ class EdgeDomainTest {
|
|||||||
var domain = new EdgeUrl("https://127.0.0.1:8080");
|
var domain = new EdgeUrl("https://127.0.0.1:8080");
|
||||||
assertEquals("https", domain.proto);
|
assertEquals("https", domain.proto);
|
||||||
assertEquals("", domain.domain.subDomain);
|
assertEquals("", domain.domain.subDomain);
|
||||||
assertEquals("127.0.0.1", domain.domain.domain);
|
assertEquals("127.0.0.1", domain.domain.topDomain);
|
||||||
assertEquals("https://127.0.0.1:8080/", domain.toString());
|
assertEquals("https://127.0.0.1:8080/", domain.toString());
|
||||||
assertEquals("IP", domain.domain.getTld());
|
assertEquals("IP", domain.domain.getTld());
|
||||||
}
|
}
|
||||||
@ -118,7 +117,7 @@ class EdgeDomainTest {
|
|||||||
var domain = new EdgeUrl("https://192.168.1.32");
|
var domain = new EdgeUrl("https://192.168.1.32");
|
||||||
assertEquals("https", domain.proto);
|
assertEquals("https", domain.proto);
|
||||||
assertEquals("", domain.domain.subDomain);
|
assertEquals("", domain.domain.subDomain);
|
||||||
assertEquals("192.168.1.32", domain.domain.domain);
|
assertEquals("192.168.1.32", domain.domain.topDomain);
|
||||||
assertEquals("https://192.168.1.32/", domain.toString());
|
assertEquals("https://192.168.1.32/", domain.toString());
|
||||||
assertEquals("IP", domain.domain.getTld());
|
assertEquals("IP", domain.domain.getTld());
|
||||||
}
|
}
|
||||||
|
@ -62,7 +62,7 @@ public class IpBlockList {
|
|||||||
if (blocklistDisabled)
|
if (blocklistDisabled)
|
||||||
return true;
|
return true;
|
||||||
|
|
||||||
if (domain.domain.endsWith(".cn")) {
|
if (domain.topDomain.endsWith(".cn")) {
|
||||||
logger.debug("Blocking {} on .cn-end", domain);
|
logger.debug("Blocking {} on .cn-end", domain);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
@ -67,7 +67,7 @@ public class UrlBlocklist {
|
|||||||
|
|
||||||
public boolean isUrlBlocked(EdgeUrl url) {
|
public boolean isUrlBlocked(EdgeUrl url) {
|
||||||
try {
|
try {
|
||||||
if (badDomains.contains(url.domain.domain)) {
|
if (badDomains.contains(url.domain.topDomain)) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -76,7 +76,7 @@ public class UrlBlocklist {
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
if ("github.com".equals(url.domain.domain)) {
|
if ("github.com".equals(url.domain.topDomain)) {
|
||||||
return url.path.chars().filter(c -> c == '/').count() > 2;
|
return url.path.chars().filter(c -> c == '/').count() > 2;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -10,7 +10,7 @@ public record BrowseResult (EdgeUrl url,
|
|||||||
public String domainHash() {
|
public String domainHash() {
|
||||||
var domain = url.domain;
|
var domain = url.domain;
|
||||||
if ("www".equals(domain.subDomain)) {
|
if ("www".equals(domain.subDomain)) {
|
||||||
return domain.domain;
|
return domain.topDomain;
|
||||||
}
|
}
|
||||||
return domain.toString();
|
return domain.toString();
|
||||||
}
|
}
|
||||||
@ -19,7 +19,7 @@ public record BrowseResult (EdgeUrl url,
|
|||||||
String ret;
|
String ret;
|
||||||
var domain = url.domain;
|
var domain = url.domain;
|
||||||
if ("www".equals(domain.subDomain)) {
|
if ("www".equals(domain.subDomain)) {
|
||||||
ret = domain.domain;
|
ret = domain.topDomain;
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
ret = domain.toString();
|
ret = domain.toString();
|
||||||
|
@ -52,7 +52,7 @@ public class CrawledDocument implements SerializableCrawlData {
|
|||||||
return EdgeUrl
|
return EdgeUrl
|
||||||
.parse(url)
|
.parse(url)
|
||||||
.map(EdgeUrl::getDomain)
|
.map(EdgeUrl::getDomain)
|
||||||
.map(d -> d.domain)
|
.map(Object::toString)
|
||||||
.orElse(null);
|
.orElse(null);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -14,7 +14,6 @@ import nu.marginalia.geoip.GeoIpDictionary;
|
|||||||
import nu.marginalia.model.crawl.DomainIndexingState;
|
import nu.marginalia.model.crawl.DomainIndexingState;
|
||||||
import nu.marginalia.converting.model.ProcessedDomain;
|
import nu.marginalia.converting.model.ProcessedDomain;
|
||||||
import nu.marginalia.model.EdgeDomain;
|
import nu.marginalia.model.EdgeDomain;
|
||||||
import nu.marginalia.model.EdgeUrl;
|
|
||||||
import nu.marginalia.converting.processor.logic.links.TopKeywords;
|
import nu.marginalia.converting.processor.logic.links.TopKeywords;
|
||||||
import nu.marginalia.converting.processor.logic.LshDocumentDeduplicator;
|
import nu.marginalia.converting.processor.logic.LshDocumentDeduplicator;
|
||||||
import nu.marginalia.model.crawl.HtmlFeature;
|
import nu.marginalia.model.crawl.HtmlFeature;
|
||||||
@ -161,10 +160,10 @@ public class DomainProcessor {
|
|||||||
private static final Pattern academicPattern = Pattern.compile(".*\\.(ac|edu)\\.[a-z]{2}$");
|
private static final Pattern academicPattern = Pattern.compile(".*\\.(ac|edu)\\.[a-z]{2}$");
|
||||||
private boolean isAcademicDomain(EdgeDomain domain) {
|
private boolean isAcademicDomain(EdgeDomain domain) {
|
||||||
|
|
||||||
if (domain.domain.endsWith(".edu"))
|
if (domain.topDomain.endsWith(".edu"))
|
||||||
return true;
|
return true;
|
||||||
|
|
||||||
if (academicPattern.matcher(domain.domain).matches())
|
if (academicPattern.matcher(domain.topDomain).matches())
|
||||||
return true;
|
return true;
|
||||||
|
|
||||||
return false;
|
return false;
|
||||||
|
@ -50,7 +50,7 @@ public abstract class AbstractDocumentProcessorPlugin {
|
|||||||
public MetaTagsBuilder addUrl(EdgeUrl url) {
|
public MetaTagsBuilder addUrl(EdgeUrl url) {
|
||||||
add("proto", url.proto);
|
add("proto", url.proto);
|
||||||
add("site", url.domain);
|
add("site", url.domain);
|
||||||
add("site", url.domain.domain);
|
add("site", url.domain.topDomain);
|
||||||
add("tld", url.domain.getTld());
|
add("tld", url.domain.getTld());
|
||||||
|
|
||||||
if (url.path.startsWith("/~")) {
|
if (url.path.startsWith("/~")) {
|
||||||
|
@ -291,7 +291,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
|
|||||||
|
|
||||||
for (var fd : lp.getForeignDomains()) {
|
for (var fd : lp.getForeignDomains()) {
|
||||||
linkTerms.add("links:"+fd.toString().toLowerCase());
|
linkTerms.add("links:"+fd.toString().toLowerCase());
|
||||||
linkTerms.add("links:"+fd.getDomain().toLowerCase());
|
linkTerms.add("links:"+fd.getTopDomain().toLowerCase());
|
||||||
}
|
}
|
||||||
|
|
||||||
return linkTerms;
|
return linkTerms;
|
||||||
|
@ -54,7 +54,7 @@ public class HtmlProcessorSpecializations {
|
|||||||
return blogSpecialization;
|
return blogSpecialization;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (url.domain.getDomain().equals("mariadb.com")
|
if (url.domain.getTopDomain().equals("mariadb.com")
|
||||||
&& url.path.startsWith("/kb")) {
|
&& url.path.startsWith("/kb")) {
|
||||||
return mariadbKbSpecialization;
|
return mariadbKbSpecialization;
|
||||||
}
|
}
|
||||||
|
@ -63,6 +63,11 @@ public class SideloaderProcessing {
|
|||||||
for (String keyword : extraKeywords)
|
for (String keyword : extraKeywords)
|
||||||
ret.words.add(keyword, WordFlags.Subjects.asBit());
|
ret.words.add(keyword, WordFlags.Subjects.asBit());
|
||||||
|
|
||||||
|
if (type == GeneratorType.WIKI)
|
||||||
|
ret.words.add("generator:wiki", WordFlags.Subjects.asBit());
|
||||||
|
else if (type == GeneratorType.DOCS)
|
||||||
|
ret.words.add("generator:docs", WordFlags.Subjects.asBit());
|
||||||
|
|
||||||
ret.details = details.details();
|
ret.details = details.details();
|
||||||
|
|
||||||
// Add a few things that we know about the document
|
// Add a few things that we know about the document
|
||||||
@ -80,8 +85,8 @@ public class SideloaderProcessing {
|
|||||||
PubDate.toYearByte(ret.details.pubYear),
|
PubDate.toYearByte(ret.details.pubYear),
|
||||||
(int) -ret.details.quality,
|
(int) -ret.details.quality,
|
||||||
switch (type) {
|
switch (type) {
|
||||||
case WIKI -> EnumSet.of(DocumentFlags.GeneratorWiki);
|
case WIKI -> EnumSet.of(DocumentFlags.GeneratorWiki, DocumentFlags.Sideloaded);
|
||||||
case DOCS -> EnumSet.of(DocumentFlags.GeneratorDocs);
|
case DOCS -> EnumSet.of(DocumentFlags.GeneratorDocs, DocumentFlags.Sideloaded);
|
||||||
default -> EnumSet.noneOf(DocumentFlags.class);
|
default -> EnumSet.noneOf(DocumentFlags.class);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
@ -110,13 +110,18 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoC
|
|||||||
String fullUrl = baseUrl.toString() + url;
|
String fullUrl = baseUrl.toString() + url;
|
||||||
|
|
||||||
StringBuilder fullHtml = new StringBuilder();
|
StringBuilder fullHtml = new StringBuilder();
|
||||||
fullHtml.append("<!DOCTYPE html><html><head><title>").append(title).append("</title></head><body>");
|
fullHtml
|
||||||
|
.append("<!DOCTYPE html><html><head><title>")
|
||||||
|
.append(title)
|
||||||
|
.append("</title></head><body>")
|
||||||
|
.append("<div class=\"mw-content-text\">");
|
||||||
|
|
||||||
for (String part : parts) {
|
for (String part : parts) {
|
||||||
fullHtml.append("<p>");
|
fullHtml.append("<p>");
|
||||||
fullHtml.append(part);
|
fullHtml.append(part);
|
||||||
fullHtml.append("</p>");
|
fullHtml.append("</p>");
|
||||||
}
|
}
|
||||||
fullHtml.append("</body></html>");
|
fullHtml.append("</div></body></html>");
|
||||||
|
|
||||||
var doc = sideloaderProcessing
|
var doc = sideloaderProcessing
|
||||||
.processDocument(fullUrl,
|
.processDocument(fullUrl,
|
||||||
|
@ -115,8 +115,9 @@ public class StackexchangeSideloader implements SideloadSource {
|
|||||||
ret.words = keywordExtractor.extractKeywords(dld, url);
|
ret.words = keywordExtractor.extractKeywords(dld, url);
|
||||||
ret.words.addAllSyntheticTerms(List.of(
|
ret.words.addAllSyntheticTerms(List.of(
|
||||||
"site:" + domainName,
|
"site:" + domainName,
|
||||||
"site:" + url.domain.domain,
|
"site:" + url.domain.topDomain,
|
||||||
url.domain.domain
|
url.domain.topDomain,
|
||||||
|
domainName
|
||||||
));
|
));
|
||||||
|
|
||||||
if (!post.tags().isBlank()) {
|
if (!post.tags().isBlank()) {
|
||||||
|
@ -33,7 +33,7 @@ class RssCrawlerTest {
|
|||||||
var href = element.attr("href");
|
var href = element.attr("href");
|
||||||
if (href != null && !href.isBlank()) {
|
if (href != null && !href.isBlank()) {
|
||||||
lp.parseLink(base, href)
|
lp.parseLink(base, href)
|
||||||
.filter(u -> Objects.equals(u.domain.domain, base.domain.domain))
|
.filter(u -> Objects.equals(u.domain.topDomain, base.domain.topDomain))
|
||||||
.ifPresent(urls::add);
|
.ifPresent(urls::add);
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
@ -42,7 +42,7 @@ class RssCrawlerTest {
|
|||||||
var href = element.text();
|
var href = element.text();
|
||||||
if (href != null && !href.isBlank()) {
|
if (href != null && !href.isBlank()) {
|
||||||
lp.parseLink(base, href)
|
lp.parseLink(base, href)
|
||||||
.filter(u -> Objects.equals(u.domain.domain, base.domain.domain))
|
.filter(u -> Objects.equals(u.domain.topDomain, base.domain.topDomain))
|
||||||
.ifPresent(urls::add);
|
.ifPresent(urls::add);
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
@ -51,7 +51,7 @@ class RssCrawlerTest {
|
|||||||
var href = element.text();
|
var href = element.text();
|
||||||
if (href != null && !href.isBlank()) {
|
if (href != null && !href.isBlank()) {
|
||||||
lp.parseLink(base, href)
|
lp.parseLink(base, href)
|
||||||
.filter(u -> Objects.equals(u.domain.domain, base.domain.domain))
|
.filter(u -> Objects.equals(u.domain.topDomain, base.domain.topDomain))
|
||||||
.ifPresent(urls::add);
|
.ifPresent(urls::add);
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
@ -147,7 +147,7 @@ public class DomainLoaderService {
|
|||||||
|
|
||||||
public void accept(EdgeDomain domain) throws SQLException {
|
public void accept(EdgeDomain domain) throws SQLException {
|
||||||
statement.setString(1, domain.toString());
|
statement.setString(1, domain.toString());
|
||||||
statement.setString(2, domain.domain);
|
statement.setString(2, domain.topDomain);
|
||||||
statement.setInt(3, nodeAffinity);
|
statement.setInt(3, nodeAffinity);
|
||||||
statement.addBatch();
|
statement.addBatch();
|
||||||
|
|
||||||
|
@ -30,7 +30,7 @@ public class SearchQueryParamFactory {
|
|||||||
profile.getSizeLimit(),
|
profile.getSizeLimit(),
|
||||||
SpecificationLimit.none(),
|
SpecificationLimit.none(),
|
||||||
List.of(),
|
List.of(),
|
||||||
new QueryLimits(2, 100, 200, 8192),
|
new QueryLimits(1, 100, 200, 8192),
|
||||||
profile.searchSetIdentifier
|
profile.searchSetIdentifier
|
||||||
);
|
);
|
||||||
|
|
||||||
|
@ -81,7 +81,7 @@ public class ControlBlacklistService {
|
|||||||
""")) {
|
""")) {
|
||||||
stmt.setString(1, domain.toString());
|
stmt.setString(1, domain.toString());
|
||||||
stmt.addBatch();
|
stmt.addBatch();
|
||||||
stmt.setString(1, domain.domain);
|
stmt.setString(1, domain.topDomain);
|
||||||
stmt.addBatch();
|
stmt.addBatch();
|
||||||
stmt.executeBatch();
|
stmt.executeBatch();
|
||||||
}
|
}
|
||||||
|
@ -157,7 +157,7 @@ public class ExportAtagsActor extends RecordActorPrototype {
|
|||||||
return false;
|
return false;
|
||||||
|
|
||||||
// This is an artifact of the link parser typically
|
// This is an artifact of the link parser typically
|
||||||
if ("example.com".equals(url.domain.domain))
|
if ("example.com".equals(url.domain.topDomain))
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
if (linkText.contains(url.domain.toString()))
|
if (linkText.contains(url.domain.toString()))
|
||||||
|
@ -61,7 +61,7 @@ public class DomainListRefreshService {
|
|||||||
for (var domain : domainsAll) {
|
for (var domain : domainsAll) {
|
||||||
var parsed = new EdgeDomain(domain);
|
var parsed = new EdgeDomain(domain);
|
||||||
insert.setString(1, domain.toLowerCase());
|
insert.setString(1, domain.toLowerCase());
|
||||||
insert.setString(2, parsed.domain);
|
insert.setString(2, parsed.topDomain);
|
||||||
insert.setInt(3, nodeId);
|
insert.setInt(3, nodeId);
|
||||||
insert.addBatch();
|
insert.addBatch();
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user