diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadUrls.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadUrls.java index a3fd2797..f25bd0b9 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadUrls.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadUrls.java @@ -35,7 +35,7 @@ public class SqlLoadUrls { IN PATH_HASH BIGINT ) BEGIN - INSERT IGNORE INTO EC_URL (PROTO,DOMAIN_ID,PORT,PATH,PATH_HASH) SELECT PROTO,ID,PORT,PATH,PATH_HASH FROM EC_DOMAIN WHERE DOMAIN_NAME=DOMAIN; + INSERT IGNORE INTO EC_URL (PROTO,DOMAIN_ID,PORT,PATH,PARAM,PATH_HASH) SELECT PROTO,ID,PORT,PATH,PARAM,PATH_HASH FROM EC_DOMAIN WHERE DOMAIN_NAME=DOMAIN; END """); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/LinkParser.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/LinkParser.java index 50dc3da6..98be5315 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/LinkParser.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/LinkParser.java @@ -114,6 +114,7 @@ public class LinkParser { } private static final Pattern spaceRegex = Pattern.compile(" "); + private static final Pattern paramSeparatorPattern = Pattern.compile("\\?"); @SneakyThrows private String resolveUrl(EdgeUrl baseUrl, String s) { @@ -123,7 +124,7 @@ public class LinkParser { return s; } - String[] parts = s.split("\\?", 2); + String[] parts = paramSeparatorPattern.split(s, 2); String path = parts[0]; String param; if (parts.length > 1) { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/QueryParams.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/QueryParams.java index 2e5ef542..7560cdd1 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/QueryParams.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/QueryParams.java @@ -27,24 +27,21 @@ public class QueryParams { } public static boolean isPermittedParam(String path, String param) { + if (path.endsWith(".cgi")) return true; + + if (param.startsWith("id=")) return true; + if (param.startsWith("p=")) return true; + if (param.startsWith("i=")) return true; + if (param.startsWith("t=")) return true; + if (param.startsWith("v=")) return true; + if (param.startsWith("post=")) return true; + if (path.endsWith("index.php")) { if (param.startsWith("showtopic=")) return true; if (param.startsWith("showforum=")) return true; } - if (path.endsWith("viewtopic.php")) { - return (param.startsWith("t=") || param.startsWith("p=")); - } - if (path.endsWith("viewforum.php")) { - return param.startsWith("v="); - } - if (path.endsWith("showthread.php")) { - return (param.startsWith("t=") || param.startsWith("p=")); - } - if (path.endsWith("showforum.php")) { - return param.startsWith("v="); - } if (path.endsWith("StoryView.py")) { // folklore.org is neat return param.startsWith("project=") || param.startsWith("story="); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlJobExtractorMain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlJobExtractorMain.java index 52fe338a..a17983a9 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlJobExtractorMain.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlJobExtractorMain.java @@ -45,7 +45,13 @@ public class CrawlJobExtractorMain { INDEXED DESC, EC_DOMAIN.ID """; - + private static final String queuedDomainsSql = + """ + SELECT IFNULL(ID, -1), LOWER(CRAWL_QUEUE.DOMAIN_NAME) + FROM CRAWL_QUEUE + LEFT JOIN EC_DOMAIN + ON CRAWL_QUEUE.DOMAIN_NAME=EC_DOMAIN.DOMAIN_NAME + """; private static final String urlsSql = """ SELECT URL @@ -66,8 +72,8 @@ public class CrawlJobExtractorMain { AND VISITED ; """; - private static final int MIN_VISIT_COUNT = 100; - private static final int MAX_VISIT_COUNT = 5000; + private static final int MIN_VISIT_COUNT = 1000; + private static final int MAX_VISIT_COUNT = 100000; private final EdgeDomainBlacklistImpl blacklist; @@ -109,14 +115,25 @@ public class CrawlJobExtractorMain { } } - private record DomainWithId(String domainName, int id) {} + private record DomainWithId(String domainName, int id) { + } private Stream extractDomains() { - List ids = new ArrayList<>(100_000); + Set ids = new HashSet<>(1_000_000); - try (var stmt = conn.prepareStatement(domainsSql)) { - stmt.setFetchSize(10_000); - var rsp = stmt.executeQuery(); + try (var stmtDomains = conn.prepareStatement(domainsSql); + var stmtQueue = conn.prepareStatement(queuedDomainsSql); + ) { + ResultSet rsp; + + stmtDomains.setFetchSize(10_000); + rsp = stmtDomains.executeQuery(); + while (rsp.next()) { + ids.add(new DomainWithId(rsp.getString(2), rsp.getInt(1))); + } + + stmtQueue.setFetchSize(10_000); + rsp = stmtQueue.executeQuery(); while (rsp.next()) { ids.add(new DomainWithId(rsp.getString(2), rsp.getInt(1))); } @@ -125,7 +142,6 @@ public class CrawlJobExtractorMain { ex.printStackTrace(); } - Collections.shuffle(ids); return ids.stream() .filter(id -> !blacklist.isBlacklisted(id.id)) .map(this::createCrawlJobForDomain); @@ -140,8 +156,7 @@ public class CrawlJobExtractorMain { try (var stmt = conn.prepareStatement(urlsSql)) { stmt.setFetchSize(1000); - stmt.setString(1, domainWithId.domainName); - stmt.setInt(2, domainWithId.id); + stmt.setInt(1, domainWithId.id); var rsp = stmt.executeQuery(); while (rsp.next()) { @@ -221,7 +236,7 @@ public class CrawlJobExtractorMain { } private int calculateCrawlDepthFromVisitedCount(int count) { - count = count + 100 + count / 4; + count = count + 1000 + count / 4; if (count < MIN_VISIT_COUNT) { count = MIN_VISIT_COUNT; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeUrl.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeUrl.java index 123bd95a..579b6cf4 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeUrl.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeUrl.java @@ -98,10 +98,25 @@ public class EdgeUrl implements WideHashable { } public String toString() { - String portPart = port == null ? "" : (":" + port); - String queryPart = param == null ? "" : ("?" + param); + StringBuilder sb = new StringBuilder(256); - return proto + "://" + domain + portPart + path + queryPart; + sb.append(proto); + sb.append("://"); + sb.append(domain); + + if (port != null) { + sb.append(':'); + sb.append(port); + } + + sb.append(path); + + if (param != null) { + sb.append('?'); + sb.append(param); + } + + return sb.toString(); } public String dir() { diff --git a/marginalia_nu/src/main/resources/sql/edge-crawler-cache.sql b/marginalia_nu/src/main/resources/sql/edge-crawler-cache.sql index 782bc67d..c1dc9aa9 100644 --- a/marginalia_nu/src/main/resources/sql/edge-crawler-cache.sql +++ b/marginalia_nu/src/main/resources/sql/edge-crawler-cache.sql @@ -225,4 +225,9 @@ COLLATE utf8mb4_unicode_ci; CREATE TABLE DATA_DOMAIN_HISTORY ( DOMAIN_NAME VARCHAR(255) PRIMARY KEY, SCREENSHOT_DATE DATE DEFAULT NOW() +) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci; + +CREATE TABLE CRAWL_QUEUE( + DOMAIN_NAME VARCHAR(255) UNIQUE, + SOURCE VARCHAR(255) ) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci; \ No newline at end of file diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/model/EdgeUrlTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/model/EdgeUrlTest.java index c16f1f08..953fd473 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/model/EdgeUrlTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/model/EdgeUrlTest.java @@ -10,7 +10,11 @@ class EdgeUrlTest { public void testHashCode() throws URISyntaxException { System.out.println(new EdgeUrl("https://memex.marginalia.nu").hashCode()); } - + @Test + public void testParam() throws URISyntaxException { + System.out.println(new EdgeUrl("https://memex.marginalia.nu/index.php?id=1").toString()); + System.out.println(new EdgeUrl("https://memex.marginalia.nu/showthread.php?id=1&count=5&tracking=123").toString()); + } @Test void urlencodeFixer() throws URISyntaxException { System.out.println(EdgeUrl.urlencodeFixer("https://www.example.com/%-sign"));