Preparations for new crawl round

This commit is contained in:
vlofgren 2022-08-16 22:48:16 +02:00
parent 123675d73b
commit 5cfef610b0
7 changed files with 67 additions and 30 deletions

View File

@ -35,7 +35,7 @@ public class SqlLoadUrls {
IN PATH_HASH BIGINT IN PATH_HASH BIGINT
) )
BEGIN BEGIN
INSERT IGNORE INTO EC_URL (PROTO,DOMAIN_ID,PORT,PATH,PATH_HASH) SELECT PROTO,ID,PORT,PATH,PATH_HASH FROM EC_DOMAIN WHERE DOMAIN_NAME=DOMAIN; INSERT IGNORE INTO EC_URL (PROTO,DOMAIN_ID,PORT,PATH,PARAM,PATH_HASH) SELECT PROTO,ID,PORT,PATH,PARAM,PATH_HASH FROM EC_DOMAIN WHERE DOMAIN_NAME=DOMAIN;
END END
"""); """);
} }

View File

@ -114,6 +114,7 @@ public class LinkParser {
} }
private static final Pattern spaceRegex = Pattern.compile(" "); private static final Pattern spaceRegex = Pattern.compile(" ");
// Precompiled pattern matching a literal '?'; used to split a link string into at most two parts, the path and the query parameters.
private static final Pattern paramSeparatorPattern = Pattern.compile("\\?");
@SneakyThrows @SneakyThrows
private String resolveUrl(EdgeUrl baseUrl, String s) { private String resolveUrl(EdgeUrl baseUrl, String s) {
@ -123,7 +124,7 @@ public class LinkParser {
return s; return s;
} }
String[] parts = s.split("\\?", 2); String[] parts = paramSeparatorPattern.split(s, 2);
String path = parts[0]; String path = parts[0];
String param; String param;
if (parts.length > 1) { if (parts.length > 1) {

View File

@ -27,24 +27,21 @@ public class QueryParams {
} }
public static boolean isPermittedParam(String path, String param) { public static boolean isPermittedParam(String path, String param) {
if (path.endsWith(".cgi")) return true;
if (param.startsWith("id=")) return true;
if (param.startsWith("p=")) return true;
if (param.startsWith("i=")) return true;
if (param.startsWith("t=")) return true;
if (param.startsWith("v=")) return true;
if (param.startsWith("post=")) return true;
if (path.endsWith("index.php")) { if (path.endsWith("index.php")) {
if (param.startsWith("showtopic=")) if (param.startsWith("showtopic="))
return true; return true;
if (param.startsWith("showforum=")) if (param.startsWith("showforum="))
return true; return true;
} }
if (path.endsWith("viewtopic.php")) {
return (param.startsWith("t=") || param.startsWith("p="));
}
if (path.endsWith("viewforum.php")) {
return param.startsWith("v=");
}
if (path.endsWith("showthread.php")) {
return (param.startsWith("t=") || param.startsWith("p="));
}
if (path.endsWith("showforum.php")) {
return param.startsWith("v=");
}
if (path.endsWith("StoryView.py")) { // folklore.org is neat if (path.endsWith("StoryView.py")) { // folklore.org is neat
return param.startsWith("project=") || param.startsWith("story="); return param.startsWith("project=") || param.startsWith("story=");

View File

@ -45,7 +45,13 @@ public class CrawlJobExtractorMain {
INDEXED DESC, INDEXED DESC,
EC_DOMAIN.ID EC_DOMAIN.ID
"""; """;
// Query listing every domain in CRAWL_QUEUE together with its EC_DOMAIN id.
// The LEFT JOIN keeps queued domains that have no matching EC_DOMAIN row;
// for those rows IFNULL substitutes -1 as the id.
// Result columns: 1 = id (or -1), 2 = lower-cased domain name.
private static final String queuedDomainsSql =
"""
SELECT IFNULL(ID, -1), LOWER(CRAWL_QUEUE.DOMAIN_NAME)
FROM CRAWL_QUEUE
LEFT JOIN EC_DOMAIN
ON CRAWL_QUEUE.DOMAIN_NAME=EC_DOMAIN.DOMAIN_NAME
""";
private static final String urlsSql = private static final String urlsSql =
""" """
SELECT URL SELECT URL
@ -66,8 +72,8 @@ public class CrawlJobExtractorMain {
AND VISITED AND VISITED
; ;
"""; """;
private static final int MIN_VISIT_COUNT = 100; private static final int MIN_VISIT_COUNT = 1000;
private static final int MAX_VISIT_COUNT = 5000; private static final int MAX_VISIT_COUNT = 100000;
private final EdgeDomainBlacklistImpl blacklist; private final EdgeDomainBlacklistImpl blacklist;
@ -109,14 +115,25 @@ public class CrawlJobExtractorMain {
} }
} }
private record DomainWithId(String domainName, int id) {} private record DomainWithId(String domainName, int id) {
}
private Stream<CrawlingSpecification> extractDomains() { private Stream<CrawlingSpecification> extractDomains() {
List<DomainWithId> ids = new ArrayList<>(100_000); Set<DomainWithId> ids = new HashSet<>(1_000_000);
try (var stmt = conn.prepareStatement(domainsSql)) { try (var stmtDomains = conn.prepareStatement(domainsSql);
stmt.setFetchSize(10_000); var stmtQueue = conn.prepareStatement(queuedDomainsSql);
var rsp = stmt.executeQuery(); ) {
ResultSet rsp;
stmtDomains.setFetchSize(10_000);
rsp = stmtDomains.executeQuery();
while (rsp.next()) {
ids.add(new DomainWithId(rsp.getString(2), rsp.getInt(1)));
}
stmtQueue.setFetchSize(10_000);
rsp = stmtQueue.executeQuery();
while (rsp.next()) { while (rsp.next()) {
ids.add(new DomainWithId(rsp.getString(2), rsp.getInt(1))); ids.add(new DomainWithId(rsp.getString(2), rsp.getInt(1)));
} }
@ -125,7 +142,6 @@ public class CrawlJobExtractorMain {
ex.printStackTrace(); ex.printStackTrace();
} }
Collections.shuffle(ids);
return ids.stream() return ids.stream()
.filter(id -> !blacklist.isBlacklisted(id.id)) .filter(id -> !blacklist.isBlacklisted(id.id))
.map(this::createCrawlJobForDomain); .map(this::createCrawlJobForDomain);
@ -140,8 +156,7 @@ public class CrawlJobExtractorMain {
try (var stmt = conn.prepareStatement(urlsSql)) { try (var stmt = conn.prepareStatement(urlsSql)) {
stmt.setFetchSize(1000); stmt.setFetchSize(1000);
stmt.setString(1, domainWithId.domainName); stmt.setInt(1, domainWithId.id);
stmt.setInt(2, domainWithId.id);
var rsp = stmt.executeQuery(); var rsp = stmt.executeQuery();
while (rsp.next()) { while (rsp.next()) {
@ -221,7 +236,7 @@ public class CrawlJobExtractorMain {
} }
private int calculateCrawlDepthFromVisitedCount(int count) { private int calculateCrawlDepthFromVisitedCount(int count) {
count = count + 100 + count / 4; count = count + 1000 + count / 4;
if (count < MIN_VISIT_COUNT) { if (count < MIN_VISIT_COUNT) {
count = MIN_VISIT_COUNT; count = MIN_VISIT_COUNT;

View File

@ -98,10 +98,25 @@ public class EdgeUrl implements WideHashable {
} }
public String toString() { public String toString() {
String portPart = port == null ? "" : (":" + port); StringBuilder sb = new StringBuilder(256);
String queryPart = param == null ? "" : ("?" + param);
return proto + "://" + domain + portPart + path + queryPart; sb.append(proto);
sb.append("://");
sb.append(domain);
if (port != null) {
sb.append(':');
sb.append(port);
}
sb.append(path);
if (param != null) {
sb.append('?');
sb.append(param);
}
return sb.toString();
} }
public String dir() { public String dir() {

View File

@ -225,4 +225,9 @@ COLLATE utf8mb4_unicode_ci;
CREATE TABLE DATA_DOMAIN_HISTORY ( CREATE TABLE DATA_DOMAIN_HISTORY (
DOMAIN_NAME VARCHAR(255) PRIMARY KEY, DOMAIN_NAME VARCHAR(255) PRIMARY KEY,
SCREENSHOT_DATE DATE DEFAULT NOW() SCREENSHOT_DATE DATE DEFAULT NOW()
) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci;
CREATE TABLE CRAWL_QUEUE(
DOMAIN_NAME VARCHAR(255) UNIQUE,
SOURCE VARCHAR(255)
) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci; ) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci;

View File

@ -10,7 +10,11 @@ class EdgeUrlTest {
public void testHashCode() throws URISyntaxException { public void testHashCode() throws URISyntaxException {
System.out.println(new EdgeUrl("https://memex.marginalia.nu").hashCode()); System.out.println(new EdgeUrl("https://memex.marginalia.nu").hashCode());
} }
/**
 * Builds two EdgeUrls whose input strings carry query parameters and prints
 * the string form of each. No assertions are made.
 */
@Test
public void testParam() throws URISyntaxException {
    final String[] urlsWithParams = {
        "https://memex.marginalia.nu/index.php?id=1",
        "https://memex.marginalia.nu/showthread.php?id=1&count=5&tracking=123"
    };

    for (String rawUrl : urlsWithParams) {
        EdgeUrl parsed = new EdgeUrl(rawUrl);
        System.out.println(parsed.toString());
    }
}
@Test @Test
void urlencodeFixer() throws URISyntaxException { void urlencodeFixer() throws URISyntaxException {
System.out.println(EdgeUrl.urlencodeFixer("https://www.example.com/%-sign")); System.out.println(EdgeUrl.urlencodeFixer("https://www.example.com/%-sign"));