Preparations for new crawl round

vlofgren 2022-08-16 22:48:16 +02:00
parent 123675d73b
commit 5cfef610b0
7 changed files with 67 additions and 30 deletions

View File

@@ -35,7 +35,7 @@ public class SqlLoadUrls {
IN PATH_HASH BIGINT
)
BEGIN
INSERT IGNORE INTO EC_URL (PROTO,DOMAIN_ID,PORT,PATH,PATH_HASH) SELECT PROTO,ID,PORT,PATH,PATH_HASH FROM EC_DOMAIN WHERE DOMAIN_NAME=DOMAIN;
INSERT IGNORE INTO EC_URL (PROTO,DOMAIN_ID,PORT,PATH,PARAM,PATH_HASH) SELECT PROTO,ID,PORT,PATH,PARAM,PATH_HASH FROM EC_DOMAIN WHERE DOMAIN_NAME=DOMAIN;
END
""");
}
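The hunk above adds the query string as a separate PARAM column when URLs are loaded. A minimal sketch of how a call site could bind the extra argument; the procedure name INSERT_URL, the argument order, and the helper hashPath() are illustrative assumptions, not taken from the diff:

// Hypothetical call site; procedure name, argument order and hashPath() are assumptions.
try (var stmt = conn.prepareCall("CALL INSERT_URL(?, ?, ?, ?, ?, ?)")) {
    stmt.setString(1, url.proto);
    stmt.setString(2, url.domain.toString());
    stmt.setObject(3, url.port);              // nullable port
    stmt.setString(4, url.path);
    stmt.setString(5, url.param);             // new: query string, may be null
    stmt.setLong(6, hashPath(url.path));      // stand-in for the real path hash
    stmt.execute();
}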

View File

@@ -114,6 +114,7 @@ public class LinkParser {
}
private static final Pattern spaceRegex = Pattern.compile(" ");
private static final Pattern paramSeparatorPattern = Pattern.compile("\\?");
@SneakyThrows
private String resolveUrl(EdgeUrl baseUrl, String s) {
@@ -123,7 +124,7 @@ public class LinkParser {
return s;
}
String[] parts = s.split("\\?", 2);
String[] parts = paramSeparatorPattern.split(s, 2);
String path = parts[0];
String param;
if (parts.length > 1) {
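String.split recompiles its regex on every call, while the precompiled paramSeparatorPattern is created once and reused; Pattern instances are immutable and thread-safe, so this is fine in a shared parser. A small, self-contained illustration of the same idiom:

import java.util.regex.Pattern;

class SplitExample {
    // Compiled once, reused for every link that is resolved.
    private static final Pattern paramSeparatorPattern = Pattern.compile("\\?");

    static String[] splitPathAndParam(String s) {
        // Limit 2: everything after the first '?' stays in one piece.
        return paramSeparatorPattern.split(s, 2);
    }

    public static void main(String[] args) {
        String[] parts = splitPathAndParam("/index.php?id=1&x=a?b");
        System.out.println(parts[0]); // /index.php
        System.out.println(parts[1]); // id=1&x=a?b
    }
}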

View File

@@ -27,24 +27,21 @@ public class QueryParams {
}
public static boolean isPermittedParam(String path, String param) {
if (path.endsWith(".cgi")) return true;
if (param.startsWith("id=")) return true;
if (param.startsWith("p=")) return true;
if (param.startsWith("i=")) return true;
if (param.startsWith("t=")) return true;
if (param.startsWith("v=")) return true;
if (param.startsWith("post=")) return true;
if (path.endsWith("index.php")) {
if (param.startsWith("showtopic="))
return true;
if (param.startsWith("showforum="))
return true;
}
if (path.endsWith("viewtopic.php")) {
return (param.startsWith("t=") || param.startsWith("p="));
}
if (path.endsWith("viewforum.php")) {
return param.startsWith("v=");
}
if (path.endsWith("showthread.php")) {
return (param.startsWith("t=") || param.startsWith("p="));
}
if (path.endsWith("showforum.php")) {
return param.startsWith("v=");
}
if (path.endsWith("StoryView.py")) { // folklore.org is neat
return param.startsWith("project=") || param.startsWith("story=");
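The refactor makes the common parameter prefixes (id=, p=, i=, t=, v=, post=) acceptable on any path, which is why the separate viewtopic.php, viewforum.php, showthread.php and showforum.php branches could be dropped. A few illustrative calls, assuming no rule outside the shown hunk fires first:

QueryParams.isPermittedParam("/forum/viewtopic.php", "t=1234"); // true: "t=" is now permitted on any path
QueryParams.isPermittedParam("/forum/showthread.php", "p=99");  // true: likewise for "p="
QueryParams.isPermittedParam("/index.php", "showtopic=7");      // true: still special-cased for index.php
QueryParams.isPermittedParam("/index.php", "sort=desc");        // presumably false: not a permitted prefix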

View File

@@ -45,7 +45,13 @@ public class CrawlJobExtractorMain {
INDEXED DESC,
EC_DOMAIN.ID
""";
private static final String queuedDomainsSql =
"""
SELECT IFNULL(ID, -1), LOWER(CRAWL_QUEUE.DOMAIN_NAME)
FROM CRAWL_QUEUE
LEFT JOIN EC_DOMAIN
ON CRAWL_QUEUE.DOMAIN_NAME=EC_DOMAIN.DOMAIN_NAME
""";
private static final String urlsSql =
"""
SELECT URL
@@ -66,8 +72,8 @@ public class CrawlJobExtractorMain {
AND VISITED
;
""";
private static final int MIN_VISIT_COUNT = 100;
private static final int MAX_VISIT_COUNT = 5000;
private static final int MIN_VISIT_COUNT = 1000;
private static final int MAX_VISIT_COUNT = 100000;
private final EdgeDomainBlacklistImpl blacklist;
@@ -109,14 +115,25 @@ public class CrawlJobExtractorMain {
}
}
private record DomainWithId(String domainName, int id) {}
private record DomainWithId(String domainName, int id) {
}
private Stream<CrawlingSpecification> extractDomains() {
List<DomainWithId> ids = new ArrayList<>(100_000);
Set<DomainWithId> ids = new HashSet<>(1_000_000);
try (var stmt = conn.prepareStatement(domainsSql)) {
stmt.setFetchSize(10_000);
var rsp = stmt.executeQuery();
try (var stmtDomains = conn.prepareStatement(domainsSql);
var stmtQueue = conn.prepareStatement(queuedDomainsSql);
) {
ResultSet rsp;
stmtDomains.setFetchSize(10_000);
rsp = stmtDomains.executeQuery();
while (rsp.next()) {
ids.add(new DomainWithId(rsp.getString(2), rsp.getInt(1)));
}
stmtQueue.setFetchSize(10_000);
rsp = stmtQueue.executeQuery();
while (rsp.next()) {
ids.add(new DomainWithId(rsp.getString(2), rsp.getInt(1)));
}
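Collecting into a HashSet instead of a List matters here because the same domain can turn up both in EC_DOMAIN and in CRAWL_QUEUE; records get value-based equals() and hashCode() for free, so identical (name, id) pairs collapse to one entry. A small sketch of that behaviour, independent of the crawler:

import java.util.HashSet;
import java.util.Set;

class RecordDedupExample {
    private record DomainWithId(String domainName, int id) {}

    public static void main(String[] args) {
        Set<DomainWithId> ids = new HashSet<>();
        ids.add(new DomainWithId("memex.marginalia.nu", 42));
        ids.add(new DomainWithId("memex.marginalia.nu", 42)); // equal by value, not added again
        System.out.println(ids.size()); // 1
    }
}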
@@ -125,7 +142,6 @@ public class CrawlJobExtractorMain {
ex.printStackTrace();
}
Collections.shuffle(ids);
return ids.stream()
.filter(id -> !blacklist.isBlacklisted(id.id))
.map(this::createCrawlJobForDomain);
@@ -140,8 +156,7 @@ public class CrawlJobExtractorMain {
try (var stmt = conn.prepareStatement(urlsSql)) {
stmt.setFetchSize(1000);
stmt.setString(1, domainWithId.domainName);
stmt.setInt(2, domainWithId.id);
stmt.setInt(1, domainWithId.id);
var rsp = stmt.executeQuery();
while (rsp.next()) {
@@ -221,7 +236,7 @@ public class CrawlJobExtractorMain {
}
private int calculateCrawlDepthFromVisitedCount(int count) {
count = count + 100 + count / 4;
count = count + 1000 + count / 4;
if (count < MIN_VISIT_COUNT) {
count = MIN_VISIT_COUNT;
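The crawl budget per domain is derived from how many of its URLs were visited last round, then clamped between the new MIN_VISIT_COUNT and MAX_VISIT_COUNT. A sketch of the adjusted sizing, assuming the upper clamp below the shown hunk mirrors the lower one:

// MIN_VISIT_COUNT = 1000, MAX_VISIT_COUNT = 100000 after this commit.
static int calculateCrawlDepthFromVisitedCount(int count) {
    count = count + 1000 + count / 4;                       // previously: count + 100 + count / 4
    if (count < MIN_VISIT_COUNT) count = MIN_VISIT_COUNT;
    if (count > MAX_VISIT_COUNT) count = MAX_VISIT_COUNT;   // assumed symmetric clamp
    return count;
}
// e.g. 0 visited urls -> 1000, 4000 visited -> 6000, 200000 visited -> clamped to 100000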

View File

@@ -98,10 +98,25 @@ public class EdgeUrl implements WideHashable {
}
public String toString() {
String portPart = port == null ? "" : (":" + port);
String queryPart = param == null ? "" : ("?" + param);
StringBuilder sb = new StringBuilder(256);
return proto + "://" + domain + portPart + path + queryPart;
sb.append(proto);
sb.append("://");
sb.append(domain);
if (port != null) {
sb.append(':');
sb.append(port);
}
sb.append(path);
if (param != null) {
sb.append('?');
sb.append(param);
}
return sb.toString();
}
public String dir() {
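The rewritten toString() produces the same string as the old concatenation, appending the port and query segments only when they are non-null, but without building the intermediate portPart/queryPart strings. Roughly, assuming EdgeUrl keeps the parsed components verbatim:

new EdgeUrl("https://memex.marginalia.nu/").toString();
// -> "https://memex.marginalia.nu/"                      (no port, no param)
new EdgeUrl("https://memex.marginalia.nu:8080/index.php?id=1").toString();
// -> "https://memex.marginalia.nu:8080/index.php?id=1"   (port and permitted param preserved)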

View File

@@ -225,4 +225,9 @@ COLLATE utf8mb4_unicode_ci;
CREATE TABLE DATA_DOMAIN_HISTORY (
DOMAIN_NAME VARCHAR(255) PRIMARY KEY,
SCREENSHOT_DATE DATE DEFAULT NOW()
) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci;
CREATE TABLE CRAWL_QUEUE(
DOMAIN_NAME VARCHAR(255) UNIQUE,
SOURCE VARCHAR(255)
) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci;
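CRAWL_QUEUE is the table the new queuedDomainsSql query joins against, so domains can be queued for crawling before they exist in EC_DOMAIN. A minimal sketch of enqueueing a domain over plain JDBC; the SOURCE value is free-form and the snippet only uses the two columns defined above:

// Hypothetical enqueue helper; INSERT IGNORE relies on the UNIQUE constraint on DOMAIN_NAME.
try (var stmt = conn.prepareStatement(
        "INSERT IGNORE INTO CRAWL_QUEUE(DOMAIN_NAME, SOURCE) VALUES (?, ?)")) {
    stmt.setString(1, "memex.marginalia.nu");
    stmt.setString(2, "manual");
    stmt.executeUpdate();
}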

View File

@@ -10,7 +10,11 @@ class EdgeUrlTest {
public void testHashCode() throws URISyntaxException {
System.out.println(new EdgeUrl("https://memex.marginalia.nu").hashCode());
}
@Test
public void testParam() throws URISyntaxException {
System.out.println(new EdgeUrl("https://memex.marginalia.nu/index.php?id=1").toString());
System.out.println(new EdgeUrl("https://memex.marginalia.nu/showthread.php?id=1&count=5&tracking=123").toString());
}
@Test
void urlencodeFixer() throws URISyntaxException {
System.out.println(EdgeUrl.urlencodeFixer("https://www.example.com/%-sign"));