Preparations for new crawl round
This commit is contained in:
parent
123675d73b
commit
5cfef610b0
@ -35,7 +35,7 @@ public class SqlLoadUrls {
|
||||
IN PATH_HASH BIGINT
|
||||
)
|
||||
BEGIN
|
||||
INSERT IGNORE INTO EC_URL (PROTO,DOMAIN_ID,PORT,PATH,PATH_HASH) SELECT PROTO,ID,PORT,PATH,PATH_HASH FROM EC_DOMAIN WHERE DOMAIN_NAME=DOMAIN;
|
||||
INSERT IGNORE INTO EC_URL (PROTO,DOMAIN_ID,PORT,PATH,PARAM,PATH_HASH) SELECT PROTO,ID,PORT,PATH,PARAM,PATH_HASH FROM EC_DOMAIN WHERE DOMAIN_NAME=DOMAIN;
|
||||
END
|
||||
""");
|
||||
}
|
||||
|
@ -114,6 +114,7 @@ public class LinkParser {
|
||||
}
|
||||
|
||||
private static final Pattern spaceRegex = Pattern.compile(" ");
|
||||
private static final Pattern paramSeparatorPattern = Pattern.compile("\\?");
|
||||
|
||||
@SneakyThrows
|
||||
private String resolveUrl(EdgeUrl baseUrl, String s) {
|
||||
@ -123,7 +124,7 @@ public class LinkParser {
|
||||
return s;
|
||||
}
|
||||
|
||||
String[] parts = s.split("\\?", 2);
|
||||
String[] parts = paramSeparatorPattern.split(s, 2);
|
||||
String path = parts[0];
|
||||
String param;
|
||||
if (parts.length > 1) {
|
||||
|
@ -27,24 +27,21 @@ public class QueryParams {
|
||||
}
|
||||
|
||||
public static boolean isPermittedParam(String path, String param) {
|
||||
if (path.endsWith(".cgi")) return true;
|
||||
|
||||
if (param.startsWith("id=")) return true;
|
||||
if (param.startsWith("p=")) return true;
|
||||
if (param.startsWith("i=")) return true;
|
||||
if (param.startsWith("t=")) return true;
|
||||
if (param.startsWith("v=")) return true;
|
||||
if (param.startsWith("post=")) return true;
|
||||
|
||||
if (path.endsWith("index.php")) {
|
||||
if (param.startsWith("showtopic="))
|
||||
return true;
|
||||
if (param.startsWith("showforum="))
|
||||
return true;
|
||||
}
|
||||
if (path.endsWith("viewtopic.php")) {
|
||||
return (param.startsWith("t=") || param.startsWith("p="));
|
||||
}
|
||||
if (path.endsWith("viewforum.php")) {
|
||||
return param.startsWith("v=");
|
||||
}
|
||||
if (path.endsWith("showthread.php")) {
|
||||
return (param.startsWith("t=") || param.startsWith("p="));
|
||||
}
|
||||
if (path.endsWith("showforum.php")) {
|
||||
return param.startsWith("v=");
|
||||
}
|
||||
|
||||
if (path.endsWith("StoryView.py")) { // folklore.org is neat
|
||||
return param.startsWith("project=") || param.startsWith("story=");
|
||||
|
@ -45,7 +45,13 @@ public class CrawlJobExtractorMain {
|
||||
INDEXED DESC,
|
||||
EC_DOMAIN.ID
|
||||
""";
|
||||
|
||||
private static final String queuedDomainsSql =
|
||||
"""
|
||||
SELECT IFNULL(ID, -1), LOWER(CRAWL_QUEUE.DOMAIN_NAME)
|
||||
FROM CRAWL_QUEUE
|
||||
LEFT JOIN EC_DOMAIN
|
||||
ON CRAWL_QUEUE.DOMAIN_NAME=EC_DOMAIN.DOMAIN_NAME
|
||||
""";
|
||||
private static final String urlsSql =
|
||||
"""
|
||||
SELECT URL
|
||||
@ -66,8 +72,8 @@ public class CrawlJobExtractorMain {
|
||||
AND VISITED
|
||||
;
|
||||
""";
|
||||
private static final int MIN_VISIT_COUNT = 100;
|
||||
private static final int MAX_VISIT_COUNT = 5000;
|
||||
private static final int MIN_VISIT_COUNT = 1000;
|
||||
private static final int MAX_VISIT_COUNT = 100000;
|
||||
|
||||
private final EdgeDomainBlacklistImpl blacklist;
|
||||
|
||||
@ -109,14 +115,25 @@ public class CrawlJobExtractorMain {
|
||||
}
|
||||
}
|
||||
|
||||
private record DomainWithId(String domainName, int id) {}
|
||||
private record DomainWithId(String domainName, int id) {
|
||||
}
|
||||
|
||||
private Stream<CrawlingSpecification> extractDomains() {
|
||||
List<DomainWithId> ids = new ArrayList<>(100_000);
|
||||
Set<DomainWithId> ids = new HashSet<>(1_000_000);
|
||||
|
||||
try (var stmt = conn.prepareStatement(domainsSql)) {
|
||||
stmt.setFetchSize(10_000);
|
||||
var rsp = stmt.executeQuery();
|
||||
try (var stmtDomains = conn.prepareStatement(domainsSql);
|
||||
var stmtQueue = conn.prepareStatement(queuedDomainsSql);
|
||||
) {
|
||||
ResultSet rsp;
|
||||
|
||||
stmtDomains.setFetchSize(10_000);
|
||||
rsp = stmtDomains.executeQuery();
|
||||
while (rsp.next()) {
|
||||
ids.add(new DomainWithId(rsp.getString(2), rsp.getInt(1)));
|
||||
}
|
||||
|
||||
stmtQueue.setFetchSize(10_000);
|
||||
rsp = stmtQueue.executeQuery();
|
||||
while (rsp.next()) {
|
||||
ids.add(new DomainWithId(rsp.getString(2), rsp.getInt(1)));
|
||||
}
|
||||
@ -125,7 +142,6 @@ public class CrawlJobExtractorMain {
|
||||
ex.printStackTrace();
|
||||
}
|
||||
|
||||
Collections.shuffle(ids);
|
||||
return ids.stream()
|
||||
.filter(id -> !blacklist.isBlacklisted(id.id))
|
||||
.map(this::createCrawlJobForDomain);
|
||||
@ -140,8 +156,7 @@ public class CrawlJobExtractorMain {
|
||||
|
||||
try (var stmt = conn.prepareStatement(urlsSql)) {
|
||||
stmt.setFetchSize(1000);
|
||||
stmt.setString(1, domainWithId.domainName);
|
||||
stmt.setInt(2, domainWithId.id);
|
||||
stmt.setInt(1, domainWithId.id);
|
||||
var rsp = stmt.executeQuery();
|
||||
|
||||
while (rsp.next()) {
|
||||
@ -221,7 +236,7 @@ public class CrawlJobExtractorMain {
|
||||
}
|
||||
|
||||
private int calculateCrawlDepthFromVisitedCount(int count) {
|
||||
count = count + 100 + count / 4;
|
||||
count = count + 1000 + count / 4;
|
||||
|
||||
if (count < MIN_VISIT_COUNT) {
|
||||
count = MIN_VISIT_COUNT;
|
||||
|
@ -98,10 +98,25 @@ public class EdgeUrl implements WideHashable {
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
String portPart = port == null ? "" : (":" + port);
|
||||
String queryPart = param == null ? "" : ("?" + param);
|
||||
StringBuilder sb = new StringBuilder(256);
|
||||
|
||||
return proto + "://" + domain + portPart + path + queryPart;
|
||||
sb.append(proto);
|
||||
sb.append("://");
|
||||
sb.append(domain);
|
||||
|
||||
if (port != null) {
|
||||
sb.append(':');
|
||||
sb.append(port);
|
||||
}
|
||||
|
||||
sb.append(path);
|
||||
|
||||
if (param != null) {
|
||||
sb.append('?');
|
||||
sb.append(param);
|
||||
}
|
||||
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
public String dir() {
|
||||
|
@ -226,3 +226,8 @@ CREATE TABLE DATA_DOMAIN_HISTORY (
|
||||
DOMAIN_NAME VARCHAR(255) PRIMARY KEY,
|
||||
SCREENSHOT_DATE DATE DEFAULT NOW()
|
||||
) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci;
|
||||
|
||||
CREATE TABLE CRAWL_QUEUE(
|
||||
DOMAIN_NAME VARCHAR(255) UNIQUE,
|
||||
SOURCE VARCHAR(255)
|
||||
) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci;
|
@ -10,7 +10,11 @@ class EdgeUrlTest {
|
||||
public void testHashCode() throws URISyntaxException {
|
||||
System.out.println(new EdgeUrl("https://memex.marginalia.nu").hashCode());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testParam() throws URISyntaxException {
|
||||
System.out.println(new EdgeUrl("https://memex.marginalia.nu/index.php?id=1").toString());
|
||||
System.out.println(new EdgeUrl("https://memex.marginalia.nu/showthread.php?id=1&count=5&tracking=123").toString());
|
||||
}
|
||||
@Test
|
||||
void urlencodeFixer() throws URISyntaxException {
|
||||
System.out.println(EdgeUrl.urlencodeFixer("https://www.example.com/%-sign"));
|
||||
|
Loading…
Reference in New Issue
Block a user