Preparations for new crawl round
This commit is contained in:
parent
123675d73b
commit
5cfef610b0
@ -35,7 +35,7 @@ public class SqlLoadUrls {
|
|||||||
IN PATH_HASH BIGINT
|
IN PATH_HASH BIGINT
|
||||||
)
|
)
|
||||||
BEGIN
|
BEGIN
|
||||||
INSERT IGNORE INTO EC_URL (PROTO,DOMAIN_ID,PORT,PATH,PATH_HASH) SELECT PROTO,ID,PORT,PATH,PATH_HASH FROM EC_DOMAIN WHERE DOMAIN_NAME=DOMAIN;
|
INSERT IGNORE INTO EC_URL (PROTO,DOMAIN_ID,PORT,PATH,PARAM,PATH_HASH) SELECT PROTO,ID,PORT,PATH,PARAM,PATH_HASH FROM EC_DOMAIN WHERE DOMAIN_NAME=DOMAIN;
|
||||||
END
|
END
|
||||||
""");
|
""");
|
||||||
}
|
}
|
||||||
|
@ -114,6 +114,7 @@ public class LinkParser {
|
|||||||
}
|
}
|
||||||
|
|
||||||
private static final Pattern spaceRegex = Pattern.compile(" ");
|
private static final Pattern spaceRegex = Pattern.compile(" ");
|
||||||
|
private static final Pattern paramSeparatorPattern = Pattern.compile("\\?");
|
||||||
|
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
private String resolveUrl(EdgeUrl baseUrl, String s) {
|
private String resolveUrl(EdgeUrl baseUrl, String s) {
|
||||||
@ -123,7 +124,7 @@ public class LinkParser {
|
|||||||
return s;
|
return s;
|
||||||
}
|
}
|
||||||
|
|
||||||
String[] parts = s.split("\\?", 2);
|
String[] parts = paramSeparatorPattern.split(s, 2);
|
||||||
String path = parts[0];
|
String path = parts[0];
|
||||||
String param;
|
String param;
|
||||||
if (parts.length > 1) {
|
if (parts.length > 1) {
|
||||||
|
@ -27,24 +27,21 @@ public class QueryParams {
|
|||||||
}
|
}
|
||||||
|
|
||||||
public static boolean isPermittedParam(String path, String param) {
|
public static boolean isPermittedParam(String path, String param) {
|
||||||
|
if (path.endsWith(".cgi")) return true;
|
||||||
|
|
||||||
|
if (param.startsWith("id=")) return true;
|
||||||
|
if (param.startsWith("p=")) return true;
|
||||||
|
if (param.startsWith("i=")) return true;
|
||||||
|
if (param.startsWith("t=")) return true;
|
||||||
|
if (param.startsWith("v=")) return true;
|
||||||
|
if (param.startsWith("post=")) return true;
|
||||||
|
|
||||||
if (path.endsWith("index.php")) {
|
if (path.endsWith("index.php")) {
|
||||||
if (param.startsWith("showtopic="))
|
if (param.startsWith("showtopic="))
|
||||||
return true;
|
return true;
|
||||||
if (param.startsWith("showforum="))
|
if (param.startsWith("showforum="))
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
if (path.endsWith("viewtopic.php")) {
|
|
||||||
return (param.startsWith("t=") || param.startsWith("p="));
|
|
||||||
}
|
|
||||||
if (path.endsWith("viewforum.php")) {
|
|
||||||
return param.startsWith("v=");
|
|
||||||
}
|
|
||||||
if (path.endsWith("showthread.php")) {
|
|
||||||
return (param.startsWith("t=") || param.startsWith("p="));
|
|
||||||
}
|
|
||||||
if (path.endsWith("showforum.php")) {
|
|
||||||
return param.startsWith("v=");
|
|
||||||
}
|
|
||||||
|
|
||||||
if (path.endsWith("StoryView.py")) { // folklore.org is neat
|
if (path.endsWith("StoryView.py")) { // folklore.org is neat
|
||||||
return param.startsWith("project=") || param.startsWith("story=");
|
return param.startsWith("project=") || param.startsWith("story=");
|
||||||
|
@ -45,7 +45,13 @@ public class CrawlJobExtractorMain {
|
|||||||
INDEXED DESC,
|
INDEXED DESC,
|
||||||
EC_DOMAIN.ID
|
EC_DOMAIN.ID
|
||||||
""";
|
""";
|
||||||
|
private static final String queuedDomainsSql =
|
||||||
|
"""
|
||||||
|
SELECT IFNULL(ID, -1), LOWER(CRAWL_QUEUE.DOMAIN_NAME)
|
||||||
|
FROM CRAWL_QUEUE
|
||||||
|
LEFT JOIN EC_DOMAIN
|
||||||
|
ON CRAWL_QUEUE.DOMAIN_NAME=EC_DOMAIN.DOMAIN_NAME
|
||||||
|
""";
|
||||||
private static final String urlsSql =
|
private static final String urlsSql =
|
||||||
"""
|
"""
|
||||||
SELECT URL
|
SELECT URL
|
||||||
@ -66,8 +72,8 @@ public class CrawlJobExtractorMain {
|
|||||||
AND VISITED
|
AND VISITED
|
||||||
;
|
;
|
||||||
""";
|
""";
|
||||||
private static final int MIN_VISIT_COUNT = 100;
|
private static final int MIN_VISIT_COUNT = 1000;
|
||||||
private static final int MAX_VISIT_COUNT = 5000;
|
private static final int MAX_VISIT_COUNT = 100000;
|
||||||
|
|
||||||
private final EdgeDomainBlacklistImpl blacklist;
|
private final EdgeDomainBlacklistImpl blacklist;
|
||||||
|
|
||||||
@ -109,14 +115,25 @@ public class CrawlJobExtractorMain {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private record DomainWithId(String domainName, int id) {}
|
private record DomainWithId(String domainName, int id) {
|
||||||
|
}
|
||||||
|
|
||||||
private Stream<CrawlingSpecification> extractDomains() {
|
private Stream<CrawlingSpecification> extractDomains() {
|
||||||
List<DomainWithId> ids = new ArrayList<>(100_000);
|
Set<DomainWithId> ids = new HashSet<>(1_000_000);
|
||||||
|
|
||||||
try (var stmt = conn.prepareStatement(domainsSql)) {
|
try (var stmtDomains = conn.prepareStatement(domainsSql);
|
||||||
stmt.setFetchSize(10_000);
|
var stmtQueue = conn.prepareStatement(queuedDomainsSql);
|
||||||
var rsp = stmt.executeQuery();
|
) {
|
||||||
|
ResultSet rsp;
|
||||||
|
|
||||||
|
stmtDomains.setFetchSize(10_000);
|
||||||
|
rsp = stmtDomains.executeQuery();
|
||||||
|
while (rsp.next()) {
|
||||||
|
ids.add(new DomainWithId(rsp.getString(2), rsp.getInt(1)));
|
||||||
|
}
|
||||||
|
|
||||||
|
stmtQueue.setFetchSize(10_000);
|
||||||
|
rsp = stmtQueue.executeQuery();
|
||||||
while (rsp.next()) {
|
while (rsp.next()) {
|
||||||
ids.add(new DomainWithId(rsp.getString(2), rsp.getInt(1)));
|
ids.add(new DomainWithId(rsp.getString(2), rsp.getInt(1)));
|
||||||
}
|
}
|
||||||
@ -125,7 +142,6 @@ public class CrawlJobExtractorMain {
|
|||||||
ex.printStackTrace();
|
ex.printStackTrace();
|
||||||
}
|
}
|
||||||
|
|
||||||
Collections.shuffle(ids);
|
|
||||||
return ids.stream()
|
return ids.stream()
|
||||||
.filter(id -> !blacklist.isBlacklisted(id.id))
|
.filter(id -> !blacklist.isBlacklisted(id.id))
|
||||||
.map(this::createCrawlJobForDomain);
|
.map(this::createCrawlJobForDomain);
|
||||||
@ -140,8 +156,7 @@ public class CrawlJobExtractorMain {
|
|||||||
|
|
||||||
try (var stmt = conn.prepareStatement(urlsSql)) {
|
try (var stmt = conn.prepareStatement(urlsSql)) {
|
||||||
stmt.setFetchSize(1000);
|
stmt.setFetchSize(1000);
|
||||||
stmt.setString(1, domainWithId.domainName);
|
stmt.setInt(1, domainWithId.id);
|
||||||
stmt.setInt(2, domainWithId.id);
|
|
||||||
var rsp = stmt.executeQuery();
|
var rsp = stmt.executeQuery();
|
||||||
|
|
||||||
while (rsp.next()) {
|
while (rsp.next()) {
|
||||||
@ -221,7 +236,7 @@ public class CrawlJobExtractorMain {
|
|||||||
}
|
}
|
||||||
|
|
||||||
private int calculateCrawlDepthFromVisitedCount(int count) {
|
private int calculateCrawlDepthFromVisitedCount(int count) {
|
||||||
count = count + 100 + count / 4;
|
count = count + 1000 + count / 4;
|
||||||
|
|
||||||
if (count < MIN_VISIT_COUNT) {
|
if (count < MIN_VISIT_COUNT) {
|
||||||
count = MIN_VISIT_COUNT;
|
count = MIN_VISIT_COUNT;
|
||||||
|
@ -98,10 +98,25 @@ public class EdgeUrl implements WideHashable {
|
|||||||
}
|
}
|
||||||
|
|
||||||
public String toString() {
|
public String toString() {
|
||||||
String portPart = port == null ? "" : (":" + port);
|
StringBuilder sb = new StringBuilder(256);
|
||||||
String queryPart = param == null ? "" : ("?" + param);
|
|
||||||
|
|
||||||
return proto + "://" + domain + portPart + path + queryPart;
|
sb.append(proto);
|
||||||
|
sb.append("://");
|
||||||
|
sb.append(domain);
|
||||||
|
|
||||||
|
if (port != null) {
|
||||||
|
sb.append(':');
|
||||||
|
sb.append(port);
|
||||||
|
}
|
||||||
|
|
||||||
|
sb.append(path);
|
||||||
|
|
||||||
|
if (param != null) {
|
||||||
|
sb.append('?');
|
||||||
|
sb.append(param);
|
||||||
|
}
|
||||||
|
|
||||||
|
return sb.toString();
|
||||||
}
|
}
|
||||||
|
|
||||||
public String dir() {
|
public String dir() {
|
||||||
|
@ -225,4 +225,9 @@ COLLATE utf8mb4_unicode_ci;
|
|||||||
CREATE TABLE DATA_DOMAIN_HISTORY (
|
CREATE TABLE DATA_DOMAIN_HISTORY (
|
||||||
DOMAIN_NAME VARCHAR(255) PRIMARY KEY,
|
DOMAIN_NAME VARCHAR(255) PRIMARY KEY,
|
||||||
SCREENSHOT_DATE DATE DEFAULT NOW()
|
SCREENSHOT_DATE DATE DEFAULT NOW()
|
||||||
|
) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci;
|
||||||
|
|
||||||
|
CREATE TABLE CRAWL_QUEUE(
|
||||||
|
DOMAIN_NAME VARCHAR(255) UNIQUE,
|
||||||
|
SOURCE VARCHAR(255)
|
||||||
) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci;
|
) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci;
|
@ -10,7 +10,11 @@ class EdgeUrlTest {
|
|||||||
public void testHashCode() throws URISyntaxException {
|
public void testHashCode() throws URISyntaxException {
|
||||||
System.out.println(new EdgeUrl("https://memex.marginalia.nu").hashCode());
|
System.out.println(new EdgeUrl("https://memex.marginalia.nu").hashCode());
|
||||||
}
|
}
|
||||||
|
@Test
|
||||||
|
public void testParam() throws URISyntaxException {
|
||||||
|
System.out.println(new EdgeUrl("https://memex.marginalia.nu/index.php?id=1").toString());
|
||||||
|
System.out.println(new EdgeUrl("https://memex.marginalia.nu/showthread.php?id=1&count=5&tracking=123").toString());
|
||||||
|
}
|
||||||
@Test
|
@Test
|
||||||
void urlencodeFixer() throws URISyntaxException {
|
void urlencodeFixer() throws URISyntaxException {
|
||||||
System.out.println(EdgeUrl.urlencodeFixer("https://www.example.com/%-sign"));
|
System.out.println(EdgeUrl.urlencodeFixer("https://www.example.com/%-sign"));
|
||||||
|
Loading…
Reference in New Issue
Block a user