Merge pull request 'Prepare for new crawl round' (#87) from master into release

Reviewed-on: https://git.marginalia.nu/marginalia/marginalia.nu/pulls/87
This commit is contained in:
Viktor Lofgren 2022-08-16 22:53:20 +02:00
commit 5f2258d459
9 changed files with 69 additions and 32 deletions

View File

@ -35,7 +35,7 @@ public class SqlLoadUrls {
IN PATH_HASH BIGINT
)
BEGIN
INSERT IGNORE INTO EC_URL (PROTO,DOMAIN_ID,PORT,PATH,PATH_HASH) SELECT PROTO,ID,PORT,PATH,PATH_HASH FROM EC_DOMAIN WHERE DOMAIN_NAME=DOMAIN;
INSERT IGNORE INTO EC_URL (PROTO,DOMAIN_ID,PORT,PATH,PARAM,PATH_HASH) SELECT PROTO,ID,PORT,PATH,PARAM,PATH_HASH FROM EC_DOMAIN WHERE DOMAIN_NAME=DOMAIN;
END
""");
}

View File

@ -114,6 +114,7 @@ public class LinkParser {
}
private static final Pattern spaceRegex = Pattern.compile(" ");
// Matches a literal '?'; resolveUrl uses it to split a link into at most two parts (path, query parameters).
private static final Pattern paramSeparatorPattern = Pattern.compile("\\?");
@SneakyThrows
private String resolveUrl(EdgeUrl baseUrl, String s) {
@ -123,7 +124,7 @@ public class LinkParser {
return s;
}
String[] parts = s.split("\\?", 2);
String[] parts = paramSeparatorPattern.split(s, 2);
String path = parts[0];
String param;
if (parts.length > 1) {

View File

@ -27,24 +27,21 @@ public class QueryParams {
}
public static boolean isPermittedParam(String path, String param) {
if (path.endsWith(".cgi")) return true;
if (param.startsWith("id=")) return true;
if (param.startsWith("p=")) return true;
if (param.startsWith("i=")) return true;
if (param.startsWith("t=")) return true;
if (param.startsWith("v=")) return true;
if (param.startsWith("post=")) return true;
if (path.endsWith("index.php")) {
if (param.startsWith("showtopic="))
return true;
if (param.startsWith("showforum="))
return true;
}
if (path.endsWith("viewtopic.php")) {
return (param.startsWith("t=") || param.startsWith("p="));
}
if (path.endsWith("viewforum.php")) {
return param.startsWith("v=");
}
if (path.endsWith("showthread.php")) {
return (param.startsWith("t=") || param.startsWith("p="));
}
if (path.endsWith("showforum.php")) {
return param.startsWith("v=");
}
if (path.endsWith("StoryView.py")) { // folklore.org is neat
return param.startsWith("project=") || param.startsWith("story=");

View File

@ -45,7 +45,13 @@ public class CrawlJobExtractorMain {
INDEXED DESC,
EC_DOMAIN.ID
""";
/**
 * Selects every domain listed in CRAWL_QUEUE together with its EC_DOMAIN id.
 * The LEFT JOIN keeps queued domains that have no matching EC_DOMAIN row;
 * for those the id column is reported as -1. Domain names are returned
 * lower-cased. Column order is (id, domain name).
 */
private static final String queuedDomainsSql =
"""
SELECT IFNULL(ID, -1), LOWER(CRAWL_QUEUE.DOMAIN_NAME)
FROM CRAWL_QUEUE
LEFT JOIN EC_DOMAIN
ON CRAWL_QUEUE.DOMAIN_NAME=EC_DOMAIN.DOMAIN_NAME
""";
private static final String urlsSql =
"""
SELECT URL
@ -66,8 +72,8 @@ public class CrawlJobExtractorMain {
AND VISITED
;
""";
private static final int MIN_VISIT_COUNT = 100;
private static final int MAX_VISIT_COUNT = 5000;
private static final int MIN_VISIT_COUNT = 1000;
private static final int MAX_VISIT_COUNT = 100000;
private final EdgeDomainBlacklistImpl blacklist;
@ -109,14 +115,25 @@ public class CrawlJobExtractorMain {
}
}
private record DomainWithId(String domainName, int id) {}
private record DomainWithId(String domainName, int id) {
}
private Stream<CrawlingSpecification> extractDomains() {
List<DomainWithId> ids = new ArrayList<>(100_000);
Set<DomainWithId> ids = new HashSet<>(1_000_000);
try (var stmt = conn.prepareStatement(domainsSql)) {
stmt.setFetchSize(10_000);
var rsp = stmt.executeQuery();
try (var stmtDomains = conn.prepareStatement(domainsSql);
var stmtQueue = conn.prepareStatement(queuedDomainsSql);
) {
ResultSet rsp;
stmtDomains.setFetchSize(10_000);
rsp = stmtDomains.executeQuery();
while (rsp.next()) {
ids.add(new DomainWithId(rsp.getString(2), rsp.getInt(1)));
}
stmtQueue.setFetchSize(10_000);
rsp = stmtQueue.executeQuery();
while (rsp.next()) {
ids.add(new DomainWithId(rsp.getString(2), rsp.getInt(1)));
}
@ -125,7 +142,6 @@ public class CrawlJobExtractorMain {
ex.printStackTrace();
}
Collections.shuffle(ids);
return ids.stream()
.filter(id -> !blacklist.isBlacklisted(id.id))
.map(this::createCrawlJobForDomain);
@ -140,8 +156,7 @@ public class CrawlJobExtractorMain {
try (var stmt = conn.prepareStatement(urlsSql)) {
stmt.setFetchSize(1000);
stmt.setString(1, domainWithId.domainName);
stmt.setInt(2, domainWithId.id);
stmt.setInt(1, domainWithId.id);
var rsp = stmt.executeQuery();
while (rsp.next()) {
@ -221,7 +236,7 @@ public class CrawlJobExtractorMain {
}
private int calculateCrawlDepthFromVisitedCount(int count) {
count = count + 100 + count / 4;
count = count + 1000 + count / 4;
if (count < MIN_VISIT_COUNT) {
count = MIN_VISIT_COUNT;

View File

@ -35,7 +35,7 @@ public class EdgeDomainBlacklistImpl implements EdgeDomainBlacklist {
spamDomainSet = getSpamDomains();
if (oldSetSize == 0) {
if (oldSetSize == 0 && spamDomainSet.size() > 0) {
logger.info("Synchronized {} spam domains", spamDomainSet.size());
}
}

View File

@ -126,7 +126,7 @@ public class EdgeIndexBucket {
query = indexReader.findWord(block, budget, filter, orderedIncludes[0]);
}
int i;
for (i = 1; (i < 2 && i < orderedIncludes.length) || i < orderedIncludes.length-1; i++) {
for (i = 1; (i < 3 && i < orderedIncludes.length) || i < orderedIncludes.length-1; i++) {
query = query.alsoCached(orderedIncludes[i]);
}
for (; i < orderedIncludes.length; i++) {

View File

@ -98,10 +98,25 @@ public class EdgeUrl implements WideHashable {
}
/**
 * Renders this URL as {@code proto://domain[:port]path[?param]}.
 * The port and the query string are only included when the corresponding
 * field is non-null.
 */
public String toString() {
String portPart = port == null ? "" : (":" + port);
String queryPart = param == null ? "" : ("?" + param);
StringBuilder sb = new StringBuilder(256);
return proto + "://" + domain + portPart + path + queryPart;
sb.append(proto);
sb.append("://");
sb.append(domain);
if (port != null) {
sb.append(':');
sb.append(port);
}
sb.append(path);
if (param != null) {
sb.append('?');
sb.append(param);
}
return sb.toString();
}
public String dir() {

View File

@ -225,4 +225,9 @@ COLLATE utf8mb4_unicode_ci;
CREATE TABLE DATA_DOMAIN_HISTORY (
DOMAIN_NAME VARCHAR(255) PRIMARY KEY,
SCREENSHOT_DATE DATE DEFAULT NOW()
) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci;
-- Domains queued for crawling; the UNIQUE constraint prevents the same DOMAIN_NAME from being queued twice.
-- NOTE(review): SOURCE presumably records where the queue entry came from; confirm against the code that populates it.
CREATE TABLE CRAWL_QUEUE(
DOMAIN_NAME VARCHAR(255) UNIQUE,
SOURCE VARCHAR(255)
) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci;

View File

@ -10,7 +10,11 @@ class EdgeUrlTest {
public void testHashCode() throws URISyntaxException {
System.out.println(new EdgeUrl("https://memex.marginalia.nu").hashCode());
}
// Smoke test: builds EdgeUrl instances from URLs that carry query parameters and prints their string form.
// NOTE(review): nothing is asserted, so this only fails if the constructor or toString() throws.
@Test
public void testParam() throws URISyntaxException {
System.out.println(new EdgeUrl("https://memex.marginalia.nu/index.php?id=1").toString());
System.out.println(new EdgeUrl("https://memex.marginalia.nu/showthread.php?id=1&count=5&tracking=123").toString());
}
@Test
void urlencodeFixer() throws URISyntaxException {
System.out.println(EdgeUrl.urlencodeFixer("https://www.example.com/%-sign"));