Fix bug in CrawlerRetreiver

... where the root URL wasn't always added properly to the front of the crawl queue.
This commit is contained in:
Viktor Lofgren 2023-06-27 15:50:38 +02:00
parent a6a66c6d8a
commit fbdedf53de
4 changed files with 74 additions and 21 deletions

View File

@ -1,11 +1,12 @@
package nu.marginalia.crawling.model.spec;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.NoArgsConstructor;
import java.util.List;
@AllArgsConstructor @NoArgsConstructor
@AllArgsConstructor @NoArgsConstructor @Builder
public class CrawlingSpecification {
public String id;

View File

@ -77,8 +77,8 @@ public class CrawlerRetreiver {
// Ensure the index page is always crawled
var root = fst.withPathAndParam("/", null);
if (crawlFrontier.addKnown(root))
crawlFrontier.addFirst(root);
crawlFrontier.addFirst(root);
}
else {
// We know nothing about this domain, so we'll start with the index, trying both HTTP and HTTPS

View File

@ -43,11 +43,11 @@ public class DomainCrawlFrontier {
public boolean isEmpty() {
return queue.isEmpty();
}
public boolean addKnown(EdgeUrl url) {
return known.contains(url.toString());
}
public void addFirst(EdgeUrl url) {
queue.addFirst(url);
if (known.add(url.toString())) {
queue.addFirst(url);
}
}
public EdgeUrl takeNextUrl() {

View File

@ -1,5 +1,6 @@
package nu.marginalia.crawling.retreival;
import lombok.SneakyThrows;
import nu.marginalia.WmsaHome;
import nu.marginalia.crawl.retreival.CrawlerRetreiver;
import nu.marginalia.crawl.retreival.fetcher.HttpFetcher;
@ -7,42 +8,93 @@ import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl;
import nu.marginalia.crawling.model.CrawledDocument;
import nu.marginalia.crawling.model.spec.CrawlingSpecification;
import nu.marginalia.crawling.model.SerializableCrawlData;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Tag;
import org.junit.jupiter.api.Test;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;
import static org.junit.jupiter.api.Assertions.assertTrue;
@Tag("slow")
class CrawlerRetreiverTest {
private HttpFetcher httpFetcher;
@BeforeEach
public void setUp() {
httpFetcher = new HttpFetcherImpl("search.marginalia.nu; testing a bit :D");
}
@SneakyThrows
public static void setUpAll() {
// this must be done to avoid java inserting its own user agent for the sitemap requests
System.setProperty("http.agent", WmsaHome.getUserAgent().uaString());
}
@Test
public void testEmptySet() throws IOException {
System.setProperty("http.agent", WmsaHome.getUserAgent().uaString());
// Tests the case when there are no URLs provided in the crawl set and the
// crawler needs to guess the protocol
var specs = new CrawlingSpecification("1", 5, "www.marginalia.nu", new ArrayList<>());
HttpFetcher fetcher = new HttpFetcherImpl("search.marginalia.nu; testing a bit :D");
public void testWithKnownDomains() {
var specs = CrawlingSpecification
.builder()
.id("whatever")
.crawlDepth(5)
.domain("www.marginalia.nu")
.urls(List.of("https://www.marginalia.nu/misc/debian-laptop-install-log/"))
.build();
List<SerializableCrawlData> data = new ArrayList<>();
new CrawlerRetreiver(fetcher, specs, data::add).fetch();
new CrawlerRetreiver(httpFetcher, specs, data::add).fetch();
var fetchedUrls =
data.stream().filter(CrawledDocument.class::isInstance)
.map(CrawledDocument.class::cast)
.map(doc -> doc.url)
.collect(Collectors.toSet());
assertTrue(fetchedUrls.contains("https://www.marginalia.nu/"));
assertTrue(fetchedUrls.contains("https://www.marginalia.nu/misc/debian-laptop-install-log/"));
data.stream().filter(CrawledDocument.class::isInstance)
.map(CrawledDocument.class::cast)
.forEach(doc -> System.out.println(doc.url + "\t" + doc.crawlerStatus + "\t" + doc.httpStatus));
/*
}
@Test
public void testEmptySet() {
var specs = CrawlingSpecification
.builder()
.id("whatever")
.crawlDepth(5)
.domain("www.marginalia.nu")
.urls(List.of())
.build();
List<SerializableCrawlData> data = new ArrayList<>();
new CrawlerRetreiver(httpFetcher, specs, data::add).fetch();
data.stream().filter(CrawledDocument.class::isInstance)
.map(CrawledDocument.class::cast)
.forEach(doc -> System.out.println(doc.url + "\t" + doc.crawlerStatus + "\t" + doc.httpStatus));
var fetchedUrls =
data.stream().filter(CrawledDocument.class::isInstance)
.map(CrawledDocument.class::cast)
.map(doc -> doc.url)
.collect(Collectors.toSet());
assertTrue(fetchedUrls.contains("https://www.marginalia.nu/"));
Assertions.assertTrue(
data.stream().filter(CrawledDocument.class::isInstance)
.map(CrawledDocument.class::cast)
.filter(doc -> "OK".equals(doc.crawlerStatus))
.count() > 1
.anyMatch(doc -> "OK".equals(doc.crawlerStatus))
);
*/
}
}