Fix bug in CrawlerRetreiver where the root URL wasn't always added properly to the front of the crawl queue.
parent a6a66c6d8a
commit fbdedf53de
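In short: CrawlerRetreiver only pushed the root URL to the front of the queue when crawlFrontier.addKnown(root) returned true, and addKnown() merely checked known.contains() without ever recording the URL, so whenever the root was not already in the frontier's known-set it was never front-queued. The fix moves the add-and-deduplicate step into DomainCrawlFrontier.addFirst() and has the retriever call it unconditionally. A rough before/after sketch, paraphrasing the hunks below rather than quoting the commit verbatim:

    // Before: the root was only front-queued if addKnown() returned true,
    // and addKnown() was just a contains() check that never added anything.
    if (crawlFrontier.addKnown(root))
        crawlFrontier.addFirst(root);

    // After: always ask the frontier to front-queue the root;
    // addFirst() itself records the URL as known and skips duplicates.
    crawlFrontier.addFirst(root);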
@@ -1,11 +1,12 @@
 package nu.marginalia.crawling.model.spec;
 
 import lombok.AllArgsConstructor;
+import lombok.Builder;
 import lombok.NoArgsConstructor;
 
 import java.util.List;
 
-@AllArgsConstructor @NoArgsConstructor
+@AllArgsConstructor @NoArgsConstructor @Builder
 public class CrawlingSpecification {
     public String id;
 
@@ -77,8 +77,8 @@ public class CrawlerRetreiver {
 
             // Ensure the index page is always crawled
             var root = fst.withPathAndParam("/", null);
-            if (crawlFrontier.addKnown(root))
-                crawlFrontier.addFirst(root);
+
+            crawlFrontier.addFirst(root);
         }
         else {
             // We know nothing about this domain, so we'll start with the index, trying both HTTP and HTTPS
@@ -43,11 +43,11 @@ public class DomainCrawlFrontier {
     public boolean isEmpty() {
         return queue.isEmpty();
     }
-    public boolean addKnown(EdgeUrl url) {
-        return known.contains(url.toString());
-    }
+
     public void addFirst(EdgeUrl url) {
-        queue.addFirst(url);
+        if (known.add(url.toString())) {
+            queue.addFirst(url);
+        }
     }
 
     public EdgeUrl takeNextUrl() {
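With this change, deduplication lives entirely inside the frontier: addFirst() records the URL as known and only queues it if it had not been seen before, so callers no longer need a separate addKnown() check. A minimal, self-contained sketch of that pattern, using a hypothetical FrontierSketch class rather than the real DomainCrawlFrontier (whose queue and known fields are assumed here to behave like a deque and a string set):

    import java.util.ArrayDeque;
    import java.util.HashSet;

    class FrontierSketch {
        private final ArrayDeque<String> queue = new ArrayDeque<>();
        private final HashSet<String> known = new HashSet<>();

        // Front-queue a URL exactly once; repeated calls are no-ops because
        // HashSet.add returns false for an element that is already present.
        void addFirst(String url) {
            if (known.add(url)) {
                queue.addFirst(url);
            }
        }
    }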
@@ -1,5 +1,6 @@
 package nu.marginalia.crawling.retreival;
 
+import lombok.SneakyThrows;
 import nu.marginalia.WmsaHome;
 import nu.marginalia.crawl.retreival.CrawlerRetreiver;
 import nu.marginalia.crawl.retreival.fetcher.HttpFetcher;
@@ -7,42 +8,93 @@ import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl;
import nu.marginalia.crawling.model.CrawledDocument;
import nu.marginalia.crawling.model.spec.CrawlingSpecification;
import nu.marginalia.crawling.model.SerializableCrawlData;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Tag;
import org.junit.jupiter.api.Test;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;

import static org.junit.jupiter.api.Assertions.assertTrue;

@Tag("slow")
class CrawlerRetreiverTest {
    private HttpFetcher httpFetcher;

    @BeforeEach
    public void setUp() {
        httpFetcher = new HttpFetcherImpl("search.marginalia.nu; testing a bit :D");
    }

    @SneakyThrows
    public static void setUpAll() {
        // this must be done to avoid java inserting its own user agent for the sitemap requests
        System.setProperty("http.agent", WmsaHome.getUserAgent().uaString());
    }

    @Test
    public void testEmptySet() throws IOException {
        System.setProperty("http.agent", WmsaHome.getUserAgent().uaString());
        // Tests the case when there are no URLs provided in the crawl set and the
        // crawler needs to guess the protocol

        var specs = new CrawlingSpecification("1", 5, "www.marginalia.nu", new ArrayList<>());

        HttpFetcher fetcher = new HttpFetcherImpl("search.marginalia.nu; testing a bit :D");
    public void testWithKnownDomains() {
        var specs = CrawlingSpecification
                .builder()
                .id("whatever")
                .crawlDepth(5)
                .domain("www.marginalia.nu")
                .urls(List.of("https://www.marginalia.nu/misc/debian-laptop-install-log/"))
                .build();

        List<SerializableCrawlData> data = new ArrayList<>();

        new CrawlerRetreiver(fetcher, specs, data::add).fetch();
        new CrawlerRetreiver(httpFetcher, specs, data::add).fetch();

        var fetchedUrls =
                data.stream().filter(CrawledDocument.class::isInstance)
                        .map(CrawledDocument.class::cast)
                        .map(doc -> doc.url)
                        .collect(Collectors.toSet());

        assertTrue(fetchedUrls.contains("https://www.marginalia.nu/"));
        assertTrue(fetchedUrls.contains("https://www.marginalia.nu/misc/debian-laptop-install-log/"));

        data.stream().filter(CrawledDocument.class::isInstance)
                .map(CrawledDocument.class::cast)
                .forEach(doc -> System.out.println(doc.url + "\t" + doc.crawlerStatus + "\t" + doc.httpStatus));

        /*
    }

    @Test
    public void testEmptySet() {

        var specs = CrawlingSpecification
                .builder()
                .id("whatever")
                .crawlDepth(5)
                .domain("www.marginalia.nu")
                .urls(List.of())
                .build();

        List<SerializableCrawlData> data = new ArrayList<>();

        new CrawlerRetreiver(httpFetcher, specs, data::add).fetch();

        data.stream().filter(CrawledDocument.class::isInstance)
                .map(CrawledDocument.class::cast)
                .forEach(doc -> System.out.println(doc.url + "\t" + doc.crawlerStatus + "\t" + doc.httpStatus));

        var fetchedUrls =
                data.stream().filter(CrawledDocument.class::isInstance)
                        .map(CrawledDocument.class::cast)
                        .map(doc -> doc.url)
                        .collect(Collectors.toSet());

        assertTrue(fetchedUrls.contains("https://www.marginalia.nu/"));

        Assertions.assertTrue(
                data.stream().filter(CrawledDocument.class::isInstance)
                        .map(CrawledDocument.class::cast)
                        .filter(doc -> "OK".equals(doc.crawlerStatus))
                        .count() > 1
                        .anyMatch(doc -> "OK".equals(doc.crawlerStatus))
        );
        */
    }

}