diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/CrawlerRetreiver.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/CrawlerRetreiver.java index 802211ce..0c6105e7 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/CrawlerRetreiver.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/CrawlerRetreiver.java @@ -17,6 +17,7 @@ import org.slf4j.LoggerFactory; import java.io.IOException; import java.net.InetAddress; +import java.net.URISyntaxException; import java.net.UnknownHostException; import java.time.LocalDateTime; import java.util.*; @@ -163,7 +164,15 @@ public class CrawlerRetreiver { var doc = fetchUrl(top); if (doc.isPresent()) { fetchedCount++; - crawledDomainWriter.accept(doc.get()); + + var d = doc.get(); + crawledDomainWriter.accept(d); + + try { + visited.add(new EdgeUrl(d.url)); + } + catch (URISyntaxException ex) {} + } long crawledTime = System.currentTimeMillis() - startTime; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/HttpFetcher.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/HttpFetcher.java index 967e0203..76a2e247 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/HttpFetcher.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/HttpFetcher.java @@ -198,7 +198,7 @@ public class HttpFetcher { private CrawledDocument extractBody(EdgeUrl url, Response rsp) throws IOException, URISyntaxException { var responseUrl = new EdgeUrl(rsp.request().url().toString()); - if (!responseUrl.equals(url)) { + if (!Objects.equals(responseUrl.domain, url.domain)) { return createRedirectResponse(url, rsp, responseUrl); } @@ -242,7 +242,7 @@ public class HttpFetcher { .timestamp(LocalDateTime.now().toString()) .canonicalUrl(canonical) .httpStatus(rsp.code()) - .url(url.toString()) + .url(responseUrl.toString()) .documentBody(strData) .build(); }