Fix a bug in redirect handling that caused the crawler to skip indexing some documents.

This commit is contained in:
vlofgren 2022-08-17 00:49:29 +02:00
parent ef97414edb
commit ce9abc00dc
2 changed files with 12 additions and 3 deletions

View File

@ -17,6 +17,7 @@ import org.slf4j.LoggerFactory;
import java.io.IOException; import java.io.IOException;
import java.net.InetAddress; import java.net.InetAddress;
import java.net.URISyntaxException;
import java.net.UnknownHostException; import java.net.UnknownHostException;
import java.time.LocalDateTime; import java.time.LocalDateTime;
import java.util.*; import java.util.*;
@ -163,7 +164,15 @@ public class CrawlerRetreiver {
var doc = fetchUrl(top); var doc = fetchUrl(top);
if (doc.isPresent()) { if (doc.isPresent()) {
fetchedCount++; fetchedCount++;
crawledDomainWriter.accept(doc.get());
var d = doc.get();
crawledDomainWriter.accept(d);
try {
visited.add(new EdgeUrl(d.url));
}
catch (URISyntaxException ex) {}
} }
long crawledTime = System.currentTimeMillis() - startTime; long crawledTime = System.currentTimeMillis() - startTime;

View File

@ -198,7 +198,7 @@ public class HttpFetcher {
private CrawledDocument extractBody(EdgeUrl url, Response rsp) throws IOException, URISyntaxException { private CrawledDocument extractBody(EdgeUrl url, Response rsp) throws IOException, URISyntaxException {
var responseUrl = new EdgeUrl(rsp.request().url().toString()); var responseUrl = new EdgeUrl(rsp.request().url().toString());
if (!responseUrl.equals(url)) { if (!Objects.equals(responseUrl.domain, url.domain)) {
return createRedirectResponse(url, rsp, responseUrl); return createRedirectResponse(url, rsp, responseUrl);
} }
@ -242,7 +242,7 @@ public class HttpFetcher {
.timestamp(LocalDateTime.now().toString()) .timestamp(LocalDateTime.now().toString())
.canonicalUrl(canonical) .canonicalUrl(canonical)
.httpStatus(rsp.code()) .httpStatus(rsp.code())
.url(url.toString()) .url(responseUrl.toString())
.documentBody(strData) .documentBody(strData)
.build(); .build();
} }