Fix a bug in redirect handling that caused the crawler to skip indexing some documents.
parent ef97414edb
commit ce9abc00dc
@@ -17,6 +17,7 @@ import org.slf4j.LoggerFactory;
 
 import java.io.IOException;
 import java.net.InetAddress;
+import java.net.URISyntaxException;
 import java.net.UnknownHostException;
 import java.time.LocalDateTime;
 import java.util.*;
@@ -163,7 +164,15 @@ public class CrawlerRetreiver {
                 var doc = fetchUrl(top);
                 if (doc.isPresent()) {
                     fetchedCount++;
-                    crawledDomainWriter.accept(doc.get());
+                    var d = doc.get();
+                    crawledDomainWriter.accept(d);
+
+                    try {
+                        visited.add(new EdgeUrl(d.url));
+                    }
+                    catch (URISyntaxException ex) {}
+
                 }
 
                 long crawledTime = System.currentTimeMillis() - startTime;
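The CrawlerRetreiver hunk above does two things: it keeps a reference to the fetched document, and it records the document's URL in the visited set so that a page reached again later, for instance through a redirect, is not fetched and written out a second time. Below is a minimal, self-contained sketch of that bookkeeping pattern, with java.net.URI standing in for the project's EdgeUrl type and the swallowed URISyntaxException mirroring the empty catch in the diff:

import java.net.URI;
import java.net.URISyntaxException;
import java.util.HashSet;
import java.util.Set;

// Sketch of the visited-set bookkeeping added in the hunk above.
// java.net.URI is a stand-in for the project's EdgeUrl type.
class VisitedTracker {
    private final Set<URI> visited = new HashSet<>();

    void markVisited(String url) {
        try {
            visited.add(new URI(url)); // parse/normalize before deduplicating
        }
        catch (URISyntaxException ex) {
            // Malformed URLs are simply not tracked, as in the diff.
        }
    }

    boolean alreadyVisited(String url) {
        try {
            return visited.contains(new URI(url));
        }
        catch (URISyntaxException ex) {
            return false;
        }
    }
}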
@@ -198,7 +198,7 @@ public class HttpFetcher {
 
     private CrawledDocument extractBody(EdgeUrl url, Response rsp) throws IOException, URISyntaxException {
 
         var responseUrl = new EdgeUrl(rsp.request().url().toString());
-        if (!responseUrl.equals(url)) {
+        if (!Objects.equals(responseUrl.domain, url.domain)) {
             return createRedirectResponse(url, rsp, responseUrl);
         }
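Before this hunk, any mismatch between the requested URL and the final response URL, including a same-domain redirect such as /foo to /foo/, made extractBody return a redirect record instead of the document body, so those documents were never indexed; this is the bug named in the commit message. With the change, only cross-domain redirects short-circuit, and Objects.equals keeps the comparison null-safe. A self-contained sketch of the new check, where the Url record is a hypothetical stand-in for EdgeUrl:

import java.util.Objects;

public class RedirectCheckSketch {
    // Hypothetical stand-in for the project's EdgeUrl type.
    record Url(String domain, String path) {}

    static boolean isCrossDomainRedirect(Url requested, Url response) {
        // Same-domain redirects (e.g. /foo -> /foo/) now fall through to
        // normal body extraction; only a domain change is treated as a redirect.
        // Objects.equals tolerates a null domain on either side.
        return !Objects.equals(requested.domain(), response.domain());
    }

    public static void main(String[] args) {
        var requested = new Url("example.com", "/foo");
        System.out.println(isCrossDomainRedirect(requested, new Url("example.com", "/foo/"))); // false
        System.out.println(isCrossDomainRedirect(requested, new Url("other.org", "/foo")));    // true
    }
}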
@@ -242,7 +242,7 @@ public class HttpFetcher {
                 .timestamp(LocalDateTime.now().toString())
                 .canonicalUrl(canonical)
                 .httpStatus(rsp.code())
-                .url(url.toString())
+                .url(responseUrl.toString())
                 .documentBody(strData)
                 .build();
     }
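With same-domain redirects now extracted rather than short-circuited, it matters which URL the document is stored under: after this hunk it is the URL that actually served the body, not the one originally requested. The rsp.request().url() call earlier in the diff suggests an OkHttp client, where a response's request is the final one after redirects have been followed. A sketch under that assumption, using a placeholder URL:

import okhttp3.OkHttpClient;
import okhttp3.Request;

public class FinalUrlSketch {
    public static void main(String[] args) throws Exception {
        var client = new OkHttpClient(); // follows redirects by default
        var request = new Request.Builder().url("https://example.com/foo").build();
        try (var rsp = client.newCall(request).execute()) {
            // rsp.request() is the request that actually produced the body,
            // i.e. the post-redirect URL, matching .url(responseUrl.toString()).
            System.out.println("requested: " + request.url());
            System.out.println("final:     " + rsp.request().url());
        }
    }
}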