Fix bug in redirect handling that caused the crawler to not index some documents.
This commit is contained in:
parent
ef97414edb
commit
ce9abc00dc
@ -17,6 +17,7 @@ import org.slf4j.LoggerFactory;
|
|||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.net.InetAddress;
|
import java.net.InetAddress;
|
||||||
|
import java.net.URISyntaxException;
|
||||||
import java.net.UnknownHostException;
|
import java.net.UnknownHostException;
|
||||||
import java.time.LocalDateTime;
|
import java.time.LocalDateTime;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
@ -163,7 +164,15 @@ public class CrawlerRetreiver {
|
|||||||
var doc = fetchUrl(top);
|
var doc = fetchUrl(top);
|
||||||
if (doc.isPresent()) {
|
if (doc.isPresent()) {
|
||||||
fetchedCount++;
|
fetchedCount++;
|
||||||
crawledDomainWriter.accept(doc.get());
|
|
||||||
|
var d = doc.get();
|
||||||
|
crawledDomainWriter.accept(d);
|
||||||
|
|
||||||
|
try {
|
||||||
|
visited.add(new EdgeUrl(d.url));
|
||||||
|
}
|
||||||
|
catch (URISyntaxException ex) {}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
long crawledTime = System.currentTimeMillis() - startTime;
|
long crawledTime = System.currentTimeMillis() - startTime;
|
||||||
|
@ -198,7 +198,7 @@ public class HttpFetcher {
|
|||||||
private CrawledDocument extractBody(EdgeUrl url, Response rsp) throws IOException, URISyntaxException {
|
private CrawledDocument extractBody(EdgeUrl url, Response rsp) throws IOException, URISyntaxException {
|
||||||
|
|
||||||
var responseUrl = new EdgeUrl(rsp.request().url().toString());
|
var responseUrl = new EdgeUrl(rsp.request().url().toString());
|
||||||
if (!responseUrl.equals(url)) {
|
if (!Objects.equals(responseUrl.domain, url.domain)) {
|
||||||
return createRedirectResponse(url, rsp, responseUrl);
|
return createRedirectResponse(url, rsp, responseUrl);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -242,7 +242,7 @@ public class HttpFetcher {
|
|||||||
.timestamp(LocalDateTime.now().toString())
|
.timestamp(LocalDateTime.now().toString())
|
||||||
.canonicalUrl(canonical)
|
.canonicalUrl(canonical)
|
||||||
.httpStatus(rsp.code())
|
.httpStatus(rsp.code())
|
||||||
.url(url.toString())
|
.url(responseUrl.toString())
|
||||||
.documentBody(strData)
|
.documentBody(strData)
|
||||||
.build();
|
.build();
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user