Cleaning the code a bit, fix URL loading bug with multiple fragments in URL
This commit is contained in:
parent
5dd61387bf
commit
3fd48e0e53
@ -3,7 +3,6 @@ package nu.marginalia.wmsa.edge.converting.processor;
|
||||
import com.google.common.base.Strings;
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.name.Named;
|
||||
import nu.marginalia.wmsa.edge.converting.model.ProcessedDocument;
|
||||
import nu.marginalia.wmsa.edge.converting.model.ProcessedDomain;
|
||||
import nu.marginalia.wmsa.edge.converting.processor.logic.CommonKeywordExtractor;
|
||||
import nu.marginalia.wmsa.edge.crawling.model.CrawledDocument;
|
||||
@ -110,22 +109,6 @@ public class DomainProcessor {
|
||||
|
||||
}
|
||||
|
||||
private double getAverageQuality(List<ProcessedDocument> documents) {
|
||||
int n = 0;
|
||||
double q = 0.;
|
||||
for (var doc : documents) {
|
||||
if (doc.quality().isPresent()) {
|
||||
n++;
|
||||
q += doc.quality().getAsDouble();
|
||||
}
|
||||
}
|
||||
|
||||
if (n > 0) {
|
||||
return q / n;
|
||||
}
|
||||
return -5.;
|
||||
}
|
||||
|
||||
private EdgeDomainIndexingState getState(String crawlerStatus) {
|
||||
return switch (CrawlerDomainStatus.valueOf(crawlerStatus)) {
|
||||
case OK -> EdgeDomainIndexingState.ACTIVE;
|
||||
|
@ -41,24 +41,35 @@ public class EdgeUrl implements WideHashable {
|
||||
|
||||
private static Pattern badCharPattern = Pattern.compile("[ \t\n\"<>\\[\\]()',|]");
|
||||
|
||||
/* Java's URI parser is a bit too strict in throwing exceptions when there's an error.
|
||||
|
||||
Here on the Internet, standards are like the picture on the box of the frozen pizza,
|
||||
and what you get is more like what's on the inside, we try to patch things instead,
|
||||
just give it a best-effort attempt at cleaning out broken or unnecessary constructions
|
||||
like bad or missing URLEncoding
|
||||
*/
|
||||
public static String urlencodeFixer(String url) throws URISyntaxException {
|
||||
var s = new StringBuilder();
|
||||
String goodChars = "&.?:/-;+$#";
|
||||
String hexChars = "0123456789abcdefABCDEF";
|
||||
|
||||
int pathIdx = findPathIdx(url);
|
||||
if (pathIdx < 0) {
|
||||
return url;
|
||||
if (pathIdx < 0) { // url looks like http://marginalia.nu
|
||||
return url + "/";
|
||||
}
|
||||
s.append(url, 0, pathIdx);
|
||||
|
||||
for (int i = pathIdx; i < url.length(); i++) {
|
||||
// We don't want the fragment, and multiple fragments break Java's URI parser for some reason
|
||||
int end = url.indexOf("#");
|
||||
if (end < 0) end = url.length();
|
||||
|
||||
for (int i = pathIdx; i < end; i++) {
|
||||
int c = url.charAt(i);
|
||||
|
||||
if (goodChars.indexOf(c) >= 0 || (c >= 'A' && c <='Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9')) {
|
||||
s.appendCodePoint(c);
|
||||
}
|
||||
else if (c == '%' && i+2<url.length()) {
|
||||
else if (c == '%' && i+2<end) {
|
||||
int cn = url.charAt(i+1);
|
||||
int cnn = url.charAt(i+2);
|
||||
if (hexChars.indexOf(cn) >= 0 && hexChars.indexOf(cnn) >= 0) {
|
||||
|
@ -27,6 +27,7 @@ class EdgeUrlTest {
|
||||
}
|
||||
@Test
|
||||
void urlencodeFixer() throws URISyntaxException {
|
||||
System.out.println(EdgeUrl.urlencodeFixer("https://www.example.com/#heredoc"));
|
||||
System.out.println(EdgeUrl.urlencodeFixer("https://www.example.com/%-sign"));
|
||||
System.out.println(EdgeUrl.urlencodeFixer("https://www.example.com/%22-sign"));
|
||||
System.out.println(EdgeUrl.urlencodeFixer("https://www.example.com/\n \"huh\""));
|
||||
|
Loading…
Reference in New Issue
Block a user