Cleaning the code a bit, fix URL loading bug with multiple fragments in URL

This commit is contained in:
vlofgren 2022-09-02 10:41:02 +02:00
parent 5dd61387bf
commit 3fd48e0e53
3 changed files with 16 additions and 21 deletions

View File

@ -3,7 +3,6 @@ package nu.marginalia.wmsa.edge.converting.processor;
import com.google.common.base.Strings;
import com.google.inject.Inject;
import com.google.inject.name.Named;
import nu.marginalia.wmsa.edge.converting.model.ProcessedDocument;
import nu.marginalia.wmsa.edge.converting.model.ProcessedDomain;
import nu.marginalia.wmsa.edge.converting.processor.logic.CommonKeywordExtractor;
import nu.marginalia.wmsa.edge.crawling.model.CrawledDocument;
@ -110,22 +109,6 @@ public class DomainProcessor {
}
/**
 * Computes the arithmetic mean of the quality values of the given documents,
 * counting only those documents whose quality is present.
 *
 * @param documents the documents to average over; documents without a quality value are skipped
 * @return the mean of the present quality values, or {@code -5.} if none of the documents has one
 */
private double getAverageQuality(List<ProcessedDocument> documents) {
    double total = 0.;
    int counted = 0;

    for (ProcessedDocument document : documents) {
        // Documents lacking a quality value contribute to neither the sum nor the count.
        if (!document.quality().isPresent()) {
            continue;
        }
        counted++;
        total += document.quality().getAsDouble();
    }

    // Nothing to average: fall back to the fixed default.
    if (counted <= 0) {
        return -5.;
    }
    return total / counted;
}
private EdgeDomainIndexingState getState(String crawlerStatus) {
return switch (CrawlerDomainStatus.valueOf(crawlerStatus)) {
case OK -> EdgeDomainIndexingState.ACTIVE;

View File

@ -41,24 +41,35 @@ public class EdgeUrl implements WideHashable {
/* Matches any single character from the set: space, tab, newline, double quote,
   angle brackets, square brackets, parentheses, single quote, comma or pipe.
   NOTE(review): the name suggests these are characters treated as invalid in a URL;
   the usage is not visible here -- confirm against the callers. */
private static Pattern badCharPattern = Pattern.compile("[ \t\n\"<>\\[\\]()',|]");
/* Java's URI parser is a bit too strict in throwing exceptions when there's an error.
Here on the Internet, standards are like the picture on the box of the frozen pizza,
and what you get is more like what's on the inside. We try to patch things instead:
just give it a best-effort attempt at cleaning out broken or unnecessary constructions
like bad or missing URL encoding.
*/
public static String urlencodeFixer(String url) throws URISyntaxException {
var s = new StringBuilder();
String goodChars = "&.?:/-;+$#";
String hexChars = "0123456789abcdefABCDEF";
int pathIdx = findPathIdx(url);
if (pathIdx < 0) {
return url;
if (pathIdx < 0) { // url looks like http://marginalia.nu
return url + "/";
}
s.append(url, 0, pathIdx);
for (int i = pathIdx; i < url.length(); i++) {
// We don't want the fragment, and multiple fragments break the Java URI parser for some reason
int end = url.indexOf("#");
if (end < 0) end = url.length();
for (int i = pathIdx; i < end; i++) {
int c = url.charAt(i);
if (goodChars.indexOf(c) >= 0 || (c >= 'A' && c <='Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9')) {
s.appendCodePoint(c);
}
else if (c == '%' && i+2<url.length()) {
else if (c == '%' && i+2<end) {
int cn = url.charAt(i+1);
int cnn = url.charAt(i+2);
if (hexChars.indexOf(cn) >= 0 && hexChars.indexOf(cnn) >= 0) {

View File

@ -27,6 +27,7 @@ class EdgeUrlTest {
}
@Test
void urlencodeFixer() throws URISyntaxException {
System.out.println(EdgeUrl.urlencodeFixer("https://www.example.com/#heredoc"));
System.out.println(EdgeUrl.urlencodeFixer("https://www.example.com/%-sign"));
System.out.println(EdgeUrl.urlencodeFixer("https://www.example.com/%22-sign"));
System.out.println(EdgeUrl.urlencodeFixer("https://www.example.com/\n \"huh\""));