From 0b112cb4d4f0fe621553ce3c986682e6d1dacff4 Mon Sep 17 00:00:00 2001
From: Viktor Lofgren
Date: Fri, 29 Dec 2023 19:41:37 +0100
Subject: [PATCH] (warc) Update URL encoding in WarcProtocolReconstructor

The URI query string is now URL encoded in the WarcProtocolReconstructor. This change ensures proper encoding of special characters as per the standard URL encoding rules and improves URL validity during the crawling process.
---
 .../crawl/retreival/fetcher/warc/WarcProtocolReconstructor.java | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcProtocolReconstructor.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcProtocolReconstructor.java
index ad29056f..40d98d73 100644
--- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcProtocolReconstructor.java
+++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcProtocolReconstructor.java
@@ -26,7 +26,7 @@ public class WarcProtocolReconstructor {
         requestStringBuilder.append(request.method()).append(" ").append(encodedURL);
 
         if (uri.getQuery() != null) {
-            requestStringBuilder.append("?").append(uri.getQuery());
+            requestStringBuilder.append("?").append(URLEncoder.encode(uri.getQuery(), StandardCharsets.UTF_8));
         }
         requestStringBuilder.append(" HTTP/1.1\r\n");
         requestStringBuilder.append("Host: ").append(uri.getHost()).append("\r\n");