From 6aee27a3f17fffd4dc3790b87a09658a1f2f116e Mon Sep 17 00:00:00 2001
From: Viktor Lofgren
Date: Fri, 29 Dec 2023 16:36:01 +0100
Subject: [PATCH 1/6] (*) Fix bug in EdgeDomain where it would permit domains with a trailing period, DNS style.

---
 .../model/src/main/java/nu/marginalia/model/EdgeDomain.java | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/code/common/model/src/main/java/nu/marginalia/model/EdgeDomain.java b/code/common/model/src/main/java/nu/marginalia/model/EdgeDomain.java
index d60f3571..0686a9ce 100644
--- a/code/common/model/src/main/java/nu/marginalia/model/EdgeDomain.java
+++ b/code/common/model/src/main/java/nu/marginalia/model/EdgeDomain.java
@@ -23,6 +23,12 @@ public class EdgeDomain implements Serializable {
 
         host = host.toLowerCase();
 
+        // Remove trailing dots, which are allowed in DNS but not in URLs
+        // (though sometimes still show up in the wild)
+        while (!host.isBlank() && host.endsWith(".")) {
+            host = host.substring(0, host.length() - 1);
+        }
+
         var dot = host.lastIndexOf('.');
 
         if (dot < 0 || looksLikeAnIp(host)) { // IPV6 >.>
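The loop added above is self-contained, so its behavior is easy to check in isolation. A minimal standalone sketch (class and method names here are illustrative, not part of EdgeDomain's actual API):

    // Sketch of the trailing-dot normalization added in PATCH 1/6.
    // Hosts like "www.example.com." are valid in DNS but not as URL hosts.
    public class TrailingDotSketch {
        static String stripTrailingDots(String host) {
            // Same logic as the loop added to EdgeDomain
            while (!host.isBlank() && host.endsWith(".")) {
                host = host.substring(0, host.length() - 1);
            }
            return host;
        }

        public static void main(String[] args) {
            System.out.println(stripTrailingDots("www.example.com."));   // www.example.com
            System.out.println(stripTrailingDots("www.example.com.."));  // www.example.com
            System.out.println(stripTrailingDots("."));                  // empty string; falls through to the existing dot < 0 branch
        }
    }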
From f6fa8bd722a7c309290a1e5caa842ae650484ff0 Mon Sep 17 00:00:00 2001
From: Viktor Lofgren
Date: Fri, 29 Dec 2023 16:37:00 +0100
Subject: [PATCH 2/6] (search) Fetch fewer linking and similar domains.

Showing a total of 200 connected domains is not very informative.
---
 .../java/nu/marginalia/search/svc/SearchSiteInfoService.java | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/code/services-application/search-service/src/main/java/nu/marginalia/search/svc/SearchSiteInfoService.java b/code/services-application/search-service/src/main/java/nu/marginalia/search/svc/SearchSiteInfoService.java
index a6e9e381..28c5740d 100644
--- a/code/services-application/search-service/src/main/java/nu/marginalia/search/svc/SearchSiteInfoService.java
+++ b/code/services-application/search-service/src/main/java/nu/marginalia/search/svc/SearchSiteInfoService.java
@@ -145,10 +145,10 @@ public class SearchSiteInfoService {
         else {
             domainInfo = assistantClient.domainInformation(ctx, domainId).blockingFirst();
             similarSet = assistantClient
-                    .similarDomains(ctx, domainId, 100)
+                    .similarDomains(ctx, domainId, 25)
                     .blockingFirst();
             linkingDomains = assistantClient
-                    .linkedDomains(ctx, domainId, 100)
+                    .linkedDomains(ctx, domainId, 25)
                     .blockingFirst();
         }
 

From faa50bf5786f31df2fbbe083e842a61d4e36e325 Mon Sep 17 00:00:00 2001
From: Viktor Lofgren
Date: Mon, 1 Jan 2024 16:19:38 +0100
Subject: [PATCH 3/6] (sideload) Just index based on first paragraph

This seems like it would make the wikipedia search results worse, but it
drastically improves the result quality!

This is because wikipedia has a lot of articles that each talk about a
lot of irrelevant concepts, and indexing the entire document means
tangentially relevant results tend to displace the most relevant
results.
---
 .../encyclopedia/EncyclopediaMarginaliaNuSideloader.java | 1 +
 1 file changed, 1 insertion(+)

diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/encyclopedia/EncyclopediaMarginaliaNuSideloader.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/encyclopedia/EncyclopediaMarginaliaNuSideloader.java
index f0686b4c..961e1c79 100644
--- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/encyclopedia/EncyclopediaMarginaliaNuSideloader.java
+++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/encyclopedia/EncyclopediaMarginaliaNuSideloader.java
@@ -120,6 +120,7 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoCloseable {
             fullHtml.append("<p>");
             fullHtml.append(part);
             fullHtml.append("</p>");
+            break; // Only take the first part, this improves accuracy a lot
         }
 
         fullHtml.append("</body></html>");

From 9330b5b1d96962b234072a19d2d3db8ff8bb3d0e Mon Sep 17 00:00:00 2001
From: Viktor Lofgren
Date: Mon, 1 Jan 2024 17:16:29 +0100
Subject: [PATCH 4/6] (index) Adjust rank weightings to fix bad wikipedia results

There was a bug where, if the input of ResultValuator.normalize() was
negative, it was truncated to zero. This meant that "bad" results
always ranked the same.

The penalty factor "overallPart" was moved outside of the function and
re-weighted to produce a better normalization.

Some of the weights were also re-adjusted based on what appears to
produce better results. Needs evaluation.
---
 .../src/main/java/nu/marginalia/ranking/ResultValuator.java   | 3 ++-
 .../nu/marginalia/index/results/IndexResultDecorator.java     | 4 ++++
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/code/features-index/result-ranking/src/main/java/nu/marginalia/ranking/ResultValuator.java b/code/features-index/result-ranking/src/main/java/nu/marginalia/ranking/ResultValuator.java
index 30b647e9..390a02b8 100644
--- a/code/features-index/result-ranking/src/main/java/nu/marginalia/ranking/ResultValuator.java
+++ b/code/features-index/result-ranking/src/main/java/nu/marginalia/ranking/ResultValuator.java
@@ -108,7 +108,8 @@ public class ResultValuator {
             }
         }
 
-        return normalize(bestTcf + bestBM25F + bestBM25P + bestBM25PN * 0.25 + overallPart);
+
+        return normalize(2* bestTcf + bestBM25F + bestBM25P + bestBM25PN * 0.5) - overallPart / 4;
     }
 
     private double calculateQualityPenalty(int size, int quality, ResultRankingParameters rankingParams) {
diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/results/IndexResultDecorator.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/results/IndexResultDecorator.java
index cf352331..376972b8 100644
--- a/code/services-core/index-service/src/main/java/nu/marginalia/index/results/IndexResultDecorator.java
+++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/results/IndexResultDecorator.java
@@ -10,6 +10,8 @@ import nu.marginalia.index.client.model.results.SearchResultItem;
 import nu.marginalia.linkdb.LinkdbReader;
 import nu.marginalia.linkdb.model.LdbUrlDetail;
 import nu.marginalia.ranking.ResultValuator;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 import java.sql.SQLException;
 import java.util.ArrayList;
@@ -21,6 +23,8 @@ import java.util.Map;
 
 @Singleton
 public class IndexResultDecorator {
+
+    private static final Logger logger = LoggerFactory.getLogger(IndexResultDecorator.class);
 
     private final LinkdbReader linkdbReader;
     private final ResultValuator valuator;
From d2418521a7d0eec8b7fd35246980aa368ea13770 Mon Sep 17 00:00:00 2001
From: Viktor Lofgren
Date: Mon, 1 Jan 2024 18:43:17 +0100
Subject: [PATCH 5/6] (index) Further ranking adjustments

---
 .../nu/marginalia/ranking/ResultValuator.java    | 17 ++++++++++++-----
 .../marginalia/index/svc/IndexQueryService.java  |  2 --
 2 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/code/features-index/result-ranking/src/main/java/nu/marginalia/ranking/ResultValuator.java b/code/features-index/result-ranking/src/main/java/nu/marginalia/ranking/ResultValuator.java
index 390a02b8..2a856258 100644
--- a/code/features-index/result-ranking/src/main/java/nu/marginalia/ranking/ResultValuator.java
+++ b/code/features-index/result-ranking/src/main/java/nu/marginalia/ranking/ResultValuator.java
@@ -109,7 +109,12 @@ public class ResultValuator {
         }
 
-
-        return normalize(2* bestTcf + bestBM25F + bestBM25P + bestBM25PN * 0.5) - overallPart / 4;
+        double overallPartPositive = Math.max(0, overallPart);
+        double overallPartNegative = -Math.min(0, overallPart);
+
+        // Renormalize to 0...15, where 0 is the best possible score;
+        // this is a historical artifact of the original ranking function
+        return normalize(1.5 * bestTcf + bestBM25F + bestBM25P + 0.25 * bestBM25PN + overallPartPositive, overallPartNegative);
     }
 
     private double calculateQualityPenalty(int size, int quality, ResultRankingParameters rankingParams) {
@@ -132,11 +137,13 @@ public class ResultValuator {
         double penalty = 0;
 
         boolean isForum = DocumentFlags.GeneratorForum.isPresent(docFlags);
+        boolean isWiki = DocumentFlags.GeneratorWiki.isPresent(docFlags);
+        boolean isDocs = DocumentFlags.GeneratorDocs.isPresent(docFlags);
 
         // Penalize large sites harder for any bullshit as it's a strong signal of a low quality site
         double largeSiteFactor = 1.;
 
-        if (!isForum && size > 400) {
+        if (!isForum && !isWiki && !isDocs && size > 400) {
             // Long urls-that-look-like-this tend to be poor search results
             if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.KEBAB_CASE_URL.getFeatureBit()))
                 penalty += 30.0;
@@ -156,7 +163,7 @@ public class ResultValuator {
         if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.TRACKING.getFeatureBit()))
             penalty += 2.5 * largeSiteFactor;
 
-        if (isForum) {
+        if (isForum || isWiki) {
             penalty = Math.min(0, penalty - 2);
         }
 
@@ -210,11 +217,11 @@ public class ResultValuator {
         return 1 + maxSet;
     }
 
-    public static double normalize(double value) {
+    public static double normalize(double value, double penalty) {
         if (value < 0)
             value = 0;
 
-        return Math.sqrt((1.0 + scalingFactor) / (1.0 + value));
+        return Math.sqrt((1.0 + scalingFactor) / (1.0 + value)) + Math.sqrt(penalty);
     }
 
 }
diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexQueryService.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexQueryService.java
index a912beee..476ea991 100644
--- a/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexQueryService.java
+++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexQueryService.java
@@ -265,8 +265,6 @@ public class IndexQueryService extends IndexApiImplBase {
         return new SearchResultSet(resultDecorator.decorateAndRerank(bestResults, rankingContext));
     }
 
-    /* This is used in result ranking, and is also routed back up the search service in order to recalculate BM-25
-     * accurately */
    private ResultRankingContext createRankingContext(ResultRankingParameters rankingParams, List subqueries) {
         final var termToId = searchTermsSvc.getAllIncludeTerms(subqueries);
         final Map termFrequencies = new HashMap<>(termToId.size());
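The truncation bug described in PATCH 4/6 is easiest to see numerically: once the argument to normalize() goes negative, every such result clamps to the same score, however bad it is. Below is a minimal standalone sketch of the old one-argument form and the two-argument form introduced in PATCH 5/6; the scalingFactor value is assumed for illustration only, the real constant lives in ResultValuator.

    public class NormalizeSketch {
        static final double scalingFactor = 250.;  // assumed value, for illustration only

        // Old form: penalties were mixed into the argument; negative sums clamp to zero,
        // so all heavily penalized results tie at the same (worst) score
        static double normalizeOld(double value) {
            if (value < 0)
                value = 0;
            return Math.sqrt((1.0 + scalingFactor) / (1.0 + value));
        }

        // New form (PATCH 5/6): the non-negative penalty is passed separately and added
        // after normalization, so a worse penalty keeps producing a worse score
        static double normalizeNew(double value, double penalty) {
            if (value < 0)
                value = 0;
            return Math.sqrt((1.0 + scalingFactor) / (1.0 + value)) + Math.sqrt(penalty);
        }

        public static void main(String[] args) {
            System.out.println(normalizeOld(-5));     // identical to...
            System.out.println(normalizeOld(-50));    // ...this: both clamp to zero
            System.out.println(normalizeNew(0, 5));   // distinguishable again
            System.out.println(normalizeNew(0, 50));  // larger penalty, larger (worse) score

            // The overallPart split in PATCH 5/6 just separates the signed value into
            // a non-negative bonus and a non-negative penalty:
            double overallPart = -3.25;
            System.out.println(Math.max(0, overallPart));   // 0.0  -> added to the relevance sum
            System.out.println(-Math.min(0, overallPart));  // 3.25 -> passed to normalize() as the penalty
        }
    }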
From 9f7df59945ea63d2844609d5d1b676d24f5bbb80 Mon Sep 17 00:00:00 2001
From: Viktor Lofgren
Date: Tue, 2 Jan 2024 12:34:58 +0100
Subject: [PATCH 6/6] (sideload) Reduce quality assessment.

This will make these sideloaded results rank much better, as there is a
pretty harsh penalty for large low-quality websites.
---
 .../nu/marginalia/converting/sideload/SideloaderProcessing.java | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloaderProcessing.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloaderProcessing.java
index 3e871f9a..14b35b6a 100644
--- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloaderProcessing.java
+++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloaderProcessing.java
@@ -80,7 +80,7 @@ public class SideloaderProcessing {
         ret.details.pubYear = LocalDateTime.now().getYear();
         ret.details.features.add(HtmlFeature.JS);
         ret.details.features.add(HtmlFeature.TRACKING);
-        ret.details.quality = -10;
+        ret.details.quality = -4.5;
         ret.details.generator = type;
         ret.details.metadata = new DocumentMetadata(3,
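Taken together, PATCH 3/6 and PATCH 6/6 change what a sideloaded encyclopedia article looks like to the index: a one-paragraph synthetic document carrying a milder quality penalty. A rough standalone sketch of that shape, with purely illustrative field and variable names rather than the sideloader's actual API:

    import java.util.List;

    public class SideloadShapeSketch {
        public static void main(String[] args) {
            List<String> parts = List.of(
                    "Stockholm is the capital and largest city of Sweden ...",
                    "History ...", "Climate ...", "Sports ...");

            StringBuilder fullHtml = new StringBuilder("<html><body>");
            for (String part : parts) {
                fullHtml.append("<p>").append(part).append("</p>");
                break; // only the first paragraph is kept (PATCH 3/6)
            }
            fullHtml.append("</body></html>");

            double quality = -4.5; // softened from -10 (PATCH 6/6) so these documents can actually rank
            System.out.println(fullHtml + "  quality=" + quality);
        }
    }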