From e968365858d31a72688150a53dbdc68d23a04fec Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Tue, 16 Jan 2024 12:42:51 +0100 Subject: [PATCH] (index) Use new DomainRankingSets to configure ranking algos in index svc --- .../java/nu/marginalia/index/IndexModule.java | 10 - .../index/config/RankingSettings.java | 26 --- .../index/config/RankingSettingsEntry.java | 11 - .../marginalia/index/svc/IndexOpsService.java | 3 + .../index/svc/IndexSearchSetsService.java | 203 ++++++++---------- .../index/svc/searchset/RankingSearchSet.java | 14 +- .../index/model/RankingSettingsTest.java | 61 ------ 7 files changed, 102 insertions(+), 226 deletions(-) delete mode 100644 code/services-core/index-service/src/main/java/nu/marginalia/index/config/RankingSettings.java delete mode 100644 code/services-core/index-service/src/main/java/nu/marginalia/index/config/RankingSettingsEntry.java delete mode 100644 code/services-core/index-service/src/test/java/nu/marginalia/index/model/RankingSettingsTest.java diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/IndexModule.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/IndexModule.java index d0b2dcf9..dd5d87b1 100644 --- a/code/services-core/index-service/src/main/java/nu/marginalia/index/IndexModule.java +++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/IndexModule.java @@ -6,14 +6,10 @@ import com.google.inject.Singleton; import com.google.inject.name.Named; import com.zaxxer.hikari.HikariDataSource; import nu.marginalia.linkdb.dlinks.DomainLinkDb; -import nu.marginalia.linkdb.dlinks.FileDomainLinkDb; import nu.marginalia.linkdb.dlinks.SelectingDomainLinkDb; -import nu.marginalia.linkdb.dlinks.SqlDomainLinkDb; import nu.marginalia.service.module.ServiceConfiguration; import nu.marginalia.storage.FileStorageService; import nu.marginalia.IndexLocations; -import nu.marginalia.index.config.RankingSettings; -import nu.marginalia.WmsaHome; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -30,12 +26,6 @@ public class IndexModule extends AbstractModule { public void configure() { } - @Provides - public RankingSettings rankingSettings() { - Path dir = WmsaHome.getHomePath().resolve("conf/ranking-settings.yaml"); - return RankingSettings.from(dir); - } - @Provides @Singleton public DomainLinkDb domainLinkDb ( diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/config/RankingSettings.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/config/RankingSettings.java deleted file mode 100644 index a755a480..00000000 --- a/code/services-core/index-service/src/main/java/nu/marginalia/index/config/RankingSettings.java +++ /dev/null @@ -1,26 +0,0 @@ -package nu.marginalia.index.config; - -import lombok.ToString; -import org.yaml.snakeyaml.Yaml; - -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; - -@ToString -public class RankingSettings { - public RankingSettingsEntry small; - public RankingSettingsEntry retro; - public RankingSettingsEntry standard; - public RankingSettingsEntry academia; - public RankingSettingsEntry ranking; - - public static RankingSettings from(Path dir) { - try { - return new Yaml().loadAs(Files.readString(dir), RankingSettings.class); - } - catch (IOException ex) { - throw new RuntimeException("Failed to load " + dir, ex); - } - } -} diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/config/RankingSettingsEntry.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/config/RankingSettingsEntry.java deleted file mode 100644 index 7723e3ff..00000000 --- a/code/services-core/index-service/src/main/java/nu/marginalia/index/config/RankingSettingsEntry.java +++ /dev/null @@ -1,11 +0,0 @@ -package nu.marginalia.index.config; - -import java.util.List; - -public class RankingSettingsEntry { - /** Bias the ranking toward these domains */ - public List domains; - - /** Number of domains to include in ranking */ - public int max; -} diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexOpsService.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexOpsService.java index 37d52f2e..4f1830c8 100644 --- a/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexOpsService.java +++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexOpsService.java @@ -43,13 +43,16 @@ public class IndexOpsService { if (!run(searchSetService::recalculateAll)) { Spark.halt(503, "Operations busy"); } + return "OK"; } public Object reindexEndpoint(Request request, Response response) throws Exception { + if (!run(index::switchIndex).isPresent()) { Spark.halt(503, "Operations busy"); } + return "OK"; } diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexSearchSetsService.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexSearchSetsService.java index d8f90d34..0d95a087 100644 --- a/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexSearchSetsService.java +++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexSearchSetsService.java @@ -4,10 +4,11 @@ import com.google.inject.Inject; import com.google.inject.Singleton; import gnu.trove.list.TIntList; import it.unimi.dsi.fastutil.ints.IntOpenHashSet; -import lombok.SneakyThrows; +import nu.marginalia.db.DomainRankingSetsService; import nu.marginalia.db.DomainTypes; import nu.marginalia.index.IndexServicesFactory; import nu.marginalia.index.searchset.SearchSet; +import nu.marginalia.ranking.RankingAlgorithm; import nu.marginalia.ranking.ReversePageRank; import nu.marginalia.ranking.StandardPageRank; import nu.marginalia.ranking.accumulator.RankingResultHashMapAccumulator; @@ -16,31 +17,32 @@ import nu.marginalia.ranking.data.RankingDomainFetcher; import nu.marginalia.ranking.data.RankingDomainFetcherForSimilarityData; import nu.marginalia.index.svc.searchset.RankingSearchSet; import nu.marginalia.index.svc.searchset.SearchSetAny; -import nu.marginalia.index.config.RankingSettings; import nu.marginalia.ranking.DomainRankings; -import nu.marginalia.index.client.model.query.SearchSetIdentifier; import nu.marginalia.index.db.DbUpdateRanks; +import nu.marginalia.service.control.ServiceEventLog; import nu.marginalia.service.control.ServiceHeartbeat; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.IOException; +import java.sql.SQLException; +import java.util.Objects; +import java.util.concurrent.ConcurrentHashMap; @Singleton public class IndexSearchSetsService { private final Logger logger = LoggerFactory.getLogger(getClass()); private final DomainTypes domainTypes; private final ServiceHeartbeat heartbeat; + private final IndexServicesFactory indexServicesFactory; + private final ServiceEventLog eventLog; + private final DomainRankingSetsService domainRankingSetsService; private final DbUpdateRanks dbUpdateRanks; private final RankingDomainFetcher similarityDomains; - private final RankingSettings rankingSettings; - + private final RankingDomainFetcher linksDomains; + private final ConcurrentHashMap rankingSets = new ConcurrentHashMap<>(); // Below are binary indices that are used to constrain a search - private volatile RankingSearchSet popularSet; - private volatile RankingSearchSet smallWebSet; - private volatile RankingSearchSet academiaSet; - private volatile RankingSearchSet blogsSet; private final SearchSet anySet = new SearchSetAny(); // The ranking value of the domains used in sorting the domains @@ -51,29 +53,36 @@ public class IndexSearchSetsService { ServiceHeartbeat heartbeat, RankingDomainFetcher rankingDomains, RankingDomainFetcherForSimilarityData similarityDomains, - RankingSettings rankingSettings, - IndexServicesFactory servicesFactory, + IndexServicesFactory indexServicesFactory, + ServiceEventLog eventLog, + DomainRankingSetsService domainRankingSetsService, DbUpdateRanks dbUpdateRanks) throws IOException { this.domainTypes = domainTypes; this.heartbeat = heartbeat; + this.indexServicesFactory = indexServicesFactory; + this.eventLog = eventLog; + this.domainRankingSetsService = domainRankingSetsService; this.dbUpdateRanks = dbUpdateRanks; if (similarityDomains.hasData()) { this.similarityDomains = similarityDomains; + this.linksDomains = rankingDomains; } else { // on test environments the cosine similarity graph may not be present logger.info("Domain similarity is not present, falling back on link graph"); this.similarityDomains = rankingDomains; + this.linksDomains = rankingDomains; } - this.rankingSettings = rankingSettings; - - smallWebSet = new RankingSearchSet(SearchSetIdentifier.SMALLWEB, servicesFactory.getSearchSetsBase().resolve("small-web.dat")); - academiaSet = new RankingSearchSet(SearchSetIdentifier.ACADEMIA, servicesFactory.getSearchSetsBase().resolve("academia.dat")); - popularSet = new RankingSearchSet(SearchSetIdentifier.POPULAR, servicesFactory.getSearchSetsBase().resolve("popular.dat")); - blogsSet = new RankingSearchSet(SearchSetIdentifier.BLOGS, servicesFactory.getSearchSetsBase().resolve("blogs.dat")); + for (var rankingSet : domainRankingSetsService.getAll()) { + rankingSets.put(rankingSet.name(), + new RankingSearchSet(rankingSet.name(), + rankingSet.fileName(indexServicesFactory.getSearchSetsBase()) + ) + ); + } } public DomainRankings getDomainRankings() { @@ -86,51 +95,79 @@ public class IndexSearchSetsService { return anySet; } - return switch (searchSetIdentifier) { - case "POPULAR" -> popularSet; - case "ACADEMIA" -> academiaSet; - case "SMALLWEB" -> smallWebSet; - case "BLOGS" -> blogsSet; - case "NONE", "" -> anySet; - default -> throw new IllegalArgumentException("Unknown search set"); - }; + if ("NONE".equals(searchSetIdentifier) || "".equals(searchSetIdentifier)) { + return anySet; + } + + return Objects.requireNonNull(rankingSets.get(searchSetIdentifier), "Unknown search set"); } - enum RepartitionSteps { - UPDATE_ACADEMIA, - UPDATE_POPULAR, - UPDATE_SMALL_WEB, - UPDATE_BLOGS, - UPDATE_RANKINGS, - FINISHED - } public void recalculateAll() { - try (var processHeartbeat = heartbeat.createServiceTaskHeartbeat(RepartitionSteps.class, "repartitionAll")) { - - processHeartbeat.progress(RepartitionSteps.UPDATE_ACADEMIA); - updateAcademiaDomainsSet(); - - processHeartbeat.progress(RepartitionSteps.UPDATE_POPULAR); - updatePopularDomainsSet(); - - processHeartbeat.progress(RepartitionSteps.UPDATE_SMALL_WEB); - updateSmallWebDomainsSet(); - - processHeartbeat.progress(RepartitionSteps.UPDATE_BLOGS); - updateBlogsSet(); - - processHeartbeat.progress(RepartitionSteps.UPDATE_RANKINGS); - updateDomainRankings(); - - processHeartbeat.progress(RepartitionSteps.FINISHED); + for (var rankingSet : domainRankingSetsService.getAll()) { + try { + if (DomainRankingSetsService.DomainSetAlgorithm.SPECIAL.equals(rankingSet.algorithm())) { + switch (rankingSet.name()) { + case "BLOGS" -> recalculateBlogsSet(rankingSet); + case "RANK" -> updateDomainRankings(rankingSet); + case "NONE" -> {} + } + } else { + recalculateNornal(rankingSet); + } + } + catch (Exception ex) { + logger.warn("Failed to recalculate ranking set {}", rankingSet.name(), ex); + } + eventLog.logEvent("RANKING-SET-RECALCULATED", rankingSet.name()); } } - private void updateDomainRankings() { - var entry = rankingSettings.ranking; + private void recalculateNornal(DomainRankingSetsService.DomainRankingSet rankingSet) { + String[] domains = rankingSet.domains(); - var spr = new StandardPageRank(similarityDomains, entry.domains.toArray(String[]::new)); - var ranks = spr.pageRankWithPeripheralNodes(entry.max, () -> new RankingResultHashMapAccumulator(100_000)); + RankingAlgorithm rankingAlgorithm = switch (rankingSet.algorithm()) { + case LINKS_PAGERANK -> new StandardPageRank(linksDomains, domains); + case LINKS_CHEIRANK -> new ReversePageRank(linksDomains, domains); + case ADJACENCY_PAGERANK -> new StandardPageRank(similarityDomains, domains); + case ADJACENCY_CHEIRANK -> new ReversePageRank(similarityDomains, domains); + default -> throw new IllegalStateException("Unexpected value: " + rankingSet.algorithm()); + }; + + var data = rankingAlgorithm.pageRankWithPeripheralNodes(rankingSet.depth(), RankingResultHashSetAccumulator::new); + + var set = new RankingSearchSet(rankingSet.name(), rankingSet.fileName(indexServicesFactory.getSearchSetsBase()), data); + rankingSets.put(rankingSet.name(), set); + + try { + set.write(); + } + catch (IOException ex) { + logger.warn("Failed to write search set", ex); + } + } + + + + private void recalculateBlogsSet(DomainRankingSetsService.DomainRankingSet rankingSet) throws SQLException, IOException { + TIntList knownDomains = domainTypes.getKnownDomainsByType(DomainTypes.Type.BLOG); + + if (knownDomains.isEmpty()) { + // FIXME: We don't want to reload the entire list every time, but we do want to do it sometimes. Actor maybe? + domainTypes.reloadDomainsList(DomainTypes.Type.BLOG); + knownDomains = domainTypes.getKnownDomainsByType(DomainTypes.Type.BLOG); + } + + synchronized (this) { + var blogSet = new RankingSearchSet(rankingSet.name(), rankingSet.fileName(indexServicesFactory.getSearchSetsBase()), new IntOpenHashSet(knownDomains.toArray())); + rankingSets.put(rankingSet.name(), blogSet); + blogSet.write(); + } + } + + private void updateDomainRankings(DomainRankingSetsService.DomainRankingSet rankingSet) { + + var spr = new StandardPageRank(similarityDomains, rankingSet.domains()); + var ranks = spr.pageRankWithPeripheralNodes(rankingSet.depth(), () -> new RankingResultHashMapAccumulator(rankingSet.depth())); synchronized (this) { domainRankings = new DomainRankings(ranks); @@ -141,60 +178,4 @@ public class IndexSearchSetsService { dbUpdateRanks.execute(ranks); } - @SneakyThrows - public void updatePopularDomainsSet() { - var entry = rankingSettings.retro; - - var spr = new StandardPageRank(similarityDomains, entry.domains.toArray(String[]::new)); - var data = spr.pageRankWithPeripheralNodes(entry.max, RankingResultHashSetAccumulator::new); - - synchronized (this) { - popularSet = new RankingSearchSet(SearchSetIdentifier.POPULAR, popularSet.source, data); - popularSet.write(); - } - } - - @SneakyThrows - public void updateSmallWebDomainsSet() { - var entry = rankingSettings.small; - - var rpr = new ReversePageRank(similarityDomains, entry.domains.toArray(String[]::new)); - rpr.setMaxKnownUrls(750); - var data = rpr.pageRankWithPeripheralNodes(entry.max, RankingResultHashSetAccumulator::new); - - synchronized (this) { - smallWebSet = new RankingSearchSet(SearchSetIdentifier.SMALLWEB, smallWebSet.source, data); - smallWebSet.write(); - } - } - - @SneakyThrows - public void updateBlogsSet() { - TIntList knownDomains = domainTypes.getKnownDomainsByType(DomainTypes.Type.BLOG); - - if (knownDomains.isEmpty()) { - // FIXME: We don't want to reload the entire list every time, but we do want to do it sometimes. Actor maybe? - domainTypes.reloadDomainsList(DomainTypes.Type.BLOG); - knownDomains = domainTypes.getKnownDomainsByType(DomainTypes.Type.BLOG); - } - - synchronized (this) { - blogsSet = new RankingSearchSet(SearchSetIdentifier.BLOGS, blogsSet.source, new IntOpenHashSet(knownDomains.toArray())); - blogsSet.write(); - } - } - - - @SneakyThrows - public void updateAcademiaDomainsSet() { - var entry = rankingSettings.academia; - - var spr = new StandardPageRank(similarityDomains, entry.domains.toArray(String[]::new)); - var data = spr.pageRankWithPeripheralNodes(entry.max, RankingResultHashSetAccumulator::new); - - synchronized (this) { - academiaSet = new RankingSearchSet(SearchSetIdentifier.ACADEMIA, academiaSet.source, data); - academiaSet.write(); - } - } } diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/searchset/RankingSearchSet.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/searchset/RankingSearchSet.java index d8dd9ca1..89018493 100644 --- a/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/searchset/RankingSearchSet.java +++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/searchset/RankingSearchSet.java @@ -24,17 +24,17 @@ public class RankingSearchSet implements SearchSet { private final Logger logger = LoggerFactory.getLogger(getClass()); private final IntOpenHashSet set; - public final SearchSetIdentifier identifier; + public final String name; public final Path source; - public RankingSearchSet(SearchSetIdentifier identifier, Path source, IntOpenHashSet set) { - this.identifier = identifier; + public RankingSearchSet(String name, Path source, IntOpenHashSet set) { + this.name = name; this.source = source; this.set = set; } - public RankingSearchSet(SearchSetIdentifier identifier, Path source) throws IOException { - this.identifier = identifier; + public RankingSearchSet(String name, Path source) throws IOException { + this.name = name; this.source = source; if (!Files.exists(source)) { @@ -45,7 +45,7 @@ public class RankingSearchSet implements SearchSet { } if (set.isEmpty()) { - logger.warn("Search set {} is empty", identifier); + logger.warn("Search set {} is empty", name); } } @@ -87,6 +87,6 @@ public class RankingSearchSet implements SearchSet { } public String toString() { - return identifier.toString(); + return name; } } diff --git a/code/services-core/index-service/src/test/java/nu/marginalia/index/model/RankingSettingsTest.java b/code/services-core/index-service/src/test/java/nu/marginalia/index/model/RankingSettingsTest.java deleted file mode 100644 index f49d13d8..00000000 --- a/code/services-core/index-service/src/test/java/nu/marginalia/index/model/RankingSettingsTest.java +++ /dev/null @@ -1,61 +0,0 @@ -package nu.marginalia.index.model; - -import nu.marginalia.index.config.RankingSettings; -import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; - -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; -import java.util.List; - -import static org.junit.jupiter.api.Assertions.assertEquals; - -class RankingSettingsTest { - - Path tempFile; - @BeforeEach - void setUp() throws IOException { - tempFile = Files.createTempFile(getClass().getSimpleName(), ".tmp"); - } - - @AfterEach - void tearDown() throws IOException { - Files.delete(tempFile); - } - - @Test - void testParseRankingSettings() throws IOException { - Files.writeString(tempFile, """ - retro: - max: 50 - domains: - - "www.rep.routledge.com" - - "www.personal.kent.edu" - small: - max: 10 - domains: - - "bikobatanari.art" - - "wiki.xxiivv.com" - academia: - max: 101 - domains: - - "%edu" - standard: - max: 23 - domains: - - "memex.marginalia.nu" - """); - - var settings = RankingSettings.from(tempFile); - assertEquals(List.of("www.rep.routledge.com","www.personal.kent.edu"), settings.retro.domains); - assertEquals(50, settings.retro.max); - assertEquals(List.of("bikobatanari.art","wiki.xxiivv.com"), settings.small.domains); - assertEquals(10, settings.small.max); - assertEquals(List.of("bikobatanari.art","wiki.xxiivv.com"), settings.small.domains); - assertEquals(List.of("%edu"), settings.academia.domains); - assertEquals(List.of("memex.marginalia.nu"), settings.standard.domains); - - } -} \ No newline at end of file