(index) Use new DomainRankingSets to configure ranking algos in index svc

This commit is contained in:
Viktor Lofgren 2024-01-16 12:42:51 +01:00
parent 36ad4c7466
commit e968365858
7 changed files with 102 additions and 226 deletions

View File

@ -6,14 +6,10 @@ import com.google.inject.Singleton;
import com.google.inject.name.Named;
import com.zaxxer.hikari.HikariDataSource;
import nu.marginalia.linkdb.dlinks.DomainLinkDb;
import nu.marginalia.linkdb.dlinks.FileDomainLinkDb;
import nu.marginalia.linkdb.dlinks.SelectingDomainLinkDb;
import nu.marginalia.linkdb.dlinks.SqlDomainLinkDb;
import nu.marginalia.service.module.ServiceConfiguration;
import nu.marginalia.storage.FileStorageService;
import nu.marginalia.IndexLocations;
import nu.marginalia.index.config.RankingSettings;
import nu.marginalia.WmsaHome;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -30,12 +26,6 @@ public class IndexModule extends AbstractModule {
/** Empty: this method registers no bindings itself. */
public void configure() {
}
/**
 * Provides the ranking settings read from {@code conf/ranking-settings.yaml},
 * resolved against the path returned by {@code WmsaHome.getHomePath()}.
 *
 * @return the settings produced by {@code RankingSettings.from}
 */
@Provides
public RankingSettings rankingSettings() {
    final Path settingsFile = WmsaHome.getHomePath().resolve("conf/ranking-settings.yaml");

    return RankingSettings.from(settingsFile);
}
@Provides
@Singleton
public DomainLinkDb domainLinkDb (

View File

@ -1,26 +0,0 @@
package nu.marginalia.index.config;
import lombok.ToString;
import org.yaml.snakeyaml.Yaml;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
/**
 * Ranking configuration as loaded from a YAML document.  The document is
 * mapped onto the public fields of this class by SnakeYAML.
 */
@ToString
public class RankingSettings {
    public RankingSettingsEntry small;
    public RankingSettingsEntry retro;
    public RankingSettingsEntry standard;
    public RankingSettingsEntry academia;
    public RankingSettingsEntry ranking;

    /**
     * Reads the YAML file at the given path and maps it onto a settings object.
     *
     * @param dir path of the YAML file to read (a file, despite the parameter name)
     * @return the object SnakeYAML produced for the file's contents
     * @throws RuntimeException wrapping the {@link IOException} when the file cannot be read
     */
    public static RankingSettings from(Path dir) {
        final Yaml parser = new Yaml();

        final String yamlText;
        try {
            yamlText = Files.readString(dir);
        }
        catch (IOException readFailure) {
            throw new RuntimeException("Failed to load " + dir, readFailure);
        }

        return parser.loadAs(yamlText, RankingSettings.class);
    }
}

View File

@ -1,11 +0,0 @@
package nu.marginalia.index.config;
import java.util.List;
/** A single section of the ranking settings: a list of domains and a size limit. */
public class RankingSettingsEntry {

    /** Domains that the ranking is biased toward. */
    public List<String> domains;

    /** How many domains the ranking should include. */
    public int max;
}

View File

@ -43,13 +43,16 @@ public class IndexOpsService {
if (!run(searchSetService::recalculateAll)) {
Spark.halt(503, "Operations busy");
}
return "OK";
}
/**
 * Request handler that submits {@code index::switchIndex} through {@code run}.
 * When the value returned by {@code run} is not present, the request is halted
 * with status 503 and the message "Operations busy".
 *
 * @param request  the incoming request (not inspected)
 * @param response the outgoing response (not modified here)
 * @return the string "OK" when the request was not halted
 */
public Object reindexEndpoint(Request request, Response response) throws Exception {
    final boolean accepted = run(index::switchIndex).isPresent();

    if (!accepted) {
        Spark.halt(503, "Operations busy");
    }

    return "OK";
}

View File

@ -4,10 +4,11 @@ import com.google.inject.Inject;
import com.google.inject.Singleton;
import gnu.trove.list.TIntList;
import it.unimi.dsi.fastutil.ints.IntOpenHashSet;
import lombok.SneakyThrows;
import nu.marginalia.db.DomainRankingSetsService;
import nu.marginalia.db.DomainTypes;
import nu.marginalia.index.IndexServicesFactory;
import nu.marginalia.index.searchset.SearchSet;
import nu.marginalia.ranking.RankingAlgorithm;
import nu.marginalia.ranking.ReversePageRank;
import nu.marginalia.ranking.StandardPageRank;
import nu.marginalia.ranking.accumulator.RankingResultHashMapAccumulator;
@ -16,31 +17,32 @@ import nu.marginalia.ranking.data.RankingDomainFetcher;
import nu.marginalia.ranking.data.RankingDomainFetcherForSimilarityData;
import nu.marginalia.index.svc.searchset.RankingSearchSet;
import nu.marginalia.index.svc.searchset.SearchSetAny;
import nu.marginalia.index.config.RankingSettings;
import nu.marginalia.ranking.DomainRankings;
import nu.marginalia.index.client.model.query.SearchSetIdentifier;
import nu.marginalia.index.db.DbUpdateRanks;
import nu.marginalia.service.control.ServiceEventLog;
import nu.marginalia.service.control.ServiceHeartbeat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.sql.SQLException;
import java.util.Objects;
import java.util.concurrent.ConcurrentHashMap;
@Singleton
public class IndexSearchSetsService {
private final Logger logger = LoggerFactory.getLogger(getClass());
private final DomainTypes domainTypes;
private final ServiceHeartbeat heartbeat;
private final IndexServicesFactory indexServicesFactory;
private final ServiceEventLog eventLog;
private final DomainRankingSetsService domainRankingSetsService;
private final DbUpdateRanks dbUpdateRanks;
private final RankingDomainFetcher similarityDomains;
private final RankingSettings rankingSettings;
private final RankingDomainFetcher linksDomains;
private final ConcurrentHashMap<String, SearchSet> rankingSets = new ConcurrentHashMap<>();
// Below are binary indices that are used to constrain a search
private volatile RankingSearchSet popularSet;
private volatile RankingSearchSet smallWebSet;
private volatile RankingSearchSet academiaSet;
private volatile RankingSearchSet blogsSet;
private final SearchSet anySet = new SearchSetAny();
// The ranking value of the domains used in sorting the domains
@ -51,29 +53,36 @@ public class IndexSearchSetsService {
ServiceHeartbeat heartbeat,
RankingDomainFetcher rankingDomains,
RankingDomainFetcherForSimilarityData similarityDomains,
RankingSettings rankingSettings,
IndexServicesFactory servicesFactory,
IndexServicesFactory indexServicesFactory,
ServiceEventLog eventLog,
DomainRankingSetsService domainRankingSetsService,
DbUpdateRanks dbUpdateRanks) throws IOException {
this.domainTypes = domainTypes;
this.heartbeat = heartbeat;
this.indexServicesFactory = indexServicesFactory;
this.eventLog = eventLog;
this.domainRankingSetsService = domainRankingSetsService;
this.dbUpdateRanks = dbUpdateRanks;
if (similarityDomains.hasData()) {
this.similarityDomains = similarityDomains;
this.linksDomains = rankingDomains;
}
else {
// on test environments the cosine similarity graph may not be present
logger.info("Domain similarity is not present, falling back on link graph");
this.similarityDomains = rankingDomains;
this.linksDomains = rankingDomains;
}
this.rankingSettings = rankingSettings;
smallWebSet = new RankingSearchSet(SearchSetIdentifier.SMALLWEB, servicesFactory.getSearchSetsBase().resolve("small-web.dat"));
academiaSet = new RankingSearchSet(SearchSetIdentifier.ACADEMIA, servicesFactory.getSearchSetsBase().resolve("academia.dat"));
popularSet = new RankingSearchSet(SearchSetIdentifier.POPULAR, servicesFactory.getSearchSetsBase().resolve("popular.dat"));
blogsSet = new RankingSearchSet(SearchSetIdentifier.BLOGS, servicesFactory.getSearchSetsBase().resolve("blogs.dat"));
for (var rankingSet : domainRankingSetsService.getAll()) {
rankingSets.put(rankingSet.name(),
new RankingSearchSet(rankingSet.name(),
rankingSet.fileName(indexServicesFactory.getSearchSetsBase())
)
);
}
}
public DomainRankings getDomainRankings() {
@ -86,51 +95,79 @@ public class IndexSearchSetsService {
return anySet;
}
return switch (searchSetIdentifier) {
case "POPULAR" -> popularSet;
case "ACADEMIA" -> academiaSet;
case "SMALLWEB" -> smallWebSet;
case "BLOGS" -> blogsSet;
case "NONE", "" -> anySet;
default -> throw new IllegalArgumentException("Unknown search set");
};
if ("NONE".equals(searchSetIdentifier) || "".equals(searchSetIdentifier)) {
return anySet;
}
return Objects.requireNonNull(rankingSets.get(searchSetIdentifier), "Unknown search set");
}
/**
 * Progress steps for the "repartitionAll" task heartbeat; the enum class is
 * handed to the heartbeat when the task is created.
 */
enum RepartitionSteps {
UPDATE_ACADEMIA,
UPDATE_POPULAR,
UPDATE_SMALL_WEB,
UPDATE_BLOGS,
UPDATE_RANKINGS,
FINISHED
}
/**
 * Recalculates the search sets and the domain rankings.
 * <p>
 * Work is tracked by a service task heartbeat named "repartitionAll".  Every
 * ranking set returned by {@code domainRankingSetsService.getAll()} is then
 * recalculated in turn; a failure in one set is logged and the loop moves on
 * to the next set.
 */
public void recalculateAll() {
try (var processHeartbeat = heartbeat.createServiceTaskHeartbeat(RepartitionSteps.class, "repartitionAll")) {
// Each step is reported to the heartbeat before the matching update runs
processHeartbeat.progress(RepartitionSteps.UPDATE_ACADEMIA);
updateAcademiaDomainsSet();
processHeartbeat.progress(RepartitionSteps.UPDATE_POPULAR);
updatePopularDomainsSet();
processHeartbeat.progress(RepartitionSteps.UPDATE_SMALL_WEB);
updateSmallWebDomainsSet();
processHeartbeat.progress(RepartitionSteps.UPDATE_BLOGS);
updateBlogsSet();
processHeartbeat.progress(RepartitionSteps.UPDATE_RANKINGS);
updateDomainRankings();
processHeartbeat.progress(RepartitionSteps.FINISHED);
for (var rankingSet : domainRankingSetsService.getAll()) {
try {
if (DomainRankingSetsService.DomainSetAlgorithm.SPECIAL.equals(rankingSet.algorithm())) {
// SPECIAL sets are dispatched on their name; any name other than the
// three listed below matches no case and nothing is recalculated
switch (rankingSet.name()) {
case "BLOGS" -> recalculateBlogsSet(rankingSet);
case "RANK" -> updateDomainRankings(rankingSet);
case "NONE" -> {}
}
} else {
recalculateNornal(rankingSet);
}
}
catch (Exception ex) {
// Log and continue so one failing set does not abort the remaining ones
logger.warn("Failed to recalculate ranking set {}", rankingSet.name(), ex);
}
// NOTE(review): this event is emitted even when the recalculation above
// threw and was only logged -- confirm that is intended
eventLog.logEvent("RANKING-SET-RECALCULATED", rankingSet.name());
}
}
private void updateDomainRankings() {
var entry = rankingSettings.ranking;
/**
 * Recalculates a single ranking set whose algorithm is one of the four
 * PageRank/CheiRank variants handled below, registers the resulting
 * {@code RankingSearchSet} under the set's name and writes it to its file.
 *
 * @param rankingSet the ranking set to recalculate; its name, domains,
 *                   algorithm, depth and file name are read here
 * @throws IllegalStateException if the set's algorithm is not one of the
 *                               four cases of the switch
 */
private void recalculateNornal(DomainRankingSetsService.DomainRankingSet rankingSet) {
String[] domains = rankingSet.domains();
var spr = new StandardPageRank(similarityDomains, entry.domains.toArray(String[]::new));
var ranks = spr.pageRankWithPeripheralNodes(entry.max, () -> new RankingResultHashMapAccumulator(100_000));
// LINKS_* variants rank over linksDomains, ADJACENCY_* variants over similarityDomains;
// *_PAGERANK uses StandardPageRank and *_CHEIRANK uses ReversePageRank
RankingAlgorithm rankingAlgorithm = switch (rankingSet.algorithm()) {
case LINKS_PAGERANK -> new StandardPageRank(linksDomains, domains);
case LINKS_CHEIRANK -> new ReversePageRank(linksDomains, domains);
case ADJACENCY_PAGERANK -> new StandardPageRank(similarityDomains, domains);
case ADJACENCY_CHEIRANK -> new ReversePageRank(similarityDomains, domains);
default -> throw new IllegalStateException("Unexpected value: " + rankingSet.algorithm());
};
var data = rankingAlgorithm.pageRankWithPeripheralNodes(rankingSet.depth(), RankingResultHashSetAccumulator::new);
var set = new RankingSearchSet(rankingSet.name(), rankingSet.fileName(indexServicesFactory.getSearchSetsBase()), data);
// The new set replaces the map entry before it is written to disk
rankingSets.put(rankingSet.name(), set);
try {
set.write();
}
catch (IOException ex) {
// A failed write is only logged; the in-memory set registered above stays in place
logger.warn("Failed to write search set", ex);
}
}
/**
 * Rebuilds the given ranking set from the domains that {@code domainTypes}
 * knows as {@code DomainTypes.Type.BLOG}, registers it under the set's name
 * and writes it to its file.
 * <p>
 * When the known-domains list is empty, the list is reloaded once and
 * fetched again before the set is built.
 *
 * @param rankingSet the ranking set to rebuild; its name and file name are read here
 * @throws SQLException declared by this method; the visible code does not show which call raises it
 * @throws IOException  declared by this method; presumably raised when writing the set -- TODO confirm
 */
private void recalculateBlogsSet(DomainRankingSetsService.DomainRankingSet rankingSet) throws SQLException, IOException {
    TIntList blogDomains = domainTypes.getKnownDomainsByType(DomainTypes.Type.BLOG);

    final boolean nothingLoaded = blogDomains.isEmpty();
    if (nothingLoaded) {
        // FIXME: We don't want to reload the entire list every time, but we do want to do it sometimes. Actor maybe?
        domainTypes.reloadDomainsList(DomainTypes.Type.BLOG);
        blogDomains = domainTypes.getKnownDomainsByType(DomainTypes.Type.BLOG);
    }

    synchronized (this) {
        final var setName = rankingSet.name();
        final var targetFile = rankingSet.fileName(indexServicesFactory.getSearchSetsBase());
        final var members = new IntOpenHashSet(blogDomains.toArray());

        final var rebuiltSet = new RankingSearchSet(setName, targetFile, members);

        rankingSets.put(rankingSet.name(), rebuiltSet);
        rebuiltSet.write();
    }
}
private void updateDomainRankings(DomainRankingSetsService.DomainRankingSet rankingSet) {
var spr = new StandardPageRank(similarityDomains, rankingSet.domains());
var ranks = spr.pageRankWithPeripheralNodes(rankingSet.depth(), () -> new RankingResultHashMapAccumulator(rankingSet.depth()));
synchronized (this) {
domainRankings = new DomainRankings(ranks);
@ -141,60 +178,4 @@ public class IndexSearchSetsService {
dbUpdateRanks.execute(ranks);
}
/**
 * Recomputes the POPULAR search set with a {@code StandardPageRank} over the
 * similarity domains, using the {@code retro} entry of the ranking settings
 * for its domains and size limit, then swaps the set in and writes it out.
 */
@SneakyThrows
public void updatePopularDomainsSet() {
    final var settings = rankingSettings.retro;

    final var seedDomains = settings.domains.toArray(String[]::new);
    final var pageRank = new StandardPageRank(similarityDomains, seedDomains);
    final var rankedDomains = pageRank.pageRankWithPeripheralNodes(settings.max, RankingResultHashSetAccumulator::new);

    synchronized (this) {
        // The replacement keeps the source path of the set it replaces
        final var replacement = new RankingSearchSet(SearchSetIdentifier.POPULAR, popularSet.source, rankedDomains);
        popularSet = replacement;
        popularSet.write();
    }
}
/**
 * Recomputes the SMALLWEB search set with a {@code ReversePageRank} over the
 * similarity domains, using the {@code small} entry of the ranking settings
 * for its domains and size limit, then swaps the set in and writes it out.
 */
@SneakyThrows
public void updateSmallWebDomainsSet() {
    final var settings = rankingSettings.small;

    final var seedDomains = settings.domains.toArray(String[]::new);
    final var reverseRank = new ReversePageRank(similarityDomains, seedDomains);
    reverseRank.setMaxKnownUrls(750);

    final var rankedDomains = reverseRank.pageRankWithPeripheralNodes(settings.max, RankingResultHashSetAccumulator::new);

    synchronized (this) {
        // The replacement keeps the source path of the set it replaces
        final var replacement = new RankingSearchSet(SearchSetIdentifier.SMALLWEB, smallWebSet.source, rankedDomains);
        smallWebSet = replacement;
        smallWebSet.write();
    }
}
/**
 * Rebuilds the BLOGS search set from the domains that {@code domainTypes}
 * knows as {@code DomainTypes.Type.BLOG}, then swaps the set in and writes
 * it out.  An empty known-domains list triggers one reload before the set
 * is built.
 */
@SneakyThrows
public void updateBlogsSet() {
    TIntList blogDomains = domainTypes.getKnownDomainsByType(DomainTypes.Type.BLOG);

    final boolean nothingLoaded = blogDomains.isEmpty();
    if (nothingLoaded) {
        // FIXME: We don't want to reload the entire list every time, but we do want to do it sometimes. Actor maybe?
        domainTypes.reloadDomainsList(DomainTypes.Type.BLOG);
        blogDomains = domainTypes.getKnownDomainsByType(DomainTypes.Type.BLOG);
    }

    synchronized (this) {
        // The replacement keeps the source path of the set it replaces
        final var sourcePath = blogsSet.source;
        final var members = new IntOpenHashSet(blogDomains.toArray());

        blogsSet = new RankingSearchSet(SearchSetIdentifier.BLOGS, sourcePath, members);
        blogsSet.write();
    }
}
/**
 * Recomputes the ACADEMIA search set with a {@code StandardPageRank} over the
 * similarity domains, using the {@code academia} entry of the ranking settings
 * for its domains and size limit, then swaps the set in and writes it out.
 */
@SneakyThrows
public void updateAcademiaDomainsSet() {
    final var settings = rankingSettings.academia;

    final var seedDomains = settings.domains.toArray(String[]::new);
    final var pageRank = new StandardPageRank(similarityDomains, seedDomains);
    final var rankedDomains = pageRank.pageRankWithPeripheralNodes(settings.max, RankingResultHashSetAccumulator::new);

    synchronized (this) {
        // The replacement keeps the source path of the set it replaces
        final var replacement = new RankingSearchSet(SearchSetIdentifier.ACADEMIA, academiaSet.source, rankedDomains);
        academiaSet = replacement;
        academiaSet.write();
    }
}
}

View File

@ -24,17 +24,17 @@ public class RankingSearchSet implements SearchSet {
private final Logger logger = LoggerFactory.getLogger(getClass());
private final IntOpenHashSet set;
public final SearchSetIdentifier identifier;
public final String name;
public final Path source;
/**
 * Creates a search set from an already populated set of ints.  The arguments
 * are stored as given; nothing is read from {@code source} here.
 *
 * @param source path stored as this set's source
 * @param set    the members of the search set
 */
public RankingSearchSet(SearchSetIdentifier identifier, Path source, IntOpenHashSet set) {
this.identifier = identifier;
public RankingSearchSet(String name, Path source, IntOpenHashSet set) {
this.name = name;
this.source = source;
this.set = set;
}
public RankingSearchSet(SearchSetIdentifier identifier, Path source) throws IOException {
this.identifier = identifier;
public RankingSearchSet(String name, Path source) throws IOException {
this.name = name;
this.source = source;
if (!Files.exists(source)) {
@ -45,7 +45,7 @@ public class RankingSearchSet implements SearchSet {
}
if (set.isEmpty()) {
logger.warn("Search set {} is empty", identifier);
logger.warn("Search set {} is empty", name);
}
}
@ -87,6 +87,6 @@ public class RankingSearchSet implements SearchSet {
}
/** Returns the label this search set was created with. */
public String toString() {
return identifier.toString();
return name;
}
}

View File

@ -1,61 +0,0 @@
package nu.marginalia.index.model;
import nu.marginalia.index.config.RankingSettings;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.List;
import static org.junit.jupiter.api.Assertions.assertEquals;
/**
 * Verifies that {@link RankingSettings#from(Path)} maps a YAML document onto
 * the domain list and size limit of each settings section.
 */
class RankingSettingsTest {
    Path tempFile;

    /** Creates a fresh temporary file that the test writes its YAML document to. */
    @BeforeEach
    void setUp() throws IOException {
        tempFile = Files.createTempFile(getClass().getSimpleName(), ".tmp");
    }

    /** Removes the temporary file created in {@link #setUp()}. */
    @AfterEach
    void tearDown() throws IOException {
        Files.delete(tempFile);
    }

    /**
     * Writes a document with four sections and checks both the domains and
     * the limit of every section.
     */
    @Test
    void testParseRankingSettings() throws IOException {
        // Each section's max and domains must be nested under the section key,
        // otherwise they are not parsed as properties of that section
        Files.writeString(tempFile, """
                retro:
                  max: 50
                  domains:
                    - "www.rep.routledge.com"
                    - "www.personal.kent.edu"
                small:
                  max: 10
                  domains:
                    - "bikobatanari.art"
                    - "wiki.xxiivv.com"
                academia:
                  max: 101
                  domains:
                    - "%edu"
                standard:
                  max: 23
                  domains:
                    - "memex.marginalia.nu"
                """);

        var settings = RankingSettings.from(tempFile);

        assertEquals(List.of("www.rep.routledge.com","www.personal.kent.edu"), settings.retro.domains);
        assertEquals(50, settings.retro.max);

        assertEquals(List.of("bikobatanari.art","wiki.xxiivv.com"), settings.small.domains);
        assertEquals(10, settings.small.max);

        // The limits of the last two sections were written to the document but
        // previously never checked
        assertEquals(List.of("%edu"), settings.academia.domains);
        assertEquals(101, settings.academia.max);

        assertEquals(List.of("memex.marginalia.nu"), settings.standard.domains);
        assertEquals(23, settings.standard.max);
    }
}