Extracted ranking algorithms to separate directory and made them configurable

This commit is contained in:
vlofgren 2022-05-19 19:13:41 +02:00
parent 1012de3135
commit ccc5a07081
22 changed files with 139 additions and 42 deletions

View File

@ -1,4 +1,4 @@
package nu.marginalia.wmsa.edge.index.service.util.ranking; package nu.marginalia.util.ranking;
import com.zaxxer.hikari.HikariDataSource; import com.zaxxer.hikari.HikariDataSource;
import gnu.trove.list.TIntList; import gnu.trove.list.TIntList;

View File

@ -1,4 +1,4 @@
package nu.marginalia.wmsa.edge.index.service.util.ranking; package nu.marginalia.util.ranking;
import com.zaxxer.hikari.HikariDataSource; import com.zaxxer.hikari.HikariDataSource;

View File

@ -1,8 +1,7 @@
package nu.marginalia.wmsa.edge.index.service.util.ranking; package nu.marginalia.util.ranking;
import com.zaxxer.hikari.HikariDataSource; import com.zaxxer.hikari.HikariDataSource;
import gnu.trove.list.array.TIntArrayList;
import java.io.IOException; import java.io.IOException;

View File

@ -1,4 +1,4 @@
package nu.marginalia.wmsa.edge.index.service.util.ranking; package nu.marginalia.util.ranking;
import com.zaxxer.hikari.HikariDataSource; import com.zaxxer.hikari.HikariDataSource;

View File

@ -1,4 +1,4 @@
package nu.marginalia.wmsa.edge.index.service.util.ranking; package nu.marginalia.util.ranking;
import com.zaxxer.hikari.HikariDataSource; import com.zaxxer.hikari.HikariDataSource;

View File

@ -1,4 +1,4 @@
package nu.marginalia.wmsa.edge.index.service.util.ranking; package nu.marginalia.util.ranking;
import com.zaxxer.hikari.HikariDataSource; import com.zaxxer.hikari.HikariDataSource;
import gnu.trove.list.TIntList; import gnu.trove.list.TIntList;

View File

@ -1,4 +1,4 @@
package nu.marginalia.wmsa.edge.index.service.util.ranking.old; package nu.marginalia.util.ranking.old;
import com.zaxxer.hikari.HikariDataSource; import com.zaxxer.hikari.HikariDataSource;

View File

@ -1,4 +1,4 @@
package nu.marginalia.wmsa.edge.index.service.util.ranking.old; package nu.marginalia.util.ranking.old;
import com.zaxxer.hikari.HikariDataSource; import com.zaxxer.hikari.HikariDataSource;

View File

@ -1,4 +1,4 @@
package nu.marginalia.wmsa.edge.index.service.util.ranking.tool; package nu.marginalia.util.ranking.tool;
import lombok.AllArgsConstructor; import lombok.AllArgsConstructor;
import lombok.Getter; import lombok.Getter;

View File

@ -1,10 +1,9 @@
package nu.marginalia.wmsa.edge.index.service.util.ranking.tool; package nu.marginalia.util.ranking.tool;
import com.zaxxer.hikari.HikariDataSource; import com.zaxxer.hikari.HikariDataSource;
import gnu.trove.list.TIntList; import gnu.trove.list.TIntList;
import gnu.trove.list.array.TIntArrayList; import gnu.trove.list.array.TIntArrayList;
import gnu.trove.map.hash.TIntDoubleHashMap;
import gnu.trove.map.hash.TIntIntHashMap; import gnu.trove.map.hash.TIntIntHashMap;
import gnu.trove.map.hash.TIntObjectHashMap; import gnu.trove.map.hash.TIntObjectHashMap;
import gnu.trove.set.hash.TIntHashSet; import gnu.trove.set.hash.TIntHashSet;
@ -13,9 +12,9 @@ import it.unimi.dsi.fastutil.ints.IntComparator;
import lombok.AllArgsConstructor; import lombok.AllArgsConstructor;
import lombok.Data; import lombok.Data;
import lombok.SneakyThrows; import lombok.SneakyThrows;
import nu.marginalia.util.ranking.RankingAlgorithm;
import nu.marginalia.wmsa.configuration.module.DatabaseModule; import nu.marginalia.wmsa.configuration.module.DatabaseModule;
import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl; import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl;
import nu.marginalia.wmsa.edge.index.service.util.ranking.RankingAlgorithm;
import org.jetbrains.annotations.NotNull; import org.jetbrains.annotations.NotNull;
import org.mariadb.jdbc.Driver; import org.mariadb.jdbc.Driver;
import org.slf4j.Logger; import org.slf4j.Logger;

View File

@ -1,8 +1,8 @@
package nu.marginalia.wmsa.edge.index.service.util.ranking.tool; package nu.marginalia.util.ranking.tool;
import lombok.SneakyThrows; import lombok.SneakyThrows;
import nu.marginalia.util.ranking.AcademiaRank;
import nu.marginalia.wmsa.configuration.module.DatabaseModule; import nu.marginalia.wmsa.configuration.module.DatabaseModule;
import nu.marginalia.wmsa.edge.index.service.util.ranking.AcademiaRank;
import org.mariadb.jdbc.Driver; import org.mariadb.jdbc.Driver;
import java.io.IOException; import java.io.IOException;

View File

@ -1,9 +1,9 @@
package nu.marginalia.wmsa.edge.index.service.util.ranking.tool; package nu.marginalia.util.ranking.tool;
import com.zaxxer.hikari.HikariDataSource; import com.zaxxer.hikari.HikariDataSource;
import lombok.SneakyThrows; import lombok.SneakyThrows;
import nu.marginalia.util.ranking.BuggyStandardPageRank;
import nu.marginalia.wmsa.configuration.module.DatabaseModule; import nu.marginalia.wmsa.configuration.module.DatabaseModule;
import nu.marginalia.wmsa.edge.index.service.util.ranking.BuggyStandardPageRank;
import org.mariadb.jdbc.Driver; import org.mariadb.jdbc.Driver;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;

View File

@ -1,9 +1,9 @@
package nu.marginalia.wmsa.edge.index.service.util.ranking.tool; package nu.marginalia.util.ranking.tool;
import com.zaxxer.hikari.HikariDataSource; import com.zaxxer.hikari.HikariDataSource;
import lombok.SneakyThrows; import lombok.SneakyThrows;
import nu.marginalia.util.ranking.BetterReversePageRank;
import nu.marginalia.wmsa.configuration.module.DatabaseModule; import nu.marginalia.wmsa.configuration.module.DatabaseModule;
import nu.marginalia.wmsa.edge.index.service.util.ranking.BetterReversePageRank;
import org.mariadb.jdbc.Driver; import org.mariadb.jdbc.Driver;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;

View File

@ -10,7 +10,7 @@ import nu.marginalia.wmsa.configuration.module.DatabaseModule;
import nu.marginalia.wmsa.edge.crawler.fetcher.HttpFetcher; import nu.marginalia.wmsa.edge.crawler.fetcher.HttpFetcher;
import nu.marginalia.wmsa.edge.data.dao.EdgeDataStoreDaoImpl; import nu.marginalia.wmsa.edge.data.dao.EdgeDataStoreDaoImpl;
import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl; import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl;
import nu.marginalia.wmsa.edge.index.service.util.ranking.BuggyStandardPageRank; import nu.marginalia.util.ranking.BuggyStandardPageRank;
import nu.marginalia.wmsa.edge.model.EdgeDomain; import nu.marginalia.wmsa.edge.model.EdgeDomain;
import nu.marginalia.wmsa.edge.model.EdgeUrl; import nu.marginalia.wmsa.edge.model.EdgeUrl;
import org.jsoup.Jsoup; import org.jsoup.Jsoup;

View File

@ -6,11 +6,10 @@ import com.google.common.hash.Hashing;
import com.google.gson.Gson; import com.google.gson.Gson;
import com.google.gson.GsonBuilder; import com.google.gson.GsonBuilder;
import com.zaxxer.hikari.HikariDataSource; import com.zaxxer.hikari.HikariDataSource;
import gnu.trove.list.array.TIntArrayList;
import nu.marginalia.wmsa.configuration.module.DatabaseModule; import nu.marginalia.wmsa.configuration.module.DatabaseModule;
import nu.marginalia.wmsa.edge.crawling.model.CrawlingSpecification; import nu.marginalia.wmsa.edge.crawling.model.CrawlingSpecification;
import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl; import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl;
import nu.marginalia.wmsa.edge.index.service.util.ranking.BetterReversePageRank; import nu.marginalia.util.ranking.BetterReversePageRank;
import nu.marginalia.wmsa.edge.model.EdgeDomain; import nu.marginalia.wmsa.edge.model.EdgeDomain;
import nu.marginalia.wmsa.edge.model.EdgeId; import nu.marginalia.wmsa.edge.model.EdgeId;
import org.mariadb.jdbc.Driver; import org.mariadb.jdbc.Driver;
@ -24,7 +23,6 @@ import java.sql.Connection;
import java.sql.ResultSet; import java.sql.ResultSet;
import java.sql.SQLException; import java.sql.SQLException;
import java.util.*; import java.util.*;
import java.util.stream.Stream;
public class CrawlJobExtractorPageRankMain { public class CrawlJobExtractorPageRankMain {

View File

@ -1,12 +1,28 @@
package nu.marginalia.wmsa.edge.index; package nu.marginalia.wmsa.edge.index;
import com.google.inject.AbstractModule; import com.google.inject.AbstractModule;
import com.google.inject.Provides;
import com.google.inject.name.Names; import com.google.inject.name.Names;
import nu.marginalia.wmsa.configuration.WmsaHome;
import nu.marginalia.wmsa.edge.index.model.RankingSettings;
import org.yaml.snakeyaml.Yaml;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
public class EdgeIndexModule extends AbstractModule { public class EdgeIndexModule extends AbstractModule {
public void configure() { public void configure() {
bind(Long.class).annotatedWith(Names.named("edge-dictionary-hash-map-size")).toInstance(1L << 31); bind(Long.class).annotatedWith(Names.named("edge-dictionary-hash-map-size")).toInstance(1L << 31);
} }
@Provides
public RankingSettings rankingSettings() {
Path dir = WmsaHome.get().resolve("conf/ranking-settings.yaml");
return RankingSettings.from(dir);
}
} }

View File

@ -0,0 +1,28 @@
package nu.marginalia.wmsa.edge.index.model;
import lombok.ToString;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.yaml.snakeyaml.Yaml;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.List;
/**
 * Configurable lists of domain strings for the ranking sets, normally read
 * from a YAML file via {@link #from(Path)}.
 *
 * <p>Each public field corresponds to a top-level YAML key of the same name
 * holding a list of strings. Every list defaults to empty rather than
 * {@code null}, so an instance created directly with {@code new RankingSettings()}
 * (or loaded from a file that omits a key) can be read without null checks.
 */
@ToString
public class RankingSettings {
    /** Entries listed under the {@code small} key; empty when not configured. */
    public List<String> small = List.of();
    /** Entries listed under the {@code retro} key; empty when not configured. */
    public List<String> retro = List.of();
    /** Entries listed under the {@code standard} key; empty when not configured. */
    public List<String> standard = List.of();
    /** Entries listed under the {@code academia} key; empty when not configured. */
    public List<String> academia = List.of();

    /**
     * Reads ranking settings from a YAML file.
     *
     * @param dir path of the YAML file to read (despite the name, this is the
     *            file itself, not a directory)
     * @return the parsed settings, never {@code null}; an empty YAML document
     *         yields settings whose lists are all empty
     * @throws RuntimeException if the file cannot be read; the underlying
     *                          {@link IOException} is preserved as the cause
     */
    public static RankingSettings from(Path dir) {
        final RankingSettings loaded;
        try {
            loaded = new Yaml().loadAs(Files.readString(dir), RankingSettings.class);
        }
        catch (IOException ex) {
            throw new RuntimeException("Failed to load " + dir, ex);
        }

        // SnakeYAML returns null for an empty document; fall back to empty settings
        // instead of handing callers a null they will dereference later.
        if (loaded == null) {
            return new RankingSettings();
        }

        // A key that is present without a value (e.g. "small:") is set to null by
        // the parser, overriding the field default, so normalise after loading.
        loaded.small = orEmpty(loaded.small);
        loaded.retro = orEmpty(loaded.retro);
        loaded.standard = orEmpty(loaded.standard);
        loaded.academia = orEmpty(loaded.academia);
        return loaded;
    }

    /** Returns {@code values}, or an empty list when {@code values} is {@code null}. */
    private static List<String> orEmpty(List<String> values) {
        return values != null ? values : List.of();
    }
}

View File

@ -7,20 +7,27 @@ import gnu.trove.list.TIntList;
import gnu.trove.list.array.TIntArrayList; import gnu.trove.list.array.TIntArrayList;
import gnu.trove.set.hash.TIntHashSet; import gnu.trove.set.hash.TIntHashSet;
import lombok.SneakyThrows; import lombok.SneakyThrows;
import nu.marginalia.util.ranking.BetterReversePageRank;
import nu.marginalia.util.ranking.BetterStandardPageRank;
import nu.marginalia.util.ranking.BuggyStandardPageRank;
import nu.marginalia.wmsa.configuration.module.DatabaseModule; import nu.marginalia.wmsa.configuration.module.DatabaseModule;
import nu.marginalia.wmsa.edge.index.service.util.ranking.*; import nu.marginalia.wmsa.edge.index.model.RankingSettings;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
@Singleton @Singleton
public class SearchIndexDao { public class SearchIndexDao {
private final HikariDataSource dataSource; private final HikariDataSource dataSource;
private RankingSettings rankingSettings;
private final Logger logger = LoggerFactory.getLogger(getClass()); private final Logger logger = LoggerFactory.getLogger(getClass());
@Inject @Inject
public SearchIndexDao(HikariDataSource dataSource) public SearchIndexDao(HikariDataSource dataSource,
RankingSettings rankingSettings)
{ {
this.dataSource = dataSource; this.dataSource = dataSource;
this.rankingSettings = rankingSettings;
logger.info("SearchIndexDao ranking settings = {}", rankingSettings);
} }
@SneakyThrows @SneakyThrows
@ -71,14 +78,14 @@ public class SearchIndexDao {
} }
@SneakyThrows @SneakyThrows
public TIntList getDomainsByRealPageRank() { public TIntList getRetroDomains() {
var spr = new BetterStandardPageRank(dataSource,"www.rep.routledge.com", "www.personal.kent.edu", "xroads.virginia.edu", "classics.mit.edu", "faculty.washington.edu", "monadnock.net", "memex.marginalia.nu", "wiki.xxiivv.com", "bikobatanari.art", "sadgrl.online", "lileks.com"); var spr = new BetterStandardPageRank(dataSource,rankingSettings.retro.toArray(String[]::new));
return spr.pageRankWithPeripheralNodes(spr.size()/2, false); return spr.pageRankWithPeripheralNodes(spr.size()/2, false);
} }
@SneakyThrows @SneakyThrows
public TIntList getSmallWebDomains() { public TIntList getSmallWebDomains() {
var rpr = new BetterReversePageRank(new DatabaseModule().provideConnection(), "bikobatanari.art", "sadgrl.online", "wiki.xxiivv.com", "%neocities.org"); var rpr = new BetterReversePageRank(new DatabaseModule().provideConnection(), rankingSettings.small.toArray(String[]::new));
rpr.setMaxKnownUrls(750); rpr.setMaxKnownUrls(750);
@ -87,13 +94,13 @@ public class SearchIndexDao {
@SneakyThrows @SneakyThrows
public TIntList getAcademiaDomains() { public TIntList getAcademiaDomains() {
var spr = new BetterStandardPageRank(new DatabaseModule().provideConnection(), "%edu"); var spr = new BetterStandardPageRank(new DatabaseModule().provideConnection(), rankingSettings.academia.toArray(String[]::new));
return spr.pageRankWithPeripheralNodes(spr.size()/2, false); return spr.pageRankWithPeripheralNodes(spr.size()/2, false);
} }
@SneakyThrows @SneakyThrows
public TIntList getDomainsByStandardPageRank() { public TIntList getStandardDomains() {
var spr = new BuggyStandardPageRank(dataSource,"memex.marginalia.nu"); var spr = new BuggyStandardPageRank(dataSource,rankingSettings.standard.toArray(String[]::new));
return spr.pageRankWithPeripheralNodes(spr.size()/2, false); return spr.pageRankWithPeripheralNodes(spr.size()/2, false);
} }

View File

@ -23,7 +23,7 @@ public class SearchIndexPartitioner {
private SearchEngineRanking retroRanking = null; private SearchEngineRanking retroRanking = null;
private SearchEngineRanking smallWebRanking = null; private SearchEngineRanking smallWebRanking = null;
private SearchEngineRanking prWebRanking = null; private SearchEngineRanking standardRanking = null;
private SearchEngineRanking specialDomainRanking = null; private SearchEngineRanking specialDomainRanking = null;
private SearchEngineRanking academiaRanking = null; private SearchEngineRanking academiaRanking = null;
@ -69,16 +69,16 @@ public class SearchIndexPartitioner {
logger.info("Fetching domains"); logger.info("Fetching domains");
var retroDomains = dao.getDomainsByRealPageRank(); var retroDomains = dao.getRetroDomains();
var smallWebDomains = dao.getSmallWebDomains(); var smallWebDomains = dao.getSmallWebDomains();
var academiaDomains = dao.getAcademiaDomains(); var academiaDomains = dao.getAcademiaDomains();
var prWebDomains = dao.getDomainsByStandardPageRank(); var standardDomains = dao.getStandardDomains();
var specialDomains = dao.getSpecialDomains(); var specialDomains = dao.getSpecialDomains();
logger.info("Got {} retro domains", retroDomains.size()); logger.info("Got {} retro domains", retroDomains.size());
logger.info("Got {} small domains", smallWebDomains.size()); logger.info("Got {} small domains", smallWebDomains.size());
logger.info("Got {} academia domains", academiaDomains.size()); logger.info("Got {} academia domains", academiaDomains.size());
logger.info("Got {} corpo domains", prWebDomains.size()); logger.info("Got {} standard domains", standardDomains.size());
logger.info("Got {} special domains", specialDomains.size()); logger.info("Got {} special domains", specialDomains.size());
var lock = rwl.writeLock(); var lock = rwl.writeLock();
@ -87,7 +87,7 @@ public class SearchIndexPartitioner {
retroRanking = new SearchEngineRanking(0, retroDomains, 0.2, 1); retroRanking = new SearchEngineRanking(0, retroDomains, 0.2, 1);
smallWebRanking = new SearchEngineRanking(2, smallWebDomains, 0.15); smallWebRanking = new SearchEngineRanking(2, smallWebDomains, 0.15);
academiaRanking = new SearchEngineRanking(3, academiaDomains, 1); academiaRanking = new SearchEngineRanking(3, academiaDomains, 1);
prWebRanking = new SearchEngineRanking(4, prWebDomains, 0.2, 1); standardRanking = new SearchEngineRanking(4, standardDomains, 0.2, 1);
specialDomainRanking = new SearchEngineRanking(6, specialDomains, 1); specialDomainRanking = new SearchEngineRanking(6, specialDomains, 1);
logger.info("Finished building partitions table"); logger.info("Finished building partitions table");
} }
@ -112,7 +112,7 @@ public class SearchIndexPartitioner {
return true; return true;
if (academiaRanking.hasBucket(bucketId, domainId)) if (academiaRanking.hasBucket(bucketId, domainId))
return true; return true;
if (prWebRanking.hasBucket(bucketId, domainId)) if (standardRanking.hasBucket(bucketId, domainId))
return true; return true;
if (specialDomainRanking.hasBucket(bucketId, domainId)) if (specialDomainRanking.hasBucket(bucketId, domainId))
return true; return true;
@ -150,8 +150,8 @@ public class SearchIndexPartitioner {
if (academiaRanking != null && academiaRanking.ownsBucket(bucketId)) { if (academiaRanking != null && academiaRanking.ownsBucket(bucketId)) {
return academiaRanking.translateId(id); return academiaRanking.translateId(id);
} }
if (prWebRanking != null && prWebRanking.ownsBucket(bucketId)) { if (standardRanking != null && standardRanking.ownsBucket(bucketId)) {
return prWebRanking.translateId(id); return standardRanking.translateId(id);
} }
if (specialDomainRanking != null && specialDomainRanking.ownsBucket(bucketId)) { if (specialDomainRanking != null && specialDomainRanking.ownsBucket(bucketId)) {
return specialDomainRanking.translateId(id); return specialDomainRanking.translateId(id);

View File

@ -6,6 +6,7 @@ import lombok.SneakyThrows;
import nu.marginalia.wmsa.configuration.module.DatabaseModule; import nu.marginalia.wmsa.configuration.module.DatabaseModule;
import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklist; import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklist;
import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl; import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl;
import nu.marginalia.wmsa.edge.index.model.RankingSettings;
import nu.marginalia.wmsa.edge.index.service.SearchIndexDao; import nu.marginalia.wmsa.edge.index.service.SearchIndexDao;
import nu.marginalia.wmsa.edge.index.service.query.SearchIndexPartitioner; import nu.marginalia.wmsa.edge.index.service.query.SearchIndexPartitioner;
import org.mariadb.jdbc.Driver; import org.mariadb.jdbc.Driver;
@ -58,7 +59,7 @@ public class IndexMergerMain {
} }
var hikari = new DatabaseModule().provideConnection(); var hikari = new DatabaseModule().provideConnection();
var partitioner = new SearchIndexPartitioner(new SearchIndexDao(hikari)); var partitioner = new SearchIndexPartitioner(new SearchIndexDao(hikari, new RankingSettings()));
var blacklist = new EdgeDomainBlacklistImpl(hikari); var blacklist = new EdgeDomainBlacklistImpl(hikari);
new IndexMergerMain(file1, file2, outputFile, partitioner, blacklist); new IndexMergerMain(file1, file2, outputFile, partitioner, blacklist);

View File

@ -9,8 +9,8 @@ import nu.marginalia.wmsa.edge.crawler.domain.language.conf.LanguageModels;
import nu.marginalia.wmsa.edge.crawler.domain.language.processing.model.WordRep; import nu.marginalia.wmsa.edge.crawler.domain.language.processing.model.WordRep;
import nu.marginalia.wmsa.edge.crawler.domain.language.processing.model.WordSpan; import nu.marginalia.wmsa.edge.crawler.domain.language.processing.model.WordSpan;
import nu.marginalia.wmsa.edge.crawler.domain.language.processing.model.tag.WordSeparator; import nu.marginalia.wmsa.edge.crawler.domain.language.processing.model.tag.WordSeparator;
import nu.marginalia.wmsa.edge.index.service.util.ranking.BuggyReversePageRank; import nu.marginalia.util.ranking.BuggyReversePageRank;
import nu.marginalia.wmsa.edge.index.service.util.ranking.BuggyStandardPageRank; import nu.marginalia.util.ranking.BuggyStandardPageRank;
import nu.marginalia.wmsa.edge.integration.wikipedia.WikipediaReader; import nu.marginalia.wmsa.edge.integration.wikipedia.WikipediaReader;
import nu.marginalia.wmsa.edge.model.EdgeDomain; import nu.marginalia.wmsa.edge.model.EdgeDomain;
import nu.marginalia.wmsa.edge.model.EdgeUrl; import nu.marginalia.wmsa.edge.model.EdgeUrl;

View File

@ -0,0 +1,49 @@
package nu.marginalia.wmsa.edge.index.model;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.List;
import static org.junit.jupiter.api.Assertions.*;
/**
 * Checks that {@link RankingSettings#from(Path)} maps each top-level YAML key
 * onto the list field of the same name.
 */
class RankingSettingsTest {

    /** Scratch file the YAML under test is written to; created and deleted per test. */
    Path tempFile;

    /** Creates a fresh temporary file named after this test class. */
    @BeforeEach
    void setUp() throws IOException {
        final String prefix = getClass().getSimpleName();
        tempFile = Files.createTempFile(prefix, ".tmp");
    }

    /** Removes the temporary file created in {@link #setUp()}. */
    @AfterEach
    void tearDown() throws IOException {
        Files.delete(tempFile);
    }

    /** Writes a document with all four keys and verifies every list is read back in order. */
    @Test
    void testParseRankingSettings() throws IOException {
        final String yaml = """
                retro:
                - "www.rep.routledge.com"
                - "www.personal.kent.edu"
                small:
                - "bikobatanari.art"
                - "wiki.xxiivv.com"
                academia:
                - "%edu"
                standard:
                - "memex.marginalia.nu"
                """;
        Files.writeString(tempFile, yaml);

        final RankingSettings parsed = RankingSettings.from(tempFile);

        final List<String> expectedRetro = List.of("www.rep.routledge.com","www.personal.kent.edu");
        final List<String> expectedSmall = List.of("bikobatanari.art","wiki.xxiivv.com");
        final List<String> expectedAcademia = List.of("%edu");
        final List<String> expectedStandard = List.of("memex.marginalia.nu");

        assertEquals(expectedRetro, parsed.retro);
        assertEquals(expectedSmall, parsed.small);
        assertEquals(expectedAcademia, parsed.academia);
        assertEquals(expectedStandard, parsed.standard);
    }
}