Extracted ranking algorithms to separate directory and made them configurable

vlofgren 2022-05-19 19:13:41 +02:00
parent 1012de3135
commit ccc5a07081
22 changed files with 139 additions and 42 deletions

View File

@@ -1,4 +1,4 @@
-package nu.marginalia.wmsa.edge.index.service.util.ranking;
+package nu.marginalia.util.ranking;
import com.zaxxer.hikari.HikariDataSource;
import gnu.trove.list.TIntList;

View File

@@ -1,4 +1,4 @@
-package nu.marginalia.wmsa.edge.index.service.util.ranking;
+package nu.marginalia.util.ranking;
import com.zaxxer.hikari.HikariDataSource;

View File

@@ -1,8 +1,7 @@
-package nu.marginalia.wmsa.edge.index.service.util.ranking;
+package nu.marginalia.util.ranking;
import com.zaxxer.hikari.HikariDataSource;
import gnu.trove.list.array.TIntArrayList;
import java.io.IOException;

View File

@@ -1,4 +1,4 @@
-package nu.marginalia.wmsa.edge.index.service.util.ranking;
+package nu.marginalia.util.ranking;
import com.zaxxer.hikari.HikariDataSource;

View File

@@ -1,4 +1,4 @@
-package nu.marginalia.wmsa.edge.index.service.util.ranking;
+package nu.marginalia.util.ranking;
import com.zaxxer.hikari.HikariDataSource;

View File

@@ -1,4 +1,4 @@
-package nu.marginalia.wmsa.edge.index.service.util.ranking;
+package nu.marginalia.util.ranking;
import com.zaxxer.hikari.HikariDataSource;
import gnu.trove.list.TIntList;

View File

@@ -1,4 +1,4 @@
-package nu.marginalia.wmsa.edge.index.service.util.ranking.old;
+package nu.marginalia.util.ranking.old;
import com.zaxxer.hikari.HikariDataSource;

View File

@@ -1,4 +1,4 @@
-package nu.marginalia.wmsa.edge.index.service.util.ranking.old;
+package nu.marginalia.util.ranking.old;
import com.zaxxer.hikari.HikariDataSource;

View File

@@ -1,4 +1,4 @@
-package nu.marginalia.wmsa.edge.index.service.util.ranking.tool;
+package nu.marginalia.util.ranking.tool;
import lombok.AllArgsConstructor;
import lombok.Getter;

View File

@@ -1,10 +1,9 @@
-package nu.marginalia.wmsa.edge.index.service.util.ranking.tool;
+package nu.marginalia.util.ranking.tool;
import com.zaxxer.hikari.HikariDataSource;
import gnu.trove.list.TIntList;
import gnu.trove.list.array.TIntArrayList;
import gnu.trove.map.hash.TIntDoubleHashMap;
import gnu.trove.map.hash.TIntIntHashMap;
import gnu.trove.map.hash.TIntObjectHashMap;
import gnu.trove.set.hash.TIntHashSet;
@@ -13,9 +12,9 @@ import it.unimi.dsi.fastutil.ints.IntComparator;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.SneakyThrows;
+import nu.marginalia.util.ranking.RankingAlgorithm;
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl;
-import nu.marginalia.wmsa.edge.index.service.util.ranking.RankingAlgorithm;
import org.jetbrains.annotations.NotNull;
import org.mariadb.jdbc.Driver;
import org.slf4j.Logger;

View File

@@ -1,8 +1,8 @@
-package nu.marginalia.wmsa.edge.index.service.util.ranking.tool;
+package nu.marginalia.util.ranking.tool;
import lombok.SneakyThrows;
+import nu.marginalia.util.ranking.AcademiaRank;
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
-import nu.marginalia.wmsa.edge.index.service.util.ranking.AcademiaRank;
import org.mariadb.jdbc.Driver;
import java.io.IOException;

View File

@@ -1,9 +1,9 @@
-package nu.marginalia.wmsa.edge.index.service.util.ranking.tool;
+package nu.marginalia.util.ranking.tool;
import com.zaxxer.hikari.HikariDataSource;
import lombok.SneakyThrows;
+import nu.marginalia.util.ranking.BuggyStandardPageRank;
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
-import nu.marginalia.wmsa.edge.index.service.util.ranking.BuggyStandardPageRank;
import org.mariadb.jdbc.Driver;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

View File

@@ -1,9 +1,9 @@
-package nu.marginalia.wmsa.edge.index.service.util.ranking.tool;
+package nu.marginalia.util.ranking.tool;
import com.zaxxer.hikari.HikariDataSource;
import lombok.SneakyThrows;
+import nu.marginalia.util.ranking.BetterReversePageRank;
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
-import nu.marginalia.wmsa.edge.index.service.util.ranking.BetterReversePageRank;
import org.mariadb.jdbc.Driver;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

View File

@@ -10,7 +10,7 @@ import nu.marginalia.wmsa.configuration.module.DatabaseModule;
import nu.marginalia.wmsa.edge.crawler.fetcher.HttpFetcher;
import nu.marginalia.wmsa.edge.data.dao.EdgeDataStoreDaoImpl;
import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl;
-import nu.marginalia.wmsa.edge.index.service.util.ranking.BuggyStandardPageRank;
+import nu.marginalia.util.ranking.BuggyStandardPageRank;
import nu.marginalia.wmsa.edge.model.EdgeDomain;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import org.jsoup.Jsoup;

View File

@@ -6,11 +6,10 @@ import com.google.common.hash.Hashing;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import com.zaxxer.hikari.HikariDataSource;
import gnu.trove.list.array.TIntArrayList;
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
import nu.marginalia.wmsa.edge.crawling.model.CrawlingSpecification;
import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl;
-import nu.marginalia.wmsa.edge.index.service.util.ranking.BetterReversePageRank;
+import nu.marginalia.util.ranking.BetterReversePageRank;
import nu.marginalia.wmsa.edge.model.EdgeDomain;
import nu.marginalia.wmsa.edge.model.EdgeId;
import org.mariadb.jdbc.Driver;
@@ -24,7 +23,6 @@ import java.sql.Connection;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.*;
import java.util.stream.Stream;
public class CrawlJobExtractorPageRankMain {

View File

@@ -1,12 +1,28 @@
package nu.marginalia.wmsa.edge.index;
import com.google.inject.AbstractModule;
import com.google.inject.Provides;
import com.google.inject.name.Names;
import nu.marginalia.wmsa.configuration.WmsaHome;
import nu.marginalia.wmsa.edge.index.model.RankingSettings;
import org.yaml.snakeyaml.Yaml;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
public class EdgeIndexModule extends AbstractModule {
    public void configure() {
        bind(Long.class).annotatedWith(Names.named("edge-dictionary-hash-map-size")).toInstance(1L << 31);
    }

    @Provides
    public RankingSettings rankingSettings() {
        Path dir = WmsaHome.get().resolve("conf/ranking-settings.yaml");
        return RankingSettings.from(dir);
    }
}
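The commit adds the provider above but no sample configuration file. Assuming the keys declared by RankingSettings (next file) and the domain lists that were previously hard-coded in SearchIndexDao, a conf/ranking-settings.yaml could plausibly look like the following sketch (illustrative only, not part of the commit):

# Hypothetical conf/ranking-settings.yaml; the seed domains mirror the old hard-coded defaults.
retro:
  - "www.rep.routledge.com"
  - "www.personal.kent.edu"
  - "memex.marginalia.nu"
  # ... the previous default listed several more retro seed domains
small:
  - "bikobatanari.art"
  - "sadgrl.online"
  - "wiki.xxiivv.com"
  - "%neocities.org"
academia:
  - "%edu"
standard:
  - "memex.marginalia.nu"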

View File

@@ -0,0 +1,28 @@
package nu.marginalia.wmsa.edge.index.model;
import lombok.ToString;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.yaml.snakeyaml.Yaml;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.List;
@ToString
public class RankingSettings {
    public List<String> small;
    public List<String> retro;
    public List<String> standard;
    public List<String> academia;

    public static RankingSettings from(Path dir) {
        try {
            return new Yaml().loadAs(Files.readString(dir), RankingSettings.class);
        }
        catch (IOException ex) {
            throw new RuntimeException("Failed to load " + dir, ex);
        }
    }
}
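As a usage sketch: RankingSettings is a plain bean that SnakeYAML's loadAs() instantiates through its no-args constructor and populates via its public fields. Elsewhere in this changeset it is either loaded from the YAML file under WmsaHome (EdgeIndexModule) or constructed empty (IndexMergerMain). A minimal illustration, assuming the classes and path shown in this commit; the wrapper class name here is hypothetical:

// RankingSettingsUsageSketch.java, an editorial illustration, not part of the commit.
import nu.marginalia.wmsa.configuration.WmsaHome;
import nu.marginalia.wmsa.edge.index.model.RankingSettings;

import java.nio.file.Path;

public class RankingSettingsUsageSketch {
    public static void main(String[] args) {
        // Production wiring: EdgeIndexModule resolves the file relative to WmsaHome.
        Path yaml = WmsaHome.get().resolve("conf/ranking-settings.yaml");
        RankingSettings fromFile = RankingSettings.from(yaml);

        // IndexMergerMain sidesteps the file and passes an empty instance;
        // the same public no-args constructor is what SnakeYAML relies on when parsing.
        RankingSettings empty = new RankingSettings();

        // Lombok's @ToString makes both easy to log for inspection.
        System.out.println(fromFile + " / " + empty);
    }
}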

View File

@@ -7,20 +7,27 @@ import gnu.trove.list.TIntList;
import gnu.trove.list.array.TIntArrayList;
import gnu.trove.set.hash.TIntHashSet;
import lombok.SneakyThrows;
+import nu.marginalia.util.ranking.BetterReversePageRank;
+import nu.marginalia.util.ranking.BetterStandardPageRank;
+import nu.marginalia.util.ranking.BuggyStandardPageRank;
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
-import nu.marginalia.wmsa.edge.index.service.util.ranking.*;
+import nu.marginalia.wmsa.edge.index.model.RankingSettings;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

@Singleton
public class SearchIndexDao {
    private final HikariDataSource dataSource;
+   private RankingSettings rankingSettings;
    private final Logger logger = LoggerFactory.getLogger(getClass());

    @Inject
-   public SearchIndexDao(HikariDataSource dataSource)
+   public SearchIndexDao(HikariDataSource dataSource,
+                         RankingSettings rankingSettings)
    {
        this.dataSource = dataSource;
+       this.rankingSettings = rankingSettings;
+       logger.info("SearchIndexDao ranking settings = {}", rankingSettings);
    }
@SneakyThrows
@@ -71,14 +78,14 @@
    }

    @SneakyThrows
-   public TIntList getDomainsByRealPageRank() {
-       var spr = new BetterStandardPageRank(dataSource,"www.rep.routledge.com", "www.personal.kent.edu", "xroads.virginia.edu", "classics.mit.edu", "faculty.washington.edu", "monadnock.net", "memex.marginalia.nu", "wiki.xxiivv.com", "bikobatanari.art", "sadgrl.online", "lileks.com");
+   public TIntList getRetroDomains() {
+       var spr = new BetterStandardPageRank(dataSource,rankingSettings.retro.toArray(String[]::new));
        return spr.pageRankWithPeripheralNodes(spr.size()/2, false);
    }

    @SneakyThrows
    public TIntList getSmallWebDomains() {
-       var rpr = new BetterReversePageRank(new DatabaseModule().provideConnection(), "bikobatanari.art", "sadgrl.online", "wiki.xxiivv.com", "%neocities.org");
+       var rpr = new BetterReversePageRank(new DatabaseModule().provideConnection(), rankingSettings.small.toArray(String[]::new));
        rpr.setMaxKnownUrls(750);
@@ -87,13 +94,13 @@
    @SneakyThrows
    public TIntList getAcademiaDomains() {
-       var spr = new BetterStandardPageRank(new DatabaseModule().provideConnection(), "%edu");
+       var spr = new BetterStandardPageRank(new DatabaseModule().provideConnection(), rankingSettings.academia.toArray(String[]::new));
        return spr.pageRankWithPeripheralNodes(spr.size()/2, false);
    }

    @SneakyThrows
-   public TIntList getDomainsByStandardPageRank() {
-       var spr = new BuggyStandardPageRank(dataSource,"memex.marginalia.nu");
+   public TIntList getStandardDomains() {
+       var spr = new BuggyStandardPageRank(dataSource,rankingSettings.standard.toArray(String[]::new));
        return spr.pageRankWithPeripheralNodes(spr.size()/2, false);
    }

View File

@@ -23,7 +23,7 @@ public class SearchIndexPartitioner {
    private SearchEngineRanking retroRanking = null;
    private SearchEngineRanking smallWebRanking = null;
-   private SearchEngineRanking prWebRanking = null;
+   private SearchEngineRanking standardRanking = null;
    private SearchEngineRanking specialDomainRanking = null;
    private SearchEngineRanking academiaRanking = null;
@@ -69,16 +69,16 @@
        logger.info("Fetching domains");

-       var retroDomains = dao.getDomainsByRealPageRank();
+       var retroDomains = dao.getRetroDomains();
        var smallWebDomains = dao.getSmallWebDomains();
        var academiaDomains = dao.getAcademiaDomains();
-       var prWebDomains = dao.getDomainsByStandardPageRank();
+       var standardDomains = dao.getStandardDomains();
        var specialDomains = dao.getSpecialDomains();

        logger.info("Got {} retro domains", retroDomains.size());
        logger.info("Got {} small domains", smallWebDomains.size());
        logger.info("Got {} academia domains", academiaDomains.size());
-       logger.info("Got {} corpo domains", prWebDomains.size());
+       logger.info("Got {} standard domains", standardDomains.size());
        logger.info("Got {} special domains", specialDomains.size());

        var lock = rwl.writeLock();
@@ -87,7 +87,7 @@
        retroRanking = new SearchEngineRanking(0, retroDomains, 0.2, 1);
        smallWebRanking = new SearchEngineRanking(2, smallWebDomains, 0.15);
        academiaRanking = new SearchEngineRanking(3, academiaDomains, 1);
-       prWebRanking = new SearchEngineRanking(4, prWebDomains, 0.2, 1);
+       standardRanking = new SearchEngineRanking(4, standardDomains, 0.2, 1);
        specialDomainRanking = new SearchEngineRanking(6, specialDomains, 1);

        logger.info("Finished building partitions table");
    }
@@ -112,7 +112,7 @@
            return true;
        if (academiaRanking.hasBucket(bucketId, domainId))
            return true;
-       if (prWebRanking.hasBucket(bucketId, domainId))
+       if (standardRanking.hasBucket(bucketId, domainId))
            return true;
        if (specialDomainRanking.hasBucket(bucketId, domainId))
            return true;
@@ -150,8 +150,8 @@
        if (academiaRanking != null && academiaRanking.ownsBucket(bucketId)) {
            return academiaRanking.translateId(id);
        }
-       if (prWebRanking != null && prWebRanking.ownsBucket(bucketId)) {
-           return prWebRanking.translateId(id);
+       if (standardRanking != null && standardRanking.ownsBucket(bucketId)) {
+           return standardRanking.translateId(id);
        }
        if (specialDomainRanking != null && specialDomainRanking.ownsBucket(bucketId)) {
            return specialDomainRanking.translateId(id);

View File

@@ -6,6 +6,7 @@ import lombok.SneakyThrows;
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklist;
import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl;
+import nu.marginalia.wmsa.edge.index.model.RankingSettings;
import nu.marginalia.wmsa.edge.index.service.SearchIndexDao;
import nu.marginalia.wmsa.edge.index.service.query.SearchIndexPartitioner;
import org.mariadb.jdbc.Driver;
@@ -58,7 +59,7 @@ public class IndexMergerMain {
        }

        var hikari = new DatabaseModule().provideConnection();
-       var partitioner = new SearchIndexPartitioner(new SearchIndexDao(hikari));
+       var partitioner = new SearchIndexPartitioner(new SearchIndexDao(hikari, new RankingSettings()));
        var blacklist = new EdgeDomainBlacklistImpl(hikari);

        new IndexMergerMain(file1, file2, outputFile, partitioner, blacklist);

View File

@@ -9,8 +9,8 @@ import nu.marginalia.wmsa.edge.crawler.domain.language.conf.LanguageModels;
import nu.marginalia.wmsa.edge.crawler.domain.language.processing.model.WordRep;
import nu.marginalia.wmsa.edge.crawler.domain.language.processing.model.WordSpan;
import nu.marginalia.wmsa.edge.crawler.domain.language.processing.model.tag.WordSeparator;
-import nu.marginalia.wmsa.edge.index.service.util.ranking.BuggyReversePageRank;
-import nu.marginalia.wmsa.edge.index.service.util.ranking.BuggyStandardPageRank;
+import nu.marginalia.util.ranking.BuggyReversePageRank;
+import nu.marginalia.util.ranking.BuggyStandardPageRank;
import nu.marginalia.wmsa.edge.integration.wikipedia.WikipediaReader;
import nu.marginalia.wmsa.edge.model.EdgeDomain;
import nu.marginalia.wmsa.edge.model.EdgeUrl;

View File

@@ -0,0 +1,49 @@
package nu.marginalia.wmsa.edge.index.model;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.List;
import static org.junit.jupiter.api.Assertions.*;
class RankingSettingsTest {
    Path tempFile;

    @BeforeEach
    void setUp() throws IOException {
        tempFile = Files.createTempFile(getClass().getSimpleName(), ".tmp");
    }

    @AfterEach
    void tearDown() throws IOException {
        Files.delete(tempFile);
    }

    @Test
    void testParseRankingSettings() throws IOException {
        Files.writeString(tempFile, """
                retro:
                - "www.rep.routledge.com"
                - "www.personal.kent.edu"
                small:
                - "bikobatanari.art"
                - "wiki.xxiivv.com"
                academia:
                - "%edu"
                standard:
                - "memex.marginalia.nu"
                """);

        var settings = RankingSettings.from(tempFile);
        assertEquals(List.of("www.rep.routledge.com","www.personal.kent.edu"), settings.retro);
        assertEquals(List.of("bikobatanari.art","wiki.xxiivv.com"), settings.small);
        assertEquals(List.of("%edu"), settings.academia);
        assertEquals(List.of("memex.marginalia.nu"), settings.standard);
    }
}