Extracted ranking algorithms to separate directory and made them configurable
parent 1012de3135
commit ccc5a07081
@@ -1,4 +1,4 @@
-package nu.marginalia.wmsa.edge.index.service.util.ranking;
+package nu.marginalia.util.ranking;
 
 import com.zaxxer.hikari.HikariDataSource;
 import gnu.trove.list.TIntList;
@@ -1,4 +1,4 @@
-package nu.marginalia.wmsa.edge.index.service.util.ranking;
+package nu.marginalia.util.ranking;
 
 
 import com.zaxxer.hikari.HikariDataSource;
@@ -1,8 +1,7 @@
-package nu.marginalia.wmsa.edge.index.service.util.ranking;
+package nu.marginalia.util.ranking;
 
-
 import com.zaxxer.hikari.HikariDataSource;
 import gnu.trove.list.array.TIntArrayList;
 
 import java.io.IOException;
 
@@ -1,4 +1,4 @@
-package nu.marginalia.wmsa.edge.index.service.util.ranking;
+package nu.marginalia.util.ranking;
 
 
 import com.zaxxer.hikari.HikariDataSource;
@@ -1,4 +1,4 @@
-package nu.marginalia.wmsa.edge.index.service.util.ranking;
+package nu.marginalia.util.ranking;
 
 
 import com.zaxxer.hikari.HikariDataSource;
@@ -1,4 +1,4 @@
-package nu.marginalia.wmsa.edge.index.service.util.ranking;
+package nu.marginalia.util.ranking;
 
 import com.zaxxer.hikari.HikariDataSource;
 import gnu.trove.list.TIntList;
@@ -1,4 +1,4 @@
-package nu.marginalia.wmsa.edge.index.service.util.ranking.old;
+package nu.marginalia.util.ranking.old;
 
 
 import com.zaxxer.hikari.HikariDataSource;
@@ -1,4 +1,4 @@
-package nu.marginalia.wmsa.edge.index.service.util.ranking.old;
+package nu.marginalia.util.ranking.old;
 
 
 import com.zaxxer.hikari.HikariDataSource;
@@ -1,4 +1,4 @@
-package nu.marginalia.wmsa.edge.index.service.util.ranking.tool;
+package nu.marginalia.util.ranking.tool;
 
 import lombok.AllArgsConstructor;
 import lombok.Getter;
@@ -1,10 +1,9 @@
-package nu.marginalia.wmsa.edge.index.service.util.ranking.tool;
+package nu.marginalia.util.ranking.tool;
 
-
 import com.zaxxer.hikari.HikariDataSource;
 import gnu.trove.list.TIntList;
 import gnu.trove.list.array.TIntArrayList;
 import gnu.trove.map.hash.TIntDoubleHashMap;
 import gnu.trove.map.hash.TIntIntHashMap;
 import gnu.trove.map.hash.TIntObjectHashMap;
 import gnu.trove.set.hash.TIntHashSet;
@@ -13,9 +12,9 @@ import it.unimi.dsi.fastutil.ints.IntComparator;
 import lombok.AllArgsConstructor;
 import lombok.Data;
 import lombok.SneakyThrows;
+import nu.marginalia.util.ranking.RankingAlgorithm;
 import nu.marginalia.wmsa.configuration.module.DatabaseModule;
 import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl;
-import nu.marginalia.wmsa.edge.index.service.util.ranking.RankingAlgorithm;
 import org.jetbrains.annotations.NotNull;
 import org.mariadb.jdbc.Driver;
 import org.slf4j.Logger;
@@ -1,8 +1,8 @@
-package nu.marginalia.wmsa.edge.index.service.util.ranking.tool;
+package nu.marginalia.util.ranking.tool;
 
 import lombok.SneakyThrows;
+import nu.marginalia.util.ranking.AcademiaRank;
 import nu.marginalia.wmsa.configuration.module.DatabaseModule;
-import nu.marginalia.wmsa.edge.index.service.util.ranking.AcademiaRank;
 import org.mariadb.jdbc.Driver;
 
 import java.io.IOException;
@@ -1,9 +1,9 @@
-package nu.marginalia.wmsa.edge.index.service.util.ranking.tool;
+package nu.marginalia.util.ranking.tool;
 
 import com.zaxxer.hikari.HikariDataSource;
 import lombok.SneakyThrows;
+import nu.marginalia.util.ranking.BuggyStandardPageRank;
 import nu.marginalia.wmsa.configuration.module.DatabaseModule;
-import nu.marginalia.wmsa.edge.index.service.util.ranking.BuggyStandardPageRank;
 import org.mariadb.jdbc.Driver;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -1,9 +1,9 @@
-package nu.marginalia.wmsa.edge.index.service.util.ranking.tool;
+package nu.marginalia.util.ranking.tool;
 
 import com.zaxxer.hikari.HikariDataSource;
 import lombok.SneakyThrows;
+import nu.marginalia.util.ranking.BetterReversePageRank;
 import nu.marginalia.wmsa.configuration.module.DatabaseModule;
-import nu.marginalia.wmsa.edge.index.service.util.ranking.BetterReversePageRank;
 import org.mariadb.jdbc.Driver;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -10,7 +10,7 @@ import nu.marginalia.wmsa.configuration.module.DatabaseModule;
 import nu.marginalia.wmsa.edge.crawler.fetcher.HttpFetcher;
 import nu.marginalia.wmsa.edge.data.dao.EdgeDataStoreDaoImpl;
 import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl;
-import nu.marginalia.wmsa.edge.index.service.util.ranking.BuggyStandardPageRank;
+import nu.marginalia.util.ranking.BuggyStandardPageRank;
 import nu.marginalia.wmsa.edge.model.EdgeDomain;
 import nu.marginalia.wmsa.edge.model.EdgeUrl;
 import org.jsoup.Jsoup;
@@ -6,11 +6,10 @@ import com.google.common.hash.Hashing;
 import com.google.gson.Gson;
 import com.google.gson.GsonBuilder;
 import com.zaxxer.hikari.HikariDataSource;
 import gnu.trove.list.array.TIntArrayList;
 import nu.marginalia.wmsa.configuration.module.DatabaseModule;
 import nu.marginalia.wmsa.edge.crawling.model.CrawlingSpecification;
 import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl;
-import nu.marginalia.wmsa.edge.index.service.util.ranking.BetterReversePageRank;
+import nu.marginalia.util.ranking.BetterReversePageRank;
 import nu.marginalia.wmsa.edge.model.EdgeDomain;
 import nu.marginalia.wmsa.edge.model.EdgeId;
 import org.mariadb.jdbc.Driver;
@@ -24,7 +23,6 @@ import java.sql.Connection;
 import java.sql.ResultSet;
 import java.sql.SQLException;
 import java.util.*;
 import java.util.stream.Stream;
-
 
 public class CrawlJobExtractorPageRankMain {
@@ -1,12 +1,28 @@
 package nu.marginalia.wmsa.edge.index;
 
 import com.google.inject.AbstractModule;
+import com.google.inject.Provides;
 import com.google.inject.name.Names;
+import nu.marginalia.wmsa.configuration.WmsaHome;
+import nu.marginalia.wmsa.edge.index.model.RankingSettings;
+import org.yaml.snakeyaml.Yaml;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
 
 public class EdgeIndexModule extends AbstractModule {
 
+
+
     public void configure() {
         bind(Long.class).annotatedWith(Names.named("edge-dictionary-hash-map-size")).toInstance(1L << 31);
     }
 
+    @Provides
+    public RankingSettings rankingSettings() {
+        Path dir = WmsaHome.get().resolve("conf/ranking-settings.yaml");
+        return RankingSettings.from(dir);
+    }
+
 }
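For context, the rankingSettings() provider above resolves conf/ranking-settings.yaml under the WMSA home directory. That file itself is not part of this diff; a minimal sketch of its expected shape, using a subset of the domain lists that were previously hardcoded in SearchIndexDao (the values are illustrative only), would be:

    retro:
    - "www.rep.routledge.com"
    - "www.personal.kent.edu"
    - "memex.marginalia.nu"
    small:
    - "bikobatanari.art"
    - "sadgrl.online"
    - "wiki.xxiivv.com"
    - "%neocities.org"
    academia:
    - "%edu"
    standard:
    - "memex.marginalia.nu"

Each top-level key maps onto the corresponding public List<String> field of the RankingSettings class added below.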
@@ -0,0 +1,28 @@
+package nu.marginalia.wmsa.edge.index.model;
+
+import lombok.ToString;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.yaml.snakeyaml.Yaml;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.List;
+
+@ToString
+public class RankingSettings {
+    public List<String> small;
+    public List<String> retro;
+    public List<String> standard;
+    public List<String> academia;
+
+    public static RankingSettings from(Path dir) {
+        try {
+            return new Yaml().loadAs(Files.readString(dir), RankingSettings.class);
+        }
+        catch (IOException ex) {
+            throw new RuntimeException("Failed to load " + dir, ex);
+        }
+    }
+}
@@ -7,20 +7,27 @@ import gnu.trove.list.TIntList;
 import gnu.trove.list.array.TIntArrayList;
 import gnu.trove.set.hash.TIntHashSet;
 import lombok.SneakyThrows;
+import nu.marginalia.util.ranking.BetterReversePageRank;
+import nu.marginalia.util.ranking.BetterStandardPageRank;
+import nu.marginalia.util.ranking.BuggyStandardPageRank;
 import nu.marginalia.wmsa.configuration.module.DatabaseModule;
-import nu.marginalia.wmsa.edge.index.service.util.ranking.*;
+import nu.marginalia.wmsa.edge.index.model.RankingSettings;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 @Singleton
 public class SearchIndexDao {
     private final HikariDataSource dataSource;
+    private RankingSettings rankingSettings;
     private final Logger logger = LoggerFactory.getLogger(getClass());
 
     @Inject
-    public SearchIndexDao(HikariDataSource dataSource)
+    public SearchIndexDao(HikariDataSource dataSource,
+                          RankingSettings rankingSettings)
     {
         this.dataSource = dataSource;
+        this.rankingSettings = rankingSettings;
+        logger.info("SearchIndexDao ranking settings = {}", rankingSettings);
     }
 
     @SneakyThrows
@@ -71,14 +78,14 @@ public class SearchIndexDao {
     }
 
     @SneakyThrows
-    public TIntList getDomainsByRealPageRank() {
-        var spr = new BetterStandardPageRank(dataSource,"www.rep.routledge.com", "www.personal.kent.edu", "xroads.virginia.edu", "classics.mit.edu", "faculty.washington.edu", "monadnock.net", "memex.marginalia.nu", "wiki.xxiivv.com", "bikobatanari.art", "sadgrl.online", "lileks.com");
+    public TIntList getRetroDomains() {
+        var spr = new BetterStandardPageRank(dataSource,rankingSettings.retro.toArray(String[]::new));
         return spr.pageRankWithPeripheralNodes(spr.size()/2, false);
     }
 
     @SneakyThrows
     public TIntList getSmallWebDomains() {
-        var rpr = new BetterReversePageRank(new DatabaseModule().provideConnection(), "bikobatanari.art", "sadgrl.online", "wiki.xxiivv.com", "%neocities.org");
+        var rpr = new BetterReversePageRank(new DatabaseModule().provideConnection(), rankingSettings.small.toArray(String[]::new));
 
         rpr.setMaxKnownUrls(750);
 
@@ -87,13 +94,13 @@
 
     @SneakyThrows
     public TIntList getAcademiaDomains() {
-        var spr = new BetterStandardPageRank(new DatabaseModule().provideConnection(), "%edu");
+        var spr = new BetterStandardPageRank(new DatabaseModule().provideConnection(), rankingSettings.academia.toArray(String[]::new));
         return spr.pageRankWithPeripheralNodes(spr.size()/2, false);
     }
 
     @SneakyThrows
-    public TIntList getDomainsByStandardPageRank() {
-        var spr = new BuggyStandardPageRank(dataSource,"memex.marginalia.nu");
+    public TIntList getStandardDomains() {
+        var spr = new BuggyStandardPageRank(dataSource,rankingSettings.standard.toArray(String[]::new));
         return spr.pageRankWithPeripheralNodes(spr.size()/2, false);
     }
 
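Taken together, these changes move the ranking seed domains out of the source and into RankingSettings. A rough usage sketch outside of Guice, using only constructors and methods visible in this diff (the class name RankingWiringSketch and the relative settings path are illustrative, not part of the commit):

    import nu.marginalia.wmsa.configuration.module.DatabaseModule;
    import nu.marginalia.wmsa.edge.index.model.RankingSettings;
    import nu.marginalia.wmsa.edge.index.service.SearchIndexDao;

    import java.nio.file.Path;

    public class RankingWiringSketch {
        public static void main(String[] args) {
            // Load the per-bucket seed domain lists from YAML (EdgeIndexModule does the same via WmsaHome).
            RankingSettings settings = RankingSettings.from(Path.of("conf/ranking-settings.yaml"));

            // The DAO now receives the settings instead of hardcoded domain arrays.
            var dao = new SearchIndexDao(new DatabaseModule().provideConnection(), settings);

            // Each getter seeds its PageRank variant with the corresponding configured list.
            var retroDomains = dao.getRetroDomains();       // BetterStandardPageRank over settings.retro
            var smallWebDomains = dao.getSmallWebDomains(); // BetterReversePageRank over settings.small
            var academiaDomains = dao.getAcademiaDomains(); // BetterStandardPageRank over settings.academia
            var standardDomains = dao.getStandardDomains(); // BuggyStandardPageRank over settings.standard

            System.out.println("retro=" + retroDomains.size() + ", small=" + smallWebDomains.size()
                    + ", academia=" + academiaDomains.size() + ", standard=" + standardDomains.size());
        }
    }

The same construction appears in IndexMergerMain below, which passes a blank new RankingSettings() rather than one loaded from disk.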
@@ -23,7 +23,7 @@ public class SearchIndexPartitioner {
 
     private SearchEngineRanking retroRanking = null;
     private SearchEngineRanking smallWebRanking = null;
-    private SearchEngineRanking prWebRanking = null;
+    private SearchEngineRanking standardRanking = null;
     private SearchEngineRanking specialDomainRanking = null;
     private SearchEngineRanking academiaRanking = null;
 
@@ -69,16 +69,16 @@
 
         logger.info("Fetching domains");
 
-        var retroDomains = dao.getDomainsByRealPageRank();
+        var retroDomains = dao.getRetroDomains();
         var smallWebDomains = dao.getSmallWebDomains();
         var academiaDomains = dao.getAcademiaDomains();
-        var prWebDomains = dao.getDomainsByStandardPageRank();
+        var standardDomains = dao.getStandardDomains();
        var specialDomains = dao.getSpecialDomains();
 
         logger.info("Got {} retro domains", retroDomains.size());
         logger.info("Got {} small domains", smallWebDomains.size());
         logger.info("Got {} academia domains", academiaDomains.size());
-        logger.info("Got {} corpo domains", prWebDomains.size());
+        logger.info("Got {} standard domains", standardDomains.size());
         logger.info("Got {} special domains", specialDomains.size());
 
         var lock = rwl.writeLock();
@@ -87,7 +87,7 @@
             retroRanking = new SearchEngineRanking(0, retroDomains, 0.2, 1);
             smallWebRanking = new SearchEngineRanking(2, smallWebDomains, 0.15);
             academiaRanking = new SearchEngineRanking(3, academiaDomains, 1);
-            prWebRanking = new SearchEngineRanking(4, prWebDomains, 0.2, 1);
+            standardRanking = new SearchEngineRanking(4, standardDomains, 0.2, 1);
             specialDomainRanking = new SearchEngineRanking(6, specialDomains, 1);
             logger.info("Finished building partitions table");
         }
@@ -112,7 +112,7 @@
             return true;
         if (academiaRanking.hasBucket(bucketId, domainId))
             return true;
-        if (prWebRanking.hasBucket(bucketId, domainId))
+        if (standardRanking.hasBucket(bucketId, domainId))
             return true;
         if (specialDomainRanking.hasBucket(bucketId, domainId))
             return true;
@@ -150,8 +150,8 @@
         if (academiaRanking != null && academiaRanking.ownsBucket(bucketId)) {
             return academiaRanking.translateId(id);
         }
-        if (prWebRanking != null && prWebRanking.ownsBucket(bucketId)) {
-            return prWebRanking.translateId(id);
+        if (standardRanking != null && standardRanking.ownsBucket(bucketId)) {
+            return standardRanking.translateId(id);
         }
         if (specialDomainRanking != null && specialDomainRanking.ownsBucket(bucketId)) {
             return specialDomainRanking.translateId(id);
@@ -6,6 +6,7 @@ import lombok.SneakyThrows;
 import nu.marginalia.wmsa.configuration.module.DatabaseModule;
 import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklist;
 import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl;
+import nu.marginalia.wmsa.edge.index.model.RankingSettings;
 import nu.marginalia.wmsa.edge.index.service.SearchIndexDao;
 import nu.marginalia.wmsa.edge.index.service.query.SearchIndexPartitioner;
 import org.mariadb.jdbc.Driver;
@@ -58,7 +59,7 @@ public class IndexMergerMain {
         }
 
         var hikari = new DatabaseModule().provideConnection();
-        var partitioner = new SearchIndexPartitioner(new SearchIndexDao(hikari));
+        var partitioner = new SearchIndexPartitioner(new SearchIndexDao(hikari, new RankingSettings()));
         var blacklist = new EdgeDomainBlacklistImpl(hikari);
 
         new IndexMergerMain(file1, file2, outputFile, partitioner, blacklist);
@@ -9,8 +9,8 @@ import nu.marginalia.wmsa.edge.crawler.domain.language.conf.LanguageModels;
 import nu.marginalia.wmsa.edge.crawler.domain.language.processing.model.WordRep;
 import nu.marginalia.wmsa.edge.crawler.domain.language.processing.model.WordSpan;
 import nu.marginalia.wmsa.edge.crawler.domain.language.processing.model.tag.WordSeparator;
-import nu.marginalia.wmsa.edge.index.service.util.ranking.BuggyReversePageRank;
-import nu.marginalia.wmsa.edge.index.service.util.ranking.BuggyStandardPageRank;
+import nu.marginalia.util.ranking.BuggyReversePageRank;
+import nu.marginalia.util.ranking.BuggyStandardPageRank;
 import nu.marginalia.wmsa.edge.integration.wikipedia.WikipediaReader;
 import nu.marginalia.wmsa.edge.model.EdgeDomain;
 import nu.marginalia.wmsa.edge.model.EdgeUrl;
@@ -0,0 +1,49 @@
+package nu.marginalia.wmsa.edge.index.model;
+
+import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.List;
+
+import static org.junit.jupiter.api.Assertions.*;
+
+class RankingSettingsTest {
+
+    Path tempFile;
+    @BeforeEach
+    void setUp() throws IOException {
+        tempFile = Files.createTempFile(getClass().getSimpleName(), ".tmp");
+    }
+
+    @AfterEach
+    void tearDown() throws IOException {
+        Files.delete(tempFile);
+    }
+
+    @Test
+    void testParseRankingSettings() throws IOException {
+        Files.writeString(tempFile, """
+                retro:
+                - "www.rep.routledge.com"
+                - "www.personal.kent.edu"
+                small:
+                - "bikobatanari.art"
+                - "wiki.xxiivv.com"
+                academia:
+                - "%edu"
+                standard:
+                - "memex.marginalia.nu"
+                """);
+
+        var settings = RankingSettings.from(tempFile);
+        assertEquals(List.of("www.rep.routledge.com","www.personal.kent.edu"), settings.retro);
+        assertEquals(List.of("bikobatanari.art","wiki.xxiivv.com"), settings.small);
+        assertEquals(List.of("%edu"), settings.academia);
+        assertEquals(List.of("memex.marginalia.nu"), settings.standard);
+
+    }
+}