Extracted ranking algorithms to separate directory and made them configurable
This commit is contained in:
parent
1012de3135
commit
ccc5a07081
@ -1,4 +1,4 @@
|
|||||||
package nu.marginalia.wmsa.edge.index.service.util.ranking;
|
package nu.marginalia.util.ranking;
|
||||||
|
|
||||||
import com.zaxxer.hikari.HikariDataSource;
|
import com.zaxxer.hikari.HikariDataSource;
|
||||||
import gnu.trove.list.TIntList;
|
import gnu.trove.list.TIntList;
|
@ -1,4 +1,4 @@
|
|||||||
package nu.marginalia.wmsa.edge.index.service.util.ranking;
|
package nu.marginalia.util.ranking;
|
||||||
|
|
||||||
|
|
||||||
import com.zaxxer.hikari.HikariDataSource;
|
import com.zaxxer.hikari.HikariDataSource;
|
@ -1,8 +1,7 @@
|
|||||||
package nu.marginalia.wmsa.edge.index.service.util.ranking;
|
package nu.marginalia.util.ranking;
|
||||||
|
|
||||||
|
|
||||||
import com.zaxxer.hikari.HikariDataSource;
|
import com.zaxxer.hikari.HikariDataSource;
|
||||||
import gnu.trove.list.array.TIntArrayList;
|
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
|
@ -1,4 +1,4 @@
|
|||||||
package nu.marginalia.wmsa.edge.index.service.util.ranking;
|
package nu.marginalia.util.ranking;
|
||||||
|
|
||||||
|
|
||||||
import com.zaxxer.hikari.HikariDataSource;
|
import com.zaxxer.hikari.HikariDataSource;
|
@ -1,4 +1,4 @@
|
|||||||
package nu.marginalia.wmsa.edge.index.service.util.ranking;
|
package nu.marginalia.util.ranking;
|
||||||
|
|
||||||
|
|
||||||
import com.zaxxer.hikari.HikariDataSource;
|
import com.zaxxer.hikari.HikariDataSource;
|
@ -1,4 +1,4 @@
|
|||||||
package nu.marginalia.wmsa.edge.index.service.util.ranking;
|
package nu.marginalia.util.ranking;
|
||||||
|
|
||||||
import com.zaxxer.hikari.HikariDataSource;
|
import com.zaxxer.hikari.HikariDataSource;
|
||||||
import gnu.trove.list.TIntList;
|
import gnu.trove.list.TIntList;
|
@ -1,4 +1,4 @@
|
|||||||
package nu.marginalia.wmsa.edge.index.service.util.ranking.old;
|
package nu.marginalia.util.ranking.old;
|
||||||
|
|
||||||
|
|
||||||
import com.zaxxer.hikari.HikariDataSource;
|
import com.zaxxer.hikari.HikariDataSource;
|
@ -1,4 +1,4 @@
|
|||||||
package nu.marginalia.wmsa.edge.index.service.util.ranking.old;
|
package nu.marginalia.util.ranking.old;
|
||||||
|
|
||||||
|
|
||||||
import com.zaxxer.hikari.HikariDataSource;
|
import com.zaxxer.hikari.HikariDataSource;
|
@ -1,4 +1,4 @@
|
|||||||
package nu.marginalia.wmsa.edge.index.service.util.ranking.tool;
|
package nu.marginalia.util.ranking.tool;
|
||||||
|
|
||||||
import lombok.AllArgsConstructor;
|
import lombok.AllArgsConstructor;
|
||||||
import lombok.Getter;
|
import lombok.Getter;
|
@ -1,10 +1,9 @@
|
|||||||
package nu.marginalia.wmsa.edge.index.service.util.ranking.tool;
|
package nu.marginalia.util.ranking.tool;
|
||||||
|
|
||||||
|
|
||||||
import com.zaxxer.hikari.HikariDataSource;
|
import com.zaxxer.hikari.HikariDataSource;
|
||||||
import gnu.trove.list.TIntList;
|
import gnu.trove.list.TIntList;
|
||||||
import gnu.trove.list.array.TIntArrayList;
|
import gnu.trove.list.array.TIntArrayList;
|
||||||
import gnu.trove.map.hash.TIntDoubleHashMap;
|
|
||||||
import gnu.trove.map.hash.TIntIntHashMap;
|
import gnu.trove.map.hash.TIntIntHashMap;
|
||||||
import gnu.trove.map.hash.TIntObjectHashMap;
|
import gnu.trove.map.hash.TIntObjectHashMap;
|
||||||
import gnu.trove.set.hash.TIntHashSet;
|
import gnu.trove.set.hash.TIntHashSet;
|
||||||
@ -13,9 +12,9 @@ import it.unimi.dsi.fastutil.ints.IntComparator;
|
|||||||
import lombok.AllArgsConstructor;
|
import lombok.AllArgsConstructor;
|
||||||
import lombok.Data;
|
import lombok.Data;
|
||||||
import lombok.SneakyThrows;
|
import lombok.SneakyThrows;
|
||||||
|
import nu.marginalia.util.ranking.RankingAlgorithm;
|
||||||
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
|
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
|
||||||
import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl;
|
import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl;
|
||||||
import nu.marginalia.wmsa.edge.index.service.util.ranking.RankingAlgorithm;
|
|
||||||
import org.jetbrains.annotations.NotNull;
|
import org.jetbrains.annotations.NotNull;
|
||||||
import org.mariadb.jdbc.Driver;
|
import org.mariadb.jdbc.Driver;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
@ -1,8 +1,8 @@
|
|||||||
package nu.marginalia.wmsa.edge.index.service.util.ranking.tool;
|
package nu.marginalia.util.ranking.tool;
|
||||||
|
|
||||||
import lombok.SneakyThrows;
|
import lombok.SneakyThrows;
|
||||||
|
import nu.marginalia.util.ranking.AcademiaRank;
|
||||||
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
|
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
|
||||||
import nu.marginalia.wmsa.edge.index.service.util.ranking.AcademiaRank;
|
|
||||||
import org.mariadb.jdbc.Driver;
|
import org.mariadb.jdbc.Driver;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
@ -1,9 +1,9 @@
|
|||||||
package nu.marginalia.wmsa.edge.index.service.util.ranking.tool;
|
package nu.marginalia.util.ranking.tool;
|
||||||
|
|
||||||
import com.zaxxer.hikari.HikariDataSource;
|
import com.zaxxer.hikari.HikariDataSource;
|
||||||
import lombok.SneakyThrows;
|
import lombok.SneakyThrows;
|
||||||
|
import nu.marginalia.util.ranking.BuggyStandardPageRank;
|
||||||
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
|
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
|
||||||
import nu.marginalia.wmsa.edge.index.service.util.ranking.BuggyStandardPageRank;
|
|
||||||
import org.mariadb.jdbc.Driver;
|
import org.mariadb.jdbc.Driver;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
@ -1,9 +1,9 @@
|
|||||||
package nu.marginalia.wmsa.edge.index.service.util.ranking.tool;
|
package nu.marginalia.util.ranking.tool;
|
||||||
|
|
||||||
import com.zaxxer.hikari.HikariDataSource;
|
import com.zaxxer.hikari.HikariDataSource;
|
||||||
import lombok.SneakyThrows;
|
import lombok.SneakyThrows;
|
||||||
|
import nu.marginalia.util.ranking.BetterReversePageRank;
|
||||||
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
|
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
|
||||||
import nu.marginalia.wmsa.edge.index.service.util.ranking.BetterReversePageRank;
|
|
||||||
import org.mariadb.jdbc.Driver;
|
import org.mariadb.jdbc.Driver;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
@ -10,7 +10,7 @@ import nu.marginalia.wmsa.configuration.module.DatabaseModule;
|
|||||||
import nu.marginalia.wmsa.edge.crawler.fetcher.HttpFetcher;
|
import nu.marginalia.wmsa.edge.crawler.fetcher.HttpFetcher;
|
||||||
import nu.marginalia.wmsa.edge.data.dao.EdgeDataStoreDaoImpl;
|
import nu.marginalia.wmsa.edge.data.dao.EdgeDataStoreDaoImpl;
|
||||||
import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl;
|
import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl;
|
||||||
import nu.marginalia.wmsa.edge.index.service.util.ranking.BuggyStandardPageRank;
|
import nu.marginalia.util.ranking.BuggyStandardPageRank;
|
||||||
import nu.marginalia.wmsa.edge.model.EdgeDomain;
|
import nu.marginalia.wmsa.edge.model.EdgeDomain;
|
||||||
import nu.marginalia.wmsa.edge.model.EdgeUrl;
|
import nu.marginalia.wmsa.edge.model.EdgeUrl;
|
||||||
import org.jsoup.Jsoup;
|
import org.jsoup.Jsoup;
|
||||||
|
@ -6,11 +6,10 @@ import com.google.common.hash.Hashing;
|
|||||||
import com.google.gson.Gson;
|
import com.google.gson.Gson;
|
||||||
import com.google.gson.GsonBuilder;
|
import com.google.gson.GsonBuilder;
|
||||||
import com.zaxxer.hikari.HikariDataSource;
|
import com.zaxxer.hikari.HikariDataSource;
|
||||||
import gnu.trove.list.array.TIntArrayList;
|
|
||||||
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
|
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
|
||||||
import nu.marginalia.wmsa.edge.crawling.model.CrawlingSpecification;
|
import nu.marginalia.wmsa.edge.crawling.model.CrawlingSpecification;
|
||||||
import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl;
|
import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl;
|
||||||
import nu.marginalia.wmsa.edge.index.service.util.ranking.BetterReversePageRank;
|
import nu.marginalia.util.ranking.BetterReversePageRank;
|
||||||
import nu.marginalia.wmsa.edge.model.EdgeDomain;
|
import nu.marginalia.wmsa.edge.model.EdgeDomain;
|
||||||
import nu.marginalia.wmsa.edge.model.EdgeId;
|
import nu.marginalia.wmsa.edge.model.EdgeId;
|
||||||
import org.mariadb.jdbc.Driver;
|
import org.mariadb.jdbc.Driver;
|
||||||
@ -24,7 +23,6 @@ import java.sql.Connection;
|
|||||||
import java.sql.ResultSet;
|
import java.sql.ResultSet;
|
||||||
import java.sql.SQLException;
|
import java.sql.SQLException;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
import java.util.stream.Stream;
|
|
||||||
|
|
||||||
public class CrawlJobExtractorPageRankMain {
|
public class CrawlJobExtractorPageRankMain {
|
||||||
|
|
||||||
|
@ -1,12 +1,28 @@
|
|||||||
package nu.marginalia.wmsa.edge.index;
|
package nu.marginalia.wmsa.edge.index;
|
||||||
|
|
||||||
import com.google.inject.AbstractModule;
|
import com.google.inject.AbstractModule;
|
||||||
|
import com.google.inject.Provides;
|
||||||
import com.google.inject.name.Names;
|
import com.google.inject.name.Names;
|
||||||
|
import nu.marginalia.wmsa.configuration.WmsaHome;
|
||||||
|
import nu.marginalia.wmsa.edge.index.model.RankingSettings;
|
||||||
|
import org.yaml.snakeyaml.Yaml;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.nio.file.Files;
|
||||||
|
import java.nio.file.Path;
|
||||||
|
|
||||||
public class EdgeIndexModule extends AbstractModule {
|
public class EdgeIndexModule extends AbstractModule {
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
public void configure() {
|
public void configure() {
|
||||||
bind(Long.class).annotatedWith(Names.named("edge-dictionary-hash-map-size")).toInstance(1L << 31);
|
bind(Long.class).annotatedWith(Names.named("edge-dictionary-hash-map-size")).toInstance(1L << 31);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Provides
|
||||||
|
public RankingSettings rankingSettings() {
|
||||||
|
Path dir = WmsaHome.get().resolve("conf/ranking-settings.yaml");
|
||||||
|
return RankingSettings.from(dir);
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -0,0 +1,28 @@
|
|||||||
|
package nu.marginalia.wmsa.edge.index.model;
|
||||||
|
|
||||||
|
import lombok.ToString;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
import org.yaml.snakeyaml.Yaml;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.nio.file.Files;
|
||||||
|
import java.nio.file.Path;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
@ToString
|
||||||
|
public class RankingSettings {
|
||||||
|
public List<String> small;
|
||||||
|
public List<String> retro;
|
||||||
|
public List<String> standard;
|
||||||
|
public List<String> academia;
|
||||||
|
|
||||||
|
public static RankingSettings from(Path dir) {
|
||||||
|
try {
|
||||||
|
return new Yaml().loadAs(Files.readString(dir), RankingSettings.class);
|
||||||
|
}
|
||||||
|
catch (IOException ex) {
|
||||||
|
throw new RuntimeException("Failed to load " + dir, ex);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
@ -7,20 +7,27 @@ import gnu.trove.list.TIntList;
|
|||||||
import gnu.trove.list.array.TIntArrayList;
|
import gnu.trove.list.array.TIntArrayList;
|
||||||
import gnu.trove.set.hash.TIntHashSet;
|
import gnu.trove.set.hash.TIntHashSet;
|
||||||
import lombok.SneakyThrows;
|
import lombok.SneakyThrows;
|
||||||
|
import nu.marginalia.util.ranking.BetterReversePageRank;
|
||||||
|
import nu.marginalia.util.ranking.BetterStandardPageRank;
|
||||||
|
import nu.marginalia.util.ranking.BuggyStandardPageRank;
|
||||||
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
|
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
|
||||||
import nu.marginalia.wmsa.edge.index.service.util.ranking.*;
|
import nu.marginalia.wmsa.edge.index.model.RankingSettings;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
@Singleton
|
@Singleton
|
||||||
public class SearchIndexDao {
|
public class SearchIndexDao {
|
||||||
private final HikariDataSource dataSource;
|
private final HikariDataSource dataSource;
|
||||||
|
private RankingSettings rankingSettings;
|
||||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||||
|
|
||||||
@Inject
|
@Inject
|
||||||
public SearchIndexDao(HikariDataSource dataSource)
|
public SearchIndexDao(HikariDataSource dataSource,
|
||||||
|
RankingSettings rankingSettings)
|
||||||
{
|
{
|
||||||
this.dataSource = dataSource;
|
this.dataSource = dataSource;
|
||||||
|
this.rankingSettings = rankingSettings;
|
||||||
|
logger.info("SearchIndexDao ranking settings = {}", rankingSettings);
|
||||||
}
|
}
|
||||||
|
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
@ -71,14 +78,14 @@ public class SearchIndexDao {
|
|||||||
}
|
}
|
||||||
|
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
public TIntList getDomainsByRealPageRank() {
|
public TIntList getRetroDomains() {
|
||||||
var spr = new BetterStandardPageRank(dataSource,"www.rep.routledge.com", "www.personal.kent.edu", "xroads.virginia.edu", "classics.mit.edu", "faculty.washington.edu", "monadnock.net", "memex.marginalia.nu", "wiki.xxiivv.com", "bikobatanari.art", "sadgrl.online", "lileks.com");
|
var spr = new BetterStandardPageRank(dataSource,rankingSettings.retro.toArray(String[]::new));
|
||||||
return spr.pageRankWithPeripheralNodes(spr.size()/2, false);
|
return spr.pageRankWithPeripheralNodes(spr.size()/2, false);
|
||||||
}
|
}
|
||||||
|
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
public TIntList getSmallWebDomains() {
|
public TIntList getSmallWebDomains() {
|
||||||
var rpr = new BetterReversePageRank(new DatabaseModule().provideConnection(), "bikobatanari.art", "sadgrl.online", "wiki.xxiivv.com", "%neocities.org");
|
var rpr = new BetterReversePageRank(new DatabaseModule().provideConnection(), rankingSettings.small.toArray(String[]::new));
|
||||||
|
|
||||||
rpr.setMaxKnownUrls(750);
|
rpr.setMaxKnownUrls(750);
|
||||||
|
|
||||||
@ -87,13 +94,13 @@ public class SearchIndexDao {
|
|||||||
|
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
public TIntList getAcademiaDomains() {
|
public TIntList getAcademiaDomains() {
|
||||||
var spr = new BetterStandardPageRank(new DatabaseModule().provideConnection(), "%edu");
|
var spr = new BetterStandardPageRank(new DatabaseModule().provideConnection(), rankingSettings.academia.toArray(String[]::new));
|
||||||
return spr.pageRankWithPeripheralNodes(spr.size()/2, false);
|
return spr.pageRankWithPeripheralNodes(spr.size()/2, false);
|
||||||
}
|
}
|
||||||
|
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
public TIntList getDomainsByStandardPageRank() {
|
public TIntList getStandardDomains() {
|
||||||
var spr = new BuggyStandardPageRank(dataSource,"memex.marginalia.nu");
|
var spr = new BuggyStandardPageRank(dataSource,rankingSettings.standard.toArray(String[]::new));
|
||||||
return spr.pageRankWithPeripheralNodes(spr.size()/2, false);
|
return spr.pageRankWithPeripheralNodes(spr.size()/2, false);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -23,7 +23,7 @@ public class SearchIndexPartitioner {
|
|||||||
|
|
||||||
private SearchEngineRanking retroRanking = null;
|
private SearchEngineRanking retroRanking = null;
|
||||||
private SearchEngineRanking smallWebRanking = null;
|
private SearchEngineRanking smallWebRanking = null;
|
||||||
private SearchEngineRanking prWebRanking = null;
|
private SearchEngineRanking standardRanking = null;
|
||||||
private SearchEngineRanking specialDomainRanking = null;
|
private SearchEngineRanking specialDomainRanking = null;
|
||||||
private SearchEngineRanking academiaRanking = null;
|
private SearchEngineRanking academiaRanking = null;
|
||||||
|
|
||||||
@ -69,16 +69,16 @@ public class SearchIndexPartitioner {
|
|||||||
|
|
||||||
logger.info("Fetching domains");
|
logger.info("Fetching domains");
|
||||||
|
|
||||||
var retroDomains = dao.getDomainsByRealPageRank();
|
var retroDomains = dao.getRetroDomains();
|
||||||
var smallWebDomains = dao.getSmallWebDomains();
|
var smallWebDomains = dao.getSmallWebDomains();
|
||||||
var academiaDomains = dao.getAcademiaDomains();
|
var academiaDomains = dao.getAcademiaDomains();
|
||||||
var prWebDomains = dao.getDomainsByStandardPageRank();
|
var standardDomains = dao.getStandardDomains();
|
||||||
var specialDomains = dao.getSpecialDomains();
|
var specialDomains = dao.getSpecialDomains();
|
||||||
|
|
||||||
logger.info("Got {} retro domains", retroDomains.size());
|
logger.info("Got {} retro domains", retroDomains.size());
|
||||||
logger.info("Got {} small domains", smallWebDomains.size());
|
logger.info("Got {} small domains", smallWebDomains.size());
|
||||||
logger.info("Got {} academia domains", academiaDomains.size());
|
logger.info("Got {} academia domains", academiaDomains.size());
|
||||||
logger.info("Got {} corpo domains", prWebDomains.size());
|
logger.info("Got {} standard domains", standardDomains.size());
|
||||||
logger.info("Got {} special domains", specialDomains.size());
|
logger.info("Got {} special domains", specialDomains.size());
|
||||||
|
|
||||||
var lock = rwl.writeLock();
|
var lock = rwl.writeLock();
|
||||||
@ -87,7 +87,7 @@ public class SearchIndexPartitioner {
|
|||||||
retroRanking = new SearchEngineRanking(0, retroDomains, 0.2, 1);
|
retroRanking = new SearchEngineRanking(0, retroDomains, 0.2, 1);
|
||||||
smallWebRanking = new SearchEngineRanking(2, smallWebDomains, 0.15);
|
smallWebRanking = new SearchEngineRanking(2, smallWebDomains, 0.15);
|
||||||
academiaRanking = new SearchEngineRanking(3, academiaDomains, 1);
|
academiaRanking = new SearchEngineRanking(3, academiaDomains, 1);
|
||||||
prWebRanking = new SearchEngineRanking(4, prWebDomains, 0.2, 1);
|
standardRanking = new SearchEngineRanking(4, standardDomains, 0.2, 1);
|
||||||
specialDomainRanking = new SearchEngineRanking(6, specialDomains, 1);
|
specialDomainRanking = new SearchEngineRanking(6, specialDomains, 1);
|
||||||
logger.info("Finished building partitions table");
|
logger.info("Finished building partitions table");
|
||||||
}
|
}
|
||||||
@ -112,7 +112,7 @@ public class SearchIndexPartitioner {
|
|||||||
return true;
|
return true;
|
||||||
if (academiaRanking.hasBucket(bucketId, domainId))
|
if (academiaRanking.hasBucket(bucketId, domainId))
|
||||||
return true;
|
return true;
|
||||||
if (prWebRanking.hasBucket(bucketId, domainId))
|
if (standardRanking.hasBucket(bucketId, domainId))
|
||||||
return true;
|
return true;
|
||||||
if (specialDomainRanking.hasBucket(bucketId, domainId))
|
if (specialDomainRanking.hasBucket(bucketId, domainId))
|
||||||
return true;
|
return true;
|
||||||
@ -150,8 +150,8 @@ public class SearchIndexPartitioner {
|
|||||||
if (academiaRanking != null && academiaRanking.ownsBucket(bucketId)) {
|
if (academiaRanking != null && academiaRanking.ownsBucket(bucketId)) {
|
||||||
return academiaRanking.translateId(id);
|
return academiaRanking.translateId(id);
|
||||||
}
|
}
|
||||||
if (prWebRanking != null && prWebRanking.ownsBucket(bucketId)) {
|
if (standardRanking != null && standardRanking.ownsBucket(bucketId)) {
|
||||||
return prWebRanking.translateId(id);
|
return standardRanking.translateId(id);
|
||||||
}
|
}
|
||||||
if (specialDomainRanking != null && specialDomainRanking.ownsBucket(bucketId)) {
|
if (specialDomainRanking != null && specialDomainRanking.ownsBucket(bucketId)) {
|
||||||
return specialDomainRanking.translateId(id);
|
return specialDomainRanking.translateId(id);
|
||||||
|
@ -6,6 +6,7 @@ import lombok.SneakyThrows;
|
|||||||
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
|
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
|
||||||
import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklist;
|
import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklist;
|
||||||
import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl;
|
import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl;
|
||||||
|
import nu.marginalia.wmsa.edge.index.model.RankingSettings;
|
||||||
import nu.marginalia.wmsa.edge.index.service.SearchIndexDao;
|
import nu.marginalia.wmsa.edge.index.service.SearchIndexDao;
|
||||||
import nu.marginalia.wmsa.edge.index.service.query.SearchIndexPartitioner;
|
import nu.marginalia.wmsa.edge.index.service.query.SearchIndexPartitioner;
|
||||||
import org.mariadb.jdbc.Driver;
|
import org.mariadb.jdbc.Driver;
|
||||||
@ -58,7 +59,7 @@ public class IndexMergerMain {
|
|||||||
}
|
}
|
||||||
|
|
||||||
var hikari = new DatabaseModule().provideConnection();
|
var hikari = new DatabaseModule().provideConnection();
|
||||||
var partitioner = new SearchIndexPartitioner(new SearchIndexDao(hikari));
|
var partitioner = new SearchIndexPartitioner(new SearchIndexDao(hikari, new RankingSettings()));
|
||||||
var blacklist = new EdgeDomainBlacklistImpl(hikari);
|
var blacklist = new EdgeDomainBlacklistImpl(hikari);
|
||||||
|
|
||||||
new IndexMergerMain(file1, file2, outputFile, partitioner, blacklist);
|
new IndexMergerMain(file1, file2, outputFile, partitioner, blacklist);
|
||||||
|
@ -9,8 +9,8 @@ import nu.marginalia.wmsa.edge.crawler.domain.language.conf.LanguageModels;
|
|||||||
import nu.marginalia.wmsa.edge.crawler.domain.language.processing.model.WordRep;
|
import nu.marginalia.wmsa.edge.crawler.domain.language.processing.model.WordRep;
|
||||||
import nu.marginalia.wmsa.edge.crawler.domain.language.processing.model.WordSpan;
|
import nu.marginalia.wmsa.edge.crawler.domain.language.processing.model.WordSpan;
|
||||||
import nu.marginalia.wmsa.edge.crawler.domain.language.processing.model.tag.WordSeparator;
|
import nu.marginalia.wmsa.edge.crawler.domain.language.processing.model.tag.WordSeparator;
|
||||||
import nu.marginalia.wmsa.edge.index.service.util.ranking.BuggyReversePageRank;
|
import nu.marginalia.util.ranking.BuggyReversePageRank;
|
||||||
import nu.marginalia.wmsa.edge.index.service.util.ranking.BuggyStandardPageRank;
|
import nu.marginalia.util.ranking.BuggyStandardPageRank;
|
||||||
import nu.marginalia.wmsa.edge.integration.wikipedia.WikipediaReader;
|
import nu.marginalia.wmsa.edge.integration.wikipedia.WikipediaReader;
|
||||||
import nu.marginalia.wmsa.edge.model.EdgeDomain;
|
import nu.marginalia.wmsa.edge.model.EdgeDomain;
|
||||||
import nu.marginalia.wmsa.edge.model.EdgeUrl;
|
import nu.marginalia.wmsa.edge.model.EdgeUrl;
|
||||||
|
@ -0,0 +1,49 @@
|
|||||||
|
package nu.marginalia.wmsa.edge.index.model;
|
||||||
|
|
||||||
|
import org.junit.jupiter.api.AfterEach;
|
||||||
|
import org.junit.jupiter.api.BeforeEach;
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.nio.file.Files;
|
||||||
|
import java.nio.file.Path;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import static org.junit.jupiter.api.Assertions.*;
|
||||||
|
|
||||||
|
class RankingSettingsTest {
|
||||||
|
|
||||||
|
Path tempFile;
|
||||||
|
@BeforeEach
|
||||||
|
void setUp() throws IOException {
|
||||||
|
tempFile = Files.createTempFile(getClass().getSimpleName(), ".tmp");
|
||||||
|
}
|
||||||
|
|
||||||
|
@AfterEach
|
||||||
|
void tearDown() throws IOException {
|
||||||
|
Files.delete(tempFile);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void testParseRankingSettings() throws IOException {
|
||||||
|
Files.writeString(tempFile, """
|
||||||
|
retro:
|
||||||
|
- "www.rep.routledge.com"
|
||||||
|
- "www.personal.kent.edu"
|
||||||
|
small:
|
||||||
|
- "bikobatanari.art"
|
||||||
|
- "wiki.xxiivv.com"
|
||||||
|
academia:
|
||||||
|
- "%edu"
|
||||||
|
standard:
|
||||||
|
- "memex.marginalia.nu"
|
||||||
|
""");
|
||||||
|
|
||||||
|
var settings = RankingSettings.from(tempFile);
|
||||||
|
assertEquals(List.of("www.rep.routledge.com","www.personal.kent.edu"), settings.retro);
|
||||||
|
assertEquals(List.of("bikobatanari.art","wiki.xxiivv.com"), settings.small);
|
||||||
|
assertEquals(List.of("%edu"), settings.academia);
|
||||||
|
assertEquals(List.of("memex.marginalia.nu"), settings.standard);
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue
Block a user