From e963ecb4ae85da69788f961bace4899fc8481291 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Tue, 7 Feb 2023 22:13:25 +0100 Subject: [PATCH] Modified the ranking algorithm to be able to pagerank with similarity data instead of the link graph. --- .../edge/index/ranking/RankingAlgorithm.java | 10 ++- .../edge/index/ranking/ReversePageRank.java | 2 + .../edge/index/ranking/StandardPageRank.java | 2 + .../ranking/{ => data}/RankingDomainData.java | 4 +- .../{ => data}/RankingDomainFetcher.java | 23 ++++-- ...RankingDomainFetcherForSimilarityData.java | 63 ++++++++++++++++ .../tool/CreateBrowseDomainRanksTool.java | 71 +++++++++++++++++++ .../index/ranking/tool/PerusePageRankV2.java | 4 +- .../ranking/tool/PrintDomainRanksTool.java | 67 +++++++++++++++++ ...sTool2.java => UpdateDomainRanksTool.java} | 12 ++-- .../index/svc/EdgeIndexSearchSetsService.java | 2 +- 11 files changed, 240 insertions(+), 20 deletions(-) rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/{ => data}/RankingDomainData.java (87%) rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/{ => data}/RankingDomainFetcher.java (84%) create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/data/RankingDomainFetcherForSimilarityData.java create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/tool/CreateBrowseDomainRanksTool.java create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/tool/PrintDomainRanksTool.java rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/tool/{UpdateDomainRanksTool2.java => UpdateDomainRanksTool.java} (85%) diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/RankingAlgorithm.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/RankingAlgorithm.java index 94a89c15..2e8589e4 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/RankingAlgorithm.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/RankingAlgorithm.java @@ -5,6 +5,8 @@ import gnu.trove.map.hash.TIntIntHashMap; import gnu.trove.map.hash.TIntObjectHashMap; import it.unimi.dsi.fastutil.ints.IntArrays; import nu.marginalia.wmsa.edge.index.ranking.accumulator.RankingResultAccumulator; +import nu.marginalia.wmsa.edge.index.ranking.data.RankingDomainData; +import nu.marginalia.wmsa.edge.index.ranking.data.RankingDomainFetcher; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -83,6 +85,10 @@ public abstract class RankingAlgorithm { logger.info("Origin Domains: {}", originDomainIds.size()); } + public RankingDomainData getDomainData(int id) { + return domainsById.get(id); + } + public void addPeripheralNodes() { int newNodesIdxCutoff = domainIdToIndex.size(); @@ -200,9 +206,9 @@ public abstract class RankingAlgorithm { public void setMaxKnownUrls(int maxKnownUrls) { this.maxKnownUrls = maxKnownUrls; } - public class RankVector { private final double[] rank; + public RankVector(double defaultValue) { rank = new double[domainIndexToId.size()]; if (defaultValue != 0.) { @@ -259,7 +265,6 @@ public abstract class RankingAlgorithm { return accumulator; } - private static int[] sortOrder(double[] values) { int[] ret = new int[values.length]; @@ -268,6 +273,7 @@ public abstract class RankingAlgorithm { return ret; } + } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/ReversePageRank.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/ReversePageRank.java index bb51ca77..0c202958 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/ReversePageRank.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/ReversePageRank.java @@ -1,6 +1,8 @@ package nu.marginalia.wmsa.edge.index.ranking; +import nu.marginalia.wmsa.edge.index.ranking.data.RankingDomainFetcher; + public class ReversePageRank extends RankingAlgorithm { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/StandardPageRank.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/StandardPageRank.java index 2319f299..d9302fd6 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/StandardPageRank.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/StandardPageRank.java @@ -1,6 +1,8 @@ package nu.marginalia.wmsa.edge.index.ranking; +import nu.marginalia.wmsa.edge.index.ranking.data.RankingDomainFetcher; + public class StandardPageRank extends RankingAlgorithm { public StandardPageRank(RankingDomainFetcher domains, String... origins) { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/RankingDomainData.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/data/RankingDomainData.java similarity index 87% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/RankingDomainData.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/data/RankingDomainData.java index d72da886..4a59daf4 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/RankingDomainData.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/data/RankingDomainData.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.index.ranking; +package nu.marginalia.wmsa.edge.index.ranking.data; import lombok.AllArgsConstructor; import lombok.Data; @@ -10,7 +10,7 @@ public class RankingDomainData { public final int id; public final String name; private int alias; - private EdgeDomainIndexingState state; + public EdgeDomainIndexingState state; public final int knownUrls; public int resolveAlias() { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/RankingDomainFetcher.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/data/RankingDomainFetcher.java similarity index 84% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/RankingDomainFetcher.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/data/RankingDomainFetcher.java index 70be6c15..397d9fb5 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/RankingDomainFetcher.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/data/RankingDomainFetcher.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.index.ranking; +package nu.marginalia.wmsa.edge.index.ranking.data; import com.google.inject.Inject; import com.zaxxer.hikari.HikariDataSource; @@ -12,11 +12,11 @@ import java.util.function.Consumer; import java.util.function.IntConsumer; public class RankingDomainFetcher { - private final HikariDataSource dataSource; - private final EdgeDomainBlacklistImpl blacklist; - private final Logger logger = LoggerFactory.getLogger(getClass()); + protected final HikariDataSource dataSource; + protected final EdgeDomainBlacklistImpl blacklist; + protected final Logger logger = LoggerFactory.getLogger(getClass()); - private final boolean getNames = false; + protected boolean getNames = false; @Inject public RankingDomainFetcher(HikariDataSource dataSource, EdgeDomainBlacklistImpl blacklist) { @@ -24,6 +24,10 @@ public class RankingDomainFetcher { this.blacklist = blacklist; } + public void retainNames() { + this.getNames = true; + } + public void getDomains(Consumer consumer) { String query; if (getNames) { @@ -49,14 +53,19 @@ public class RankingDomainFetcher { getDomains(query, consumer); } - private void getDomains(String query, Consumer consumer) { + protected void getDomains(String query, Consumer consumer) { try (var conn = dataSource.getConnection(); var stmt = conn.prepareStatement(query)) { stmt.setFetchSize(10000); var rsp = stmt.executeQuery(); while (rsp.next()) { int id = rsp.getInt(1); if (!blacklist.isBlacklisted(id)) { - consumer.accept(new RankingDomainData(id, rsp.getString(2), rsp.getInt(3), EdgeDomainIndexingState.valueOf(rsp.getString(4)), rsp.getInt(5))); + consumer.accept( + new RankingDomainData(id, + rsp.getString(2), + rsp.getInt(3), + EdgeDomainIndexingState.valueOf(rsp.getString(4)), + rsp.getInt(5))); } } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/data/RankingDomainFetcherForSimilarityData.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/data/RankingDomainFetcherForSimilarityData.java new file mode 100644 index 00000000..dbbede55 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/data/RankingDomainFetcherForSimilarityData.java @@ -0,0 +1,63 @@ +package nu.marginalia.wmsa.edge.index.ranking.data; + +import com.zaxxer.hikari.HikariDataSource; +import nu.marginalia.wmsa.edge.dbcommon.EdgeDomainBlacklistImpl; + +import java.sql.SQLException; +import java.util.function.Consumer; + +public class RankingDomainFetcherForSimilarityData extends RankingDomainFetcher { + public RankingDomainFetcherForSimilarityData(HikariDataSource dataSource, EdgeDomainBlacklistImpl blacklist) { + super(dataSource, blacklist); + } + + public void eachDomainLink(DomainLinkConsumer consumer) { + try (var conn = dataSource.getConnection(); + var stmt = conn.prepareStatement("SELECT DOMAIN_ID, NEIGHBOR_ID, RELATEDNESS FROM EC_DOMAIN_NEIGHBORS_2")) + { + stmt.setFetchSize(10000); + + var rsp = stmt.executeQuery(); + + while (rsp.next()) { + int src = rsp.getInt(1); + int dst = rsp.getInt(2); + + // these "links" are bidi + consumer.accept(src, dst); + consumer.accept(dst, src); + } + } + catch (SQLException ex) { + logger.error("Failed to fetch domain links", ex); + } + } + + public void getDomains(Consumer consumer) { +// String query = +// """ +// SELECT EC_DOMAIN.ID,DOMAIN_NAME,DOMAIN_ALIAS,STATE,COALESCE(KNOWN_URLS, 0) +// FROM EC_DOMAIN +// LEFT JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID +// INNER JOIN EC_DOMAIN_LINK ON DEST_DOMAIN_ID=EC_DOMAIN.ID +// WHERE SOURCE_DOMAIN_ID!=DEST_DOMAIN_ID +// GROUP BY EC_DOMAIN.ID +// HAVING COUNT(SOURCE_DOMAIN_ID)>5 +// """; + + String query = + """ + SELECT EC_DOMAIN.ID,DOMAIN_NAME,DOMAIN_ALIAS,STATE,COALESCE(KNOWN_URLS, 0) + FROM EC_DOMAIN + INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID + GROUP BY EC_DOMAIN.ID + """; + + getDomains(query, consumer); + } + + + public void getPeripheralDomains(Consumer consumer) { + } + +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/tool/CreateBrowseDomainRanksTool.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/tool/CreateBrowseDomainRanksTool.java new file mode 100644 index 00000000..f4cb6197 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/tool/CreateBrowseDomainRanksTool.java @@ -0,0 +1,71 @@ +package nu.marginalia.wmsa.edge.index.ranking.tool; + +import com.zaxxer.hikari.HikariDataSource; +import lombok.SneakyThrows; +import nu.marginalia.wmsa.configuration.module.DatabaseModule; +import nu.marginalia.wmsa.edge.dbcommon.EdgeDomainBlacklistImpl; +import nu.marginalia.wmsa.edge.index.ranking.StandardPageRank; +import nu.marginalia.wmsa.edge.index.ranking.accumulator.RankingResultListAccumulator; +import nu.marginalia.wmsa.edge.index.ranking.data.RankingDomainFetcherForSimilarityData; +import org.mariadb.jdbc.Driver; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.sql.SQLException; +import java.util.concurrent.LinkedBlockingQueue; + +public class CreateBrowseDomainRanksTool { + + private static final Logger logger = LoggerFactory.getLogger(CreateBrowseDomainRanksTool.class); + + + static final LinkedBlockingQueue uploadQueue = new LinkedBlockingQueue<>(10); + volatile static boolean running = true; + + @SneakyThrows + public static void main(String... args) { + Driver driver = new Driver(); + var conn = new DatabaseModule().provideConnection(); + + long start = System.currentTimeMillis(); + var uploader = new Thread(() -> uploadThread(conn), "Uploader"); + + logger.info("Ranking"); + var ds = new DatabaseModule().provideConnection(); + var domains = new RankingDomainFetcherForSimilarityData(ds, new EdgeDomainBlacklistImpl(ds)); + var rpr = new StandardPageRank(domains, args); + + uploader.start(); + + var rankData = rpr.pageRankWithPeripheralNodes(1000, RankingResultListAccumulator::new); + + rankData.forEach(i -> { + try { + uploadQueue.put(i); + } catch (InterruptedException e) { + e.printStackTrace(); + } + return true; + }); + + long end = System.currentTimeMillis(); + running = false; + uploader.join(); + + logger.info("Done in {}", (end - start)/1000.0); + } + + public static void uploadThread(HikariDataSource dataSource) { + try (var conn = dataSource.getConnection()) { + try (var stmt = conn.prepareStatement("INSERT IGNORE INTO EC_RANDOM_DOMAINS(DOMAIN_SET, DOMAIN_ID) VALUES (3, ?)")) { + while (running || (!running && !uploadQueue.isEmpty())) { + var job = uploadQueue.take(); + stmt.setInt(1, job); + stmt.executeUpdate(); + } + } + } catch (SQLException | InterruptedException throwables) { + throwables.printStackTrace(); + } + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/tool/PerusePageRankV2.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/tool/PerusePageRankV2.java index 409a92ad..4fbdd08b 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/tool/PerusePageRankV2.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/tool/PerusePageRankV2.java @@ -11,8 +11,8 @@ import it.unimi.dsi.fastutil.ints.IntComparator; import lombok.AllArgsConstructor; import lombok.SneakyThrows; import nu.marginalia.wmsa.edge.index.ranking.RankingAlgorithm; -import nu.marginalia.wmsa.edge.index.ranking.RankingDomainData; -import nu.marginalia.wmsa.edge.index.ranking.RankingDomainFetcher; +import nu.marginalia.wmsa.edge.index.ranking.data.RankingDomainData; +import nu.marginalia.wmsa.edge.index.ranking.data.RankingDomainFetcher; import nu.marginalia.wmsa.configuration.module.DatabaseModule; import nu.marginalia.wmsa.edge.dbcommon.EdgeDomainBlacklistImpl; import org.jetbrains.annotations.NotNull; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/tool/PrintDomainRanksTool.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/tool/PrintDomainRanksTool.java new file mode 100644 index 00000000..60f12008 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/tool/PrintDomainRanksTool.java @@ -0,0 +1,67 @@ +package nu.marginalia.wmsa.edge.index.ranking.tool; + +import lombok.SneakyThrows; +import nu.marginalia.wmsa.configuration.module.DatabaseModule; +import nu.marginalia.wmsa.edge.dbcommon.EdgeDomainBlacklistImpl; +import nu.marginalia.wmsa.edge.index.ranking.StandardPageRank; +import nu.marginalia.wmsa.edge.index.ranking.accumulator.RankingResultListAccumulator; +import nu.marginalia.wmsa.edge.index.ranking.data.RankingDomainFetcher; +import nu.marginalia.wmsa.edge.index.ranking.data.RankingDomainFetcherForSimilarityData; +import org.mariadb.jdbc.Driver; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.concurrent.LinkedBlockingQueue; +import java.util.concurrent.atomic.AtomicInteger; + +public class PrintDomainRanksTool { + + private static final Logger logger = LoggerFactory.getLogger(PrintDomainRanksTool.class); + + private volatile static int rankMax; + + static final LinkedBlockingQueue uploadQueue = new LinkedBlockingQueue<>(10); + volatile static boolean running = true; + + @SneakyThrows + public static void main(String... args) { + Driver driver = new Driver(); + var conn = new DatabaseModule().provideConnection(); + + long start = System.currentTimeMillis(); + + logger.info("Ranking"); + var ds = new DatabaseModule().provideConnection(); + + RankingDomainFetcher domains; + if (Boolean.getBoolean("use-link-data")) { + domains = new RankingDomainFetcher(ds, new EdgeDomainBlacklistImpl(ds)); + domains.retainNames(); + } + else { + domains = new RankingDomainFetcherForSimilarityData(ds, new EdgeDomainBlacklistImpl(ds)); + domains.retainNames(); + } + + var rpr = new StandardPageRank(domains, args); + + rankMax = rpr.size(); + + var rankData = rpr.pageRankWithPeripheralNodes(rankMax, RankingResultListAccumulator::new); + + AtomicInteger cnt = new AtomicInteger(); + rankData.forEach(i -> { + + var data = rpr.getDomainData(i); + + System.out.printf("%d %s %s\n", cnt.getAndIncrement(), data.name, data.state); + return true; + }); + + long end = System.currentTimeMillis(); + running = false; + + logger.info("Done in {}", (end - start)/1000.0); + } + +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/tool/UpdateDomainRanksTool2.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/tool/UpdateDomainRanksTool.java similarity index 85% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/tool/UpdateDomainRanksTool2.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/tool/UpdateDomainRanksTool.java index ccb8c15c..714e3028 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/tool/UpdateDomainRanksTool2.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/tool/UpdateDomainRanksTool.java @@ -2,11 +2,11 @@ package nu.marginalia.wmsa.edge.index.ranking.tool; import com.zaxxer.hikari.HikariDataSource; import lombok.SneakyThrows; -import nu.marginalia.wmsa.edge.index.ranking.ReversePageRank; -import nu.marginalia.wmsa.edge.index.ranking.RankingDomainFetcher; +import nu.marginalia.wmsa.edge.index.ranking.StandardPageRank; import nu.marginalia.wmsa.edge.index.ranking.accumulator.RankingResultListAccumulator; import nu.marginalia.wmsa.configuration.module.DatabaseModule; import nu.marginalia.wmsa.edge.dbcommon.EdgeDomainBlacklistImpl; +import nu.marginalia.wmsa.edge.index.ranking.data.RankingDomainFetcherForSimilarityData; import org.mariadb.jdbc.Driver; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -14,9 +14,9 @@ import org.slf4j.LoggerFactory; import java.sql.SQLException; import java.util.concurrent.LinkedBlockingQueue; -public class UpdateDomainRanksTool2 { +public class UpdateDomainRanksTool { - private static final Logger logger = LoggerFactory.getLogger(UpdateDomainRanksTool2.class); + private static final Logger logger = LoggerFactory.getLogger(UpdateDomainRanksTool.class); private volatile static int rankMax; @@ -33,8 +33,8 @@ public class UpdateDomainRanksTool2 { logger.info("Ranking"); var ds = new DatabaseModule().provideConnection(); - var domains = new RankingDomainFetcher(ds, new EdgeDomainBlacklistImpl(ds)); - var rpr = new ReversePageRank(domains, "memex.marginalia.nu", "bikobatanari.art", "sadgrl.online", "wiki.xxiivv.com", "%neocities.org"); + var domains = new RankingDomainFetcherForSimilarityData(ds, new EdgeDomainBlacklistImpl(ds)); + var rpr = new StandardPageRank(domains, "memex.marginalia.nu", "bikobatanari.art", "sadgrl.online", "wiki.xxiivv.com"); rankMax = rpr.size(); uploader.start(); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexSearchSetsService.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexSearchSetsService.java index 072e39ab..68a2a9c8 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexSearchSetsService.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexSearchSetsService.java @@ -5,7 +5,7 @@ import com.google.inject.Singleton; import lombok.SneakyThrows; import nu.marginalia.wmsa.edge.index.ranking.ReversePageRank; import nu.marginalia.wmsa.edge.index.ranking.StandardPageRank; -import nu.marginalia.wmsa.edge.index.ranking.RankingDomainFetcher; +import nu.marginalia.wmsa.edge.index.ranking.data.RankingDomainFetcher; import nu.marginalia.wmsa.edge.index.ranking.accumulator.RankingResultBitSetAccumulator; import nu.marginalia.wmsa.edge.index.ranking.accumulator.RankingResultHashMapAccumulator; import nu.marginalia.wmsa.edge.index.IndexServicesFactory;