diff --git a/code/features-index/domain-ranking/build.gradle b/code/features-index/domain-ranking/build.gradle index 885787eb..11b0cd0a 100644 --- a/code/features-index/domain-ranking/build.gradle +++ b/code/features-index/domain-ranking/build.gradle @@ -20,6 +20,8 @@ dependencies { implementation project(':code:common:service-client') implementation project(':code:api:query-api') + implementation 'org.jgrapht:jgrapht-core:1.5.2' + implementation libs.bundles.slf4j implementation libs.bundles.mariadb implementation libs.guice @@ -27,8 +29,16 @@ dependencies { implementation libs.roaringbitmap implementation libs.trove implementation libs.fastutil + implementation libs.hll + testImplementation project(':code:libraries:array') + testImplementation libs.commons.lang3 testImplementation libs.bundles.slf4j.test testImplementation libs.bundles.junit testImplementation libs.mockito } + +test { + minHeapSize = "128m" // initial heap size + maxHeapSize = "20G" // maximum heap size +} \ No newline at end of file diff --git a/code/features-index/domain-ranking/readme.md b/code/features-index/domain-ranking/readme.md index 4274c417..5dc4ec2c 100644 --- a/code/features-index/domain-ranking/readme.md +++ b/code/features-index/domain-ranking/readme.md @@ -1,19 +1,34 @@ # Domain Ranking -Contains domain ranking algorithms. +Contains domain ranking algorithms. The domain ranking algorithms are based on +the JGraphT library. + +Two principal algorithms are available, the standard PageRank algorithm, +and Personalized PageRank; each is available for two graphs, the link graph +and a similarity graph where each edge corresponds to the similarity between +the sets of incident links to two domains, their cosine similarity acting as +the weight of the links. + +With the standard PageRank algorithm, the similarity graph does not produce +anything useful, but something magical happens when you apply Personalized PageRank +to this graph. 
It turns into a very good "vibe"-sensitive ranking algorithm. + +It's unclear if this is a well known result, but it's a very interesting one +for creating a ranking algorithm that is focused on a particular segment of the web. ## Central Classes -### Algorithms -* [RankingAlgorithm](src/main/java/nu/marginalia/ranking/RankingAlgorithm.java) -* [StandardPageRank](src/main/java/nu/marginalia/ranking/StandardPageRank.java) -* [ReversePageRank](src/main/java/nu/marginalia/ranking/ReversePageRank.java) "CheiRank" +* [PageRankDomainRanker](src/main/java/nu/marginalia/ranking/PageRankDomainRanker.java) - Ranks domains using the + PageRank or Personalized PageRank algorithm depending on whether a list of influence domains is provided. ### Data sources -* [RankingDomainFetcher](src/main/java/nu/marginalia/ranking/data/RankingDomainFetcher.java) fetches link data. -* [RankingDomainFetcherForSimilarityData](src/main/java/nu/marginalia/ranking/data/RankingDomainFetcherForSimilarityData.java) fetches website similarity data. +* [LinkGraphSource](src/main/java/nu/marginalia/ranking/data/LinkGraphSource.java) - fetches the link graph +* [InvertedLinkGraphSource](src/main/java/nu/marginalia/ranking/data/InvertedLinkGraphSource.java) - fetches the inverted link graph +* [SimilarityGraphSource](src/main/java/nu/marginalia/ranking/data/SimilarityGraphSource.java) - fetches the similarity graph from the database +Note that the similarity graph needs to be precomputed and stored in the database for +the similarity graph source to be available. 
## See Also diff --git a/code/features-index/domain-ranking/src/main/java/nu/marginalia/ranking/PageRankDomainRanker.java b/code/features-index/domain-ranking/src/main/java/nu/marginalia/ranking/PageRankDomainRanker.java new file mode 100644 index 00000000..d73342f6 --- /dev/null +++ b/code/features-index/domain-ranking/src/main/java/nu/marginalia/ranking/PageRankDomainRanker.java @@ -0,0 +1,54 @@ +package nu.marginalia.ranking; + +import gnu.trove.list.TIntList; +import gnu.trove.list.array.TIntArrayList; +import nu.marginalia.ranking.accumulator.RankingResultAccumulator; +import nu.marginalia.ranking.data.GraphSource; +import nu.marginalia.ranking.jgrapht.PersonalizedPageRank; +import org.jgrapht.Graph; +import org.jgrapht.alg.interfaces.VertexScoringAlgorithm; +import org.jgrapht.alg.scoring.PageRank; + +import java.util.Comparator; +import java.util.List; +import java.util.Map; +import java.util.function.Supplier; + +/** Ranks the vertices of the graph provided by a GraphSource by their PageRank score. + * Personalized PageRank is used when a non-empty influence set is supplied, + * otherwise the standard PageRank algorithm is used. */ +public class PageRankDomainRanker implements RankingAlgorithm { + private final List influenceSet; + private final Graph graph; + + /** Builds the ranker; the graph is fetched from the source once, at construction time. */ + public PageRankDomainRanker(GraphSource source, + List influenceSet) + { + this.influenceSet = influenceSet; + this.graph = source.getGraph(); + } + + /** Scores the graph, then hands the resultCount highest-scoring vertices to the + * accumulator along with their zero-based position in the ranking. */ + @Override + public T calculate(int resultCount, Supplier> accumulatorP) { + VertexScoringAlgorithm pageRank; + + // A null or empty influence set means there is nothing to personalize on + if (influenceSet != null && !influenceSet.isEmpty()) { + pageRank = new PersonalizedPageRank<>(graph, influenceSet); + } + else { + pageRank = new PageRank<>(graph); + } + + // Sort by descending score (the negated score is the sort key) and keep the top resultCount + TIntList results = new TIntArrayList(resultCount); + pageRank.getScores().entrySet() + .stream() + .sorted(Comparator.comparing((Map.Entry e) -> -e.getValue())) + .limit(resultCount) + .map(Map.Entry::getKey) + .forEach(results::add); + + // The second argument to add() is the position in the sorted order, best first + var accumulator = accumulatorP.get(); + for (int i = 0; i < results.size(); i++) { + accumulator.add(results.get(i), i); + } + return accumulator.get(); + } + +} diff --git 
a/code/features-index/domain-ranking/src/main/java/nu/marginalia/ranking/RankingAlgorithm.java b/code/features-index/domain-ranking/src/main/java/nu/marginalia/ranking/RankingAlgorithm.java index 606f3e60..f67d47be 100644 --- a/code/features-index/domain-ranking/src/main/java/nu/marginalia/ranking/RankingAlgorithm.java +++ b/code/features-index/domain-ranking/src/main/java/nu/marginalia/ranking/RankingAlgorithm.java @@ -1,281 +1,15 @@ package nu.marginalia.ranking; -import gnu.trove.list.array.TIntArrayList; -import gnu.trove.map.hash.TIntIntHashMap; -import gnu.trove.map.hash.TIntObjectHashMap; -import it.unimi.dsi.fastutil.ints.IntArrays; import nu.marginalia.ranking.accumulator.RankingResultAccumulator; -import nu.marginalia.ranking.data.RankingDomainFetcher; -import nu.marginalia.ranking.data.RankingDomainData; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import java.util.Arrays; -import java.util.HashSet; -import java.util.Set; import java.util.function.Supplier; -import static java.lang.Math.min; - -public abstract class RankingAlgorithm { - protected final TIntObjectHashMap domainsById = new TIntObjectHashMap<>(); - protected final TIntIntHashMap domainIndexToId = new TIntIntHashMap(); - protected final TIntIntHashMap domainIdToIndex = new TIntIntHashMap(); - - protected TIntArrayList[] linkDataSrc2Dest; - protected TIntArrayList[] linkDataDest2Src; - - public final Set originDomains = new HashSet<>(); - public final Set originDomainIds = new HashSet<>(); - - private int maxKnownUrls = Integer.MAX_VALUE; - - private final Logger logger = LoggerFactory.getLogger(getClass()); - - private final RankingDomainFetcher domains; - - public RankingAlgorithm(RankingDomainFetcher domains, String... 
origins) { - this.domains = domains; - - originDomains.addAll(Arrays.asList(origins)); - - domains.getDomains(domainData -> { - int id = domainData.id; - - domainsById.put(id, domainData); - - domainIndexToId.put(domainIndexToId.size(), id); - domainIdToIndex.put(id, domainIdToIndex.size()); - }); - - linkDataSrc2Dest = new TIntArrayList[domainIndexToId.size()]; - linkDataDest2Src = new TIntArrayList[domainIndexToId.size()]; - - domains.eachDomainLink((src, dst) -> { - if (src == dst) return; - - if (domainsById.contains(src) && domainsById.contains(dst)) { - - int srcIdx = domainIdToIndex.get(src); - int dstIdx = domainIdToIndex.get(domainsById.get(dst).resolveAlias()); - - if (linkDataSrc2Dest[srcIdx] == null) { - linkDataSrc2Dest[srcIdx] = new TIntArrayList(); - } - linkDataSrc2Dest[srcIdx].add(dstIdx); - - if (linkDataDest2Src[dstIdx] == null) { - linkDataDest2Src[dstIdx] = new TIntArrayList(); - } - linkDataDest2Src[dstIdx].add(srcIdx); - } - }); - - for (var namePattern : this.originDomains) { - domains.domainsByPattern(namePattern, i -> { - int ival = domainIdToIndex.get(i); - if (ival != domainIdToIndex.getNoEntryValue() || domainIndexToId.get(0) == i) { - originDomainIds.add(ival); - } - else { - logger.debug("No value for {}", i); - } - }); - } - - logger.info("Origin Domains: {}", originDomainIds.size()); - } - - public RankingDomainData getDomainData(int id) { - return domainsById.get(id); - } - - public void addPeripheralNodes() { - - int newNodesIdxCutoff = domainIdToIndex.size(); - - logger.info("Inserting peripheral nodes"); - - domains.getPeripheralDomains(domainData -> { - int id = domainData.id; - - if (domainsById.put(id, domainData) == null) { // true if id was not already present - domainIndexToId.put(domainIndexToId.size(), id); - domainIdToIndex.put(id, domainIdToIndex.size()); - } - }); - - linkDataSrc2Dest = Arrays.copyOf(linkDataSrc2Dest, domainIndexToId.size()); - linkDataDest2Src = Arrays.copyOf(linkDataDest2Src, 
domainIndexToId.size()); - - domains.eachDomainLink((src, dst) -> { - if (src == dst) return; - - if (domainsById.contains(src) && domainsById.contains(dst)) { - int srcIdx = domainIdToIndex.get(src); - int dstIdx = domainIdToIndex.get(domainsById.get(dst).resolveAlias()); - - // This looks like a bug, but it improves the results - if (srcIdx < newNodesIdxCutoff || dstIdx < newNodesIdxCutoff) - return; - - if (linkDataSrc2Dest[srcIdx] == null) { - linkDataSrc2Dest[srcIdx] = new TIntArrayList(); - } - linkDataSrc2Dest[srcIdx].add(dstIdx); - - if (linkDataDest2Src[dstIdx] == null) { - linkDataDest2Src[dstIdx] = new TIntArrayList(); - } - linkDataDest2Src[dstIdx].add(srcIdx); - } - }); - - logger.info("Peripheral nodes inserted {} -> {}", newNodesIdxCutoff, domainIdToIndex.size()); - } - - public int size() { - return domainsById.size(); - } - - public T pageRank(int resultCount, Supplier> accumulatorP) { - RankVector rank = new RankVector(1.d / domainsById.size()); - - int iter_max = 100; - for (int i = 0; i < iter_max; i++) { - RankVector newRank = createNewRankVector(rank); - - double oldNorm = rank.norm(); - double newNorm = newRank.norm(); - double dNorm = oldNorm - newNorm; - - if (i < iter_max-1) { - adjustRankVector(newRank, dNorm, oldNorm); - } - - rank = newRank; - } - - - return rank.getRanking(resultCount, accumulatorP).get(); - } - - public T pageRankWithPeripheralNodes(int resultCount, Supplier> accumulatorP) { - RankVector rank = new RankVector(1.d / domainsById.size()); - - int iter_max = 100; - - for (int i = 0; i < iter_max; i++) { - if (i == iter_max-1) { - addPeripheralNodes(); - } - RankVector newRank = createNewRankVector(rank); - - double oldNorm = rank.norm(); - double newNorm = newRank.norm(); - double dNorm = oldNorm - newNorm; - - if (i < iter_max-1) { - adjustRankVector(newRank, dNorm, oldNorm); - } - - rank = newRank; - } - - logger.info("PRWPN iteration done"); - - return rank.getRanking(resultCount, accumulatorP).get(); - } - - abstract 
void adjustRankVector(RankVector vector, double dNorm, double oldNorm); - - abstract RankVector createNewRankVector(RankVector rank); - - public boolean includeInRanking(RankingDomainData data) { - if (data.isAlias()) - return false; - if (data.isSpecial()) - return false; - if (data.isSocialMedia()) - return false; - if (data.knownUrls > maxKnownUrls) - return false; - - return true; - } - - public void setMaxKnownUrls(int maxKnownUrls) { - this.maxKnownUrls = maxKnownUrls; - } - public class RankVector { - private final double[] rank; - - public RankVector(double defaultValue) { - rank = new double[domainIndexToId.size()]; - if (defaultValue != 0.) { - Arrays.fill(rank, defaultValue); - } - } - - public void set(int id, double value) { - rank[id] = value; - } - - public void increment(int id, double value) { - rank[id] += value; - } - - public double get(int id) { - if (id >= rank.length) return 0.; - - return rank[id]; - } - - public double norm() { - double v = 0.; - for (double value : rank) { - v += Math.abs(value); - } - return v; - } - - public double norm(RankVector other) { - double v = 0.; - for (int i = 0; i < rank.length; i++) { - v += Math.abs(rank[i] - other.get(i)); - } - return v; - } - - public RankingResultAccumulator getRanking(int numResults, Supplier> accumulatorP) { - - if (numResults <= 0) { - numResults = domainIdToIndex.size(); - } - numResults = min(numResults, min(domainIdToIndex.size(), rank.length)); - - int[] nodes = sortOrder(rank); - var accumulator = accumulatorP.get(); - - for (int i = 0; i < numResults; i++) { - int id = domainIndexToId.get(nodes[i]); - - if (includeInRanking(domainsById.get(id))) - accumulator.add(id, i); - } - - return accumulator; - } - - private static int[] sortOrder(double[] values) { - - int[] ret = new int[values.length]; - Arrays.setAll(ret, i->i); - IntArrays.quickSort(ret, (i,j) -> (int) Math.signum(values[j] - values[i])); - - return ret; - } - - } - +public interface RankingAlgorithm { + + /** 
Calculate domain rankings. + * + * @param resultCount the number of top-ranked results to store + * @param accumulatorP the accumulator to use to store the results + */ + T calculate(int resultCount, Supplier> accumulatorP); } diff --git a/code/features-index/domain-ranking/src/main/java/nu/marginalia/ranking/ReversePageRank.java b/code/features-index/domain-ranking/src/main/java/nu/marginalia/ranking/ReversePageRank.java deleted file mode 100644 index 76b138f9..00000000 --- a/code/features-index/domain-ranking/src/main/java/nu/marginalia/ranking/ReversePageRank.java +++ /dev/null @@ -1,42 +0,0 @@ -package nu.marginalia.ranking; - - -import nu.marginalia.ranking.data.RankingDomainFetcher; - -public class ReversePageRank extends RankingAlgorithm { - - - public ReversePageRank(RankingDomainFetcher domains, String... origins) { - super(domains, origins); - } - - @Override - RankVector createNewRankVector(RankVector rank) { - - double rankNorm = rank.norm(); - RankVector newRank = new RankVector(0); - - for (int domainId = 0; domainId < domainIndexToId.size(); domainId++) { - - var links = linkDataSrc2Dest[domainId]; - double newRankValue = 0; - - if (links != null && links.size() > 0) { - for (int j = 0; j < links.size(); j++) { - var revLinks = linkDataDest2Src[links.getQuick(j)]; - newRankValue += rank.get(links.getQuick(j)) / revLinks.size(); - } - } - - newRank.set(domainId, 0.85*newRankValue/rankNorm); - } - - return newRank; - } - - @Override - void adjustRankVector(RankVector vector, double dNorm, double oldNorm) { - originDomainIds.forEach(id -> vector.increment(id, 1.0 / originDomainIds.size())); - } - -} diff --git a/code/features-index/domain-ranking/src/main/java/nu/marginalia/ranking/StandardPageRank.java b/code/features-index/domain-ranking/src/main/java/nu/marginalia/ranking/StandardPageRank.java deleted file mode 100644 index 0c629c96..00000000 --- a/code/features-index/domain-ranking/src/main/java/nu/marginalia/ranking/StandardPageRank.java +++ /dev/null 
@@ -1,50 +0,0 @@ -package nu.marginalia.ranking; - - -import nu.marginalia.ranking.data.RankingDomainFetcher; - -public class StandardPageRank extends RankingAlgorithm { - - public StandardPageRank(RankingDomainFetcher domains, String... origins) { - super(domains, origins); - } - - @Override - RankVector createNewRankVector(RankVector rank) { - RankVector newRank = new RankVector(0); - - for (int domainId = 0; domainId < domainIndexToId.size(); domainId++) { - - var links = linkDataDest2Src[domainId]; - double newRankValue = 0; - - if (links != null && links.size() > 0) { - for (int j = 0; j < links.size(); j++) { - int linkedDomain = links.getQuick(j); - - final int linkSize; - var backLinks = linkDataSrc2Dest[linkedDomain]; - - if (backLinks == null) { - linkSize = 1; - } - else { - linkSize = backLinks.size(); - } - - newRankValue += rank.get(linkedDomain) / linkSize; - - } - } - - newRank.set(domainId, 0.85 * newRankValue); - } - return newRank; - } - - @Override - void adjustRankVector(RankVector vector, double dNorm, double oldNorm) { - originDomainIds.forEach(id -> vector.increment(id, 0.15 / originDomainIds.size() )); - } - -} diff --git a/code/features-index/domain-ranking/src/main/java/nu/marginalia/ranking/data/AbstractGraphSource.java b/code/features-index/domain-ranking/src/main/java/nu/marginalia/ranking/data/AbstractGraphSource.java new file mode 100644 index 00000000..da9233c7 --- /dev/null +++ b/code/features-index/domain-ranking/src/main/java/nu/marginalia/ranking/data/AbstractGraphSource.java @@ -0,0 +1,58 @@ +package nu.marginalia.ranking.data; + +import com.zaxxer.hikari.HikariDataSource; +import org.jgrapht.Graph; + +import java.sql.SQLException; +import java.util.ArrayList; +import java.util.List; + +public abstract class AbstractGraphSource implements GraphSource { + protected final HikariDataSource dataSource; + + protected AbstractGraphSource(HikariDataSource dataSource) { + this.dataSource = dataSource; + } + + @Override + public 
abstract Graph getGraph(); + + /** Adds all indexed domain ids as vertices to the graph. */ + protected void addVertices(Graph graph) throws SQLException { + try (var conn = dataSource.getConnection(); + var stmt = conn.prepareStatement(""" + SELECT ID + FROM EC_DOMAIN + WHERE NODE_AFFINITY > 0 + """); + var rs = stmt.executeQuery()) + { + while (rs.next()) { + graph.addVertex(rs.getInt(1)); + } + } + } + + /** Resolves domain names to the ids of the matching EC_DOMAIN rows. + * + * Each name is looked up with its own parameterized query: a single '?' placeholder + * cannot be expanded into an IN list, and JDBC array parameters are driver-specific + * (NOTE(review): the MariaDB driver is not expected to support createArrayOf for VARCHAR; + * confirm against the driver version in use). Names without a matching row are skipped. + * + * @param domainNameList the domain names to look up + * @return the matching ids without duplicates, in lookup order; empty if the input is empty + * @throws RuntimeException wrapping any SQLException raised by the lookup + */ + @Override + public List domainIds(List domainNameList) { + var result = new ArrayList(); + + try (var conn = dataSource.getConnection(); + var stmt = conn.prepareStatement(""" + SELECT ID + FROM EC_DOMAIN + WHERE DOMAIN_NAME = ? + """)) + { + // The statement is prepared once and re-executed for every name + for (var domainName : domainNameList) { + stmt.setObject(1, domainName); + + try (var rs = stmt.executeQuery()) { + while (rs.next()) { + int id = rs.getInt(1); + // Keep the set-like semantics of an IN query if a name is listed twice + if (!result.contains(id)) { + result.add(id); + } + } + } + } + } + catch (SQLException ex) { + throw new RuntimeException(ex); + } + + return result; + } +} diff --git a/code/features-index/domain-ranking/src/main/java/nu/marginalia/ranking/data/GraphSource.java b/code/features-index/domain-ranking/src/main/java/nu/marginalia/ranking/data/GraphSource.java new file mode 100644 index 00000000..26554428 --- /dev/null +++ b/code/features-index/domain-ranking/src/main/java/nu/marginalia/ranking/data/GraphSource.java @@ -0,0 +1,15 @@ +package nu.marginalia.ranking.data; + +import org.jgrapht.Graph; + +import java.util.List; + +/** A source for the link graph (or pseudo-link graph) + * to use when ranking domains. 
*/ +public interface GraphSource { + + /** Construct the graph */ + Graph getGraph(); + + List domainIds(List domainNameList); +} diff --git a/code/features-index/domain-ranking/src/main/java/nu/marginalia/ranking/data/InvertedLinkGraphSource.java b/code/features-index/domain-ranking/src/main/java/nu/marginalia/ranking/data/InvertedLinkGraphSource.java new file mode 100644 index 00000000..9d5564d0 --- /dev/null +++ b/code/features-index/domain-ranking/src/main/java/nu/marginalia/ranking/data/InvertedLinkGraphSource.java @@ -0,0 +1,49 @@ +package nu.marginalia.ranking.data; + +import com.google.inject.Inject; +import com.zaxxer.hikari.HikariDataSource; +import lombok.SneakyThrows; +import nu.marginalia.query.client.QueryClient; +import org.jgrapht.Graph; +import org.jgrapht.graph.DefaultDirectedGraph; +import org.jgrapht.graph.DefaultEdge; + +import java.sql.SQLException; +import java.util.ArrayList; +import java.util.List; + +/** A source for the inverted link graph: + * it is built like the regular link graph, + * but with the direction of every link reversed. */ +public class InvertedLinkGraphSource extends AbstractGraphSource { + private final QueryClient queryClient; + + @Inject + public InvertedLinkGraphSource(HikariDataSource dataSource, QueryClient queryClient) { + super(dataSource); + this.queryClient = queryClient; + } + + /** Builds a directed graph with an edge dest -> source for every domain link + * whose two endpoints are both vertices of the graph. */ + @SneakyThrows + @Override + public Graph getGraph() { + Graph invertedGraph = new DefaultDirectedGraph<>(DefaultEdge.class); + + addVertices(invertedGraph); + + var links = queryClient.getAllDomainLinks().iterator(); + while (links.advance()) { + // Links touching a domain that is not a vertex are dropped + if (invertedGraph.containsVertex(links.dest()) && invertedGraph.containsVertex(links.source())) { + // Swap the endpoints to invert the edge + invertedGraph.addEdge(links.dest(), links.source()); + } + } + + return invertedGraph; + } +} diff --git a/code/features-index/domain-ranking/src/main/java/nu/marginalia/ranking/data/LinkGraphSource.java 
b/code/features-index/domain-ranking/src/main/java/nu/marginalia/ranking/data/LinkGraphSource.java new file mode 100644 index 00000000..cc7f2b53 --- /dev/null +++ b/code/features-index/domain-ranking/src/main/java/nu/marginalia/ranking/data/LinkGraphSource.java @@ -0,0 +1,43 @@ +package nu.marginalia.ranking.data; + +import com.google.inject.Inject; +import com.zaxxer.hikari.HikariDataSource; +import lombok.SneakyThrows; +import nu.marginalia.query.client.QueryClient; +import org.jgrapht.Graph; +import org.jgrapht.graph.DefaultDirectedGraph; +import org.jgrapht.graph.DefaultEdge; + +/** A source for the regular (non-inverted) link graph. */ +public class LinkGraphSource extends AbstractGraphSource { + private final QueryClient queryClient; + + @Inject + public LinkGraphSource(HikariDataSource dataSource, QueryClient queryClient) { + super(dataSource); + this.queryClient = queryClient; + } + + /** Builds a directed graph with an edge source -> dest for every domain link + * whose two endpoints are both vertices of the graph. */ + @SneakyThrows + @Override + public Graph getGraph() { + Graph linkGraph = new DefaultDirectedGraph<>(DefaultEdge.class); + + addVertices(linkGraph); + + var links = queryClient.getAllDomainLinks().iterator(); + while (links.advance()) { + // Links touching a domain that is not a vertex are dropped + if (linkGraph.containsVertex(links.dest()) && linkGraph.containsVertex(links.source())) { + linkGraph.addEdge(links.source(), links.dest()); + } + } + + return linkGraph; + } +} diff --git a/code/features-index/domain-ranking/src/main/java/nu/marginalia/ranking/data/RankingDomainData.java b/code/features-index/domain-ranking/src/main/java/nu/marginalia/ranking/data/RankingDomainData.java deleted file mode 100644 index b4fa8abd..00000000 --- a/code/features-index/domain-ranking/src/main/java/nu/marginalia/ranking/data/RankingDomainData.java +++ /dev/null @@ -1,32 +0,0 @@ -package nu.marginalia.ranking.data; - -import lombok.AllArgsConstructor; -import lombok.Data; -import nu.marginalia.model.crawl.DomainIndexingState; - -@Data -@AllArgsConstructor -public class RankingDomainData { - public final int id; - public final 
String name; - private int alias; - public DomainIndexingState state; - public final int knownUrls; - - public int resolveAlias() { - if (alias == 0) return id; - return alias; - } - - public boolean isAlias() { - return alias != 0; - } - - public boolean isSpecial() { - return DomainIndexingState.SPECIAL == state; - } - - public boolean isSocialMedia() { - return DomainIndexingState.SOCIAL_MEDIA == state; - } -} diff --git a/code/features-index/domain-ranking/src/main/java/nu/marginalia/ranking/data/RankingDomainFetcher.java b/code/features-index/domain-ranking/src/main/java/nu/marginalia/ranking/data/RankingDomainFetcher.java deleted file mode 100644 index ef0b83e6..00000000 --- a/code/features-index/domain-ranking/src/main/java/nu/marginalia/ranking/data/RankingDomainFetcher.java +++ /dev/null @@ -1,138 +0,0 @@ -package nu.marginalia.ranking.data; - -import com.google.inject.Inject; -import com.google.inject.Singleton; -import com.zaxxer.hikari.HikariDataSource; -import nu.marginalia.db.DomainBlacklistImpl; -import nu.marginalia.model.crawl.DomainIndexingState; -import nu.marginalia.query.client.QueryClient; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.sql.SQLException; -import java.util.function.Consumer; -import java.util.function.IntConsumer; - -@Singleton -public class RankingDomainFetcher { - protected final HikariDataSource dataSource; - private final QueryClient queryClient; - protected final DomainBlacklistImpl blacklist; - protected final Logger logger = LoggerFactory.getLogger(getClass()); - - protected boolean getNames = false; - - @Inject - public RankingDomainFetcher(HikariDataSource dataSource, - QueryClient queryClient, - DomainBlacklistImpl blacklist) { - this.dataSource = dataSource; - this.queryClient = queryClient; - this.blacklist = blacklist; - } - - public void retainNames() { - this.getNames = true; - } - - public void getDomains(Consumer consumer) { - String query; - if (getNames) { - query = """ - SELECT 
EC_DOMAIN.ID,DOMAIN_NAME,DOMAIN_ALIAS,STATE,KNOWN_URLS - FROM EC_DOMAIN - INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID - WHERE NODE_AFFINITY>0 - GROUP BY EC_DOMAIN.ID - """; - } - else { - query = """ - SELECT EC_DOMAIN.ID,\"\",DOMAIN_ALIAS,STATE,KNOWN_URLS - FROM EC_DOMAIN - INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID - WHERE NODE_AFFINITY>0 - GROUP BY EC_DOMAIN.ID - """; - } - - getDomains(query, consumer); - } - - - public void getPeripheralDomains(Consumer consumer) { - String query; - if (getNames) { - query = """ - SELECT EC_DOMAIN.ID,DOMAIN_NAME,DOMAIN_ALIAS,STATE,KNOWN_URLS - FROM EC_DOMAIN - INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID - WHERE ((INDEXED>1 AND IS_ALIVE) - OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) - GROUP BY EC_DOMAIN.ID - """; - } - else { - query = """ - SELECT EC_DOMAIN.ID,\"\",DOMAIN_ALIAS,STATE,KNOWN_URLS - FROM EC_DOMAIN - INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID - WHERE ((INDEXED>1 AND IS_ALIVE) - OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) - GROUP BY EC_DOMAIN.ID - """; - } - - getDomains(query, consumer); - } - - protected void getDomains(String query, Consumer consumer) { - try (var conn = dataSource.getConnection(); var stmt = conn.prepareStatement(query)) { - stmt.setFetchSize(10000); - var rsp = stmt.executeQuery(); - while (rsp.next()) { - int id = rsp.getInt(1); - if (!blacklist.isBlacklisted(id)) { - consumer.accept( - new RankingDomainData(id, - rsp.getString(2), - rsp.getInt(3), - DomainIndexingState.valueOf(rsp.getString(4)), - rsp.getInt(5))); - } - } - } - catch (SQLException ex) { - logger.error("Failed to fetch domains", ex); - } - } - - public void eachDomainLink(DomainLinkConsumer consumer) { - - var allLinks = queryClient.getAllDomainLinks(); - var iter = allLinks.iterator(); - - while (iter.advance()) { - consumer.accept(iter.source(), iter.dest()); - } - - } - - public void domainsByPattern(String pattern, 
IntConsumer idConsumer) { - try (var conn = dataSource.getConnection(); - var stmt = conn.createStatement()) { - // This is sourced from a config file --v - var rsp = stmt.executeQuery("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME LIKE '" + pattern + "'"); - while (rsp.next()) { - idConsumer.accept(rsp.getInt(1)); - } - } - catch (SQLException ex) { - logger.error("Failed to fetch domains by pattern", ex); - } - } - - public interface DomainLinkConsumer { - void accept(int from, int to); - } -} diff --git a/code/features-index/domain-ranking/src/main/java/nu/marginalia/ranking/data/RankingDomainFetcherForSimilarityData.java b/code/features-index/domain-ranking/src/main/java/nu/marginalia/ranking/data/RankingDomainFetcherForSimilarityData.java deleted file mode 100644 index ae801166..00000000 --- a/code/features-index/domain-ranking/src/main/java/nu/marginalia/ranking/data/RankingDomainFetcherForSimilarityData.java +++ /dev/null @@ -1,93 +0,0 @@ -package nu.marginalia.ranking.data; - -import com.google.inject.Inject; -import com.google.inject.Singleton; -import com.zaxxer.hikari.HikariDataSource; -import nu.marginalia.db.DomainBlacklistImpl; -import nu.marginalia.query.client.QueryClient; -import org.slf4j.LoggerFactory; - -import java.sql.SQLException; -import java.util.function.Consumer; - -@Singleton -public class RankingDomainFetcherForSimilarityData extends RankingDomainFetcher { - final boolean hasData; - - @Inject - public RankingDomainFetcherForSimilarityData(HikariDataSource dataSource, QueryClient queryClient, DomainBlacklistImpl blacklist) { - super(dataSource, queryClient, blacklist); - - hasData = isDomainNeighborTablePopulated(dataSource); - } - - private static boolean isDomainNeighborTablePopulated(HikariDataSource dataSource) { - try (var conn = dataSource.getConnection(); - var stmt = conn.createStatement(); - var rs = stmt.executeQuery("SELECT DOMAIN_ID FROM EC_DOMAIN_NEIGHBORS_2 LIMIT 1")) { - - return rs.next(); - } - catch (SQLException ex) { - 
LoggerFactory - .getLogger(RankingDomainFetcherForSimilarityData.class) - .error("Failed to get count from EC_DOMAIN_NEIGHBORS_2", ex); - return false; - } - } - public boolean hasData() { - return hasData; - } - - public void eachDomainLink(DomainLinkConsumer consumer) { - try (var conn = dataSource.getConnection(); - var stmt = conn.prepareStatement("SELECT DOMAIN_ID, NEIGHBOR_ID, RELATEDNESS FROM EC_DOMAIN_NEIGHBORS_2")) - { - stmt.setFetchSize(10000); - - var rsp = stmt.executeQuery(); - - while (rsp.next()) { - int src = rsp.getInt(1); - int dst = rsp.getInt(2); - - // these "links" are bidi - consumer.accept(src, dst); - consumer.accept(dst, src); - } - } - catch (SQLException ex) { - logger.error("Failed to fetch domain links", ex); - } - } - - public void getDomains(Consumer consumer) { - String query; - if (getNames) { - query = - """ - SELECT EC_DOMAIN.ID,DOMAIN_NAME,DOMAIN_ALIAS,STATE,COALESCE(KNOWN_URLS, 0) - FROM EC_DOMAIN - INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID - GROUP BY EC_DOMAIN.ID - """; - } - else { - query = - """ - SELECT EC_DOMAIN.ID,"",DOMAIN_ALIAS,STATE,COALESCE(KNOWN_URLS, 0) - FROM EC_DOMAIN - INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID - GROUP BY EC_DOMAIN.ID - """; - } - - getDomains(query, consumer); - } - - - public void getPeripheralDomains(Consumer consumer) { - // This is not relevant for this variant of pagerank since it is bidirectional - } - -} diff --git a/code/features-index/domain-ranking/src/main/java/nu/marginalia/ranking/data/SimilarityGraphSource.java b/code/features-index/domain-ranking/src/main/java/nu/marginalia/ranking/data/SimilarityGraphSource.java new file mode 100644 index 00000000..359f76fd --- /dev/null +++ b/code/features-index/domain-ranking/src/main/java/nu/marginalia/ranking/data/SimilarityGraphSource.java @@ -0,0 +1,65 @@ +package nu.marginalia.ranking.data; + +import com.google.inject.Inject; +import com.zaxxer.hikari.HikariDataSource; +import lombok.SneakyThrows; 
+import org.jgrapht.Graph; +import org.jgrapht.graph.DefaultUndirectedWeightedGraph; +import org.jgrapht.graph.DefaultWeightedEdge; + +import java.sql.SQLException; + +/** A source for the similarity graph, stored in EC_DOMAIN_NEIGHBORS_2, + * which contains the cosine similarity of the incident link vectors in the link graph. + * */ +public class SimilarityGraphSource extends AbstractGraphSource { + @Inject + public SimilarityGraphSource(HikariDataSource dataSource) { + super(dataSource); + } + + /** Check if the data source is available. */ + public boolean isAvailable() { + try (var conn = dataSource.getConnection(); + var stmt = conn.prepareStatement(""" + SELECT * + FROM EC_DOMAIN_NEIGHBORS_2 + LIMIT 1 + """); + var rs = stmt.executeQuery()) + { + return rs.next(); + } + catch (SQLException ex) { + return false; + } + } + + @SneakyThrows + @Override + public Graph getGraph() { + Graph graph = new DefaultUndirectedWeightedGraph<>(DefaultWeightedEdge.class); + + addVertices(graph); + + try (var conn = dataSource.getConnection()) { + try (var stmt = conn.prepareStatement(""" + SELECT DOMAIN_ID, NEIGHBOR_ID, RELATEDNESS + FROM EC_DOMAIN_NEIGHBORS_2 + """)) + { + var rs = stmt.executeQuery(); + while (rs.next()) { + int src = rs.getInt(1); + int dest = rs.getInt(2); + double weight = rs.getDouble(3); + + graph.addEdge(src, dest); + graph.setEdgeWeight(src, dest, weight); + } + } + } + + return graph; + } +} diff --git a/code/features-index/domain-ranking/src/main/java/nu/marginalia/ranking/jgrapht/PersonalizedPageRank.java b/code/features-index/domain-ranking/src/main/java/nu/marginalia/ranking/jgrapht/PersonalizedPageRank.java new file mode 100644 index 00000000..0fd6a194 --- /dev/null +++ b/code/features-index/domain-ranking/src/main/java/nu/marginalia/ranking/jgrapht/PersonalizedPageRank.java @@ -0,0 +1,375 @@ +package nu.marginalia.ranking.jgrapht; + +/* + * (C) Copyright 2016-2023, by Dimitrios Michail and Contributors. 
+ * + * + * JGraphT : a free Java graph-theory library + * + * See the CONTRIBUTORS.md file distributed with this work for additional + * information regarding copyright ownership. + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0, or the + * GNU Lesser General Public License v2.1 or later + * which is available at + * http://www.gnu.org/licenses/old-licenses/lgpl-2.1-standalone.html. + * + * SPDX-License-Identifier: EPL-2.0 OR LGPL-2.1-or-later + */ + +/* (modified by @vlofgren to add personalization) */ + +import org.jgrapht.*; +import org.jgrapht.alg.interfaces.*; + +import java.util.*; + +public class PersonalizedPageRank + implements VertexScoringAlgorithm +{ + /** + * Default number of maximum iterations. + */ + public static final int MAX_ITERATIONS_DEFAULT = 100; + + /** + * Default value for the tolerance. The calculation will stop if the difference of PageRank + * values between iterations change less than this value. + */ + public static final double TOLERANCE_DEFAULT = 0.0001; + + /** + * Damping factor default value. + */ + public static final double DAMPING_FACTOR_DEFAULT = 0.85d; + + /** + * The input graph + */ + private final Graph graph; + private final Collection influenceSet; + + /** + * The damping factor + */ + private final double dampingFactor; + + /** + * Maximum iterations to run + */ + private final int maxIterations; + + /** + * The calculation will stop if the difference of PageRank values between iterations change less + * than this value + */ + private final double tolerance; + + /** + * The result + */ + private Map scores; + + /** + * Create and execute an instance of Personalized PageRank. 
+ * + * @param graph the input graph + * @param influenceSet the set of vertices to personalize the Personalized PageRank calculation + */ + public PersonalizedPageRank(Graph graph, Collection influenceSet) + { + this(graph, influenceSet, DAMPING_FACTOR_DEFAULT, MAX_ITERATIONS_DEFAULT, TOLERANCE_DEFAULT); + } + + /** + * Create and execute an instance of Personalized PageRank. + * + * @param graph the input graph + * @param influenceSet the set of vertices to personalize the Personalized PageRank calculation + * @param dampingFactor the damping factor + */ + public PersonalizedPageRank(Graph graph, Collection influenceSet, double dampingFactor) + { + this(graph, influenceSet, dampingFactor, MAX_ITERATIONS_DEFAULT, TOLERANCE_DEFAULT); + } + + /** + * Create and execute an instance of Personalized PageRank. + * + * @param graph the input graph + * @param influenceSet the set of vertices to personalize the Personalized PageRank calculation + * @param dampingFactor the damping factor + * @param maxIterations the maximum number of iterations to perform + */ + public PersonalizedPageRank(Graph graph, Collection influenceSet, double dampingFactor, int maxIterations) + { + this(graph, influenceSet, dampingFactor, maxIterations, TOLERANCE_DEFAULT); + } + + /** + * Create and execute an instance of Personalized PageRank. 
+ * + * @param graph the input graph + * @param influenceSet the set of vertices to personalize the Personalized PageRank calculation + * @param dampingFactor the damping factor + * @param maxIterations the maximum number of iterations to perform + * @param tolerance the calculation will stop if the difference of Personalized PageRank values between + * iterations change less than this value + */ + public PersonalizedPageRank(Graph graph, Collection influenceSet, double dampingFactor, int maxIterations, double tolerance) + { + this.graph = graph; + this.influenceSet = influenceSet; + + if (maxIterations <= 0) { + throw new IllegalArgumentException("Maximum iterations must be positive"); + } + this.maxIterations = maxIterations; + + if (dampingFactor < 0.0 || dampingFactor > 1.0) { + throw new IllegalArgumentException("Damping factor not valid"); + } + this.dampingFactor = dampingFactor; + + if (tolerance <= 0.0) { + throw new IllegalArgumentException("Tolerance not valid, must be positive"); + } + this.tolerance = tolerance; + } + + /** + * {@inheritDoc} + */ + @Override + public Map getScores() + { + if (scores == null) { + scores = Collections.unmodifiableMap(new Algorithm().getScores()); + } + return scores; + } + + /** + * {@inheritDoc} + */ + @Override + public Double getVertexScore(V v) + { + if (!graph.containsVertex(v)) { + throw new IllegalArgumentException("Cannot return score of unknown vertex"); + } + return getScores().get(v); + } + + /** + * The actual implementation. + * + *

+ * We use this pattern with the inner class in order to be able to cache the result but also + * allow the garbage collector to acquire all auxiliary memory used during the execution of the + * algorithm. + * + * @author Dimitrios Michail + * + * @param the graph type + * @param the edge type + */ + private class Algorithm + { + private int totalVertices; + private boolean isWeighted; + + private Map vertexIndexMap; + private V[] vertexMap; + + private double[] weightSum; + private double[] curScore; + private double[] nextScore; + private int[] outDegree; + private ArrayList adjList; + private ArrayList weightsList; + private BitSet influenceIndexSet; + @SuppressWarnings("unchecked") + public Algorithm() + { + this.totalVertices = graph.vertexSet().size(); + this.isWeighted = graph.getType().isWeighted(); + + /* + * Initialize score, map vertices to [0,n) and pre-compute degrees and adjacency lists + */ + this.curScore = new double[totalVertices]; + this.nextScore = new double[totalVertices]; + this.vertexIndexMap = new HashMap<>(); + this.vertexMap = (V[]) new Object[totalVertices]; + this.outDegree = new int[totalVertices]; + this.adjList = new ArrayList<>(totalVertices); + this.influenceIndexSet = new BitSet(totalVertices); + + double initScore = 1.0d / totalVertices; + int i = 0; + for (V v : graph.vertexSet()) { + vertexIndexMap.put(v, i); + vertexMap[i] = v; + outDegree[i] = graph.outDegreeOf(v); + curScore[i] = initScore; + + if (influenceSet.contains(v)) { + influenceIndexSet.set(i); + } + + i++; + } + + if (isWeighted) { + this.weightSum = new double[totalVertices]; + this.weightsList = new ArrayList<>(totalVertices); + + for (i = 0; i < totalVertices; i++) { + V v = vertexMap[i]; + int[] inNeighbors = new int[graph.inDegreeOf(v)]; + double[] edgeWeights = new double[graph.inDegreeOf(v)]; + + int j = 0; + for (E e : graph.incomingEdgesOf(v)) { + V w = Graphs.getOppositeVertex(graph, e, v); + Integer mappedVertexId = vertexIndexMap.get(w); + 
inNeighbors[j] = mappedVertexId; + double edgeWeight = graph.getEdgeWeight(e); + edgeWeights[j] += edgeWeight; + weightSum[mappedVertexId] += edgeWeight; + j++; + } + weightsList.add(edgeWeights); + adjList.add(inNeighbors); + } + } else { + for (i = 0; i < totalVertices; i++) { + V v = vertexMap[i]; + int[] inNeighbors = new int[graph.inDegreeOf(v)]; + int j = 0; + for (E e : graph.incomingEdgesOf(v)) { + V w = Graphs.getOppositeVertex(graph, e, v); + inNeighbors[j++] = vertexIndexMap.get(w); + } + adjList.add(inNeighbors); + } + } + } + + public Map getScores() + { + // compute + if (isWeighted) { + runWeighted(); + } else { + run(); + } + + // make results user friendly + Map scores = new HashMap<>(); + for (int i = 0; i < totalVertices; i++) { + V v = vertexMap[i]; + scores.put(v, curScore[i]); + } + return scores; + } + + private void run() + { + double maxChange = tolerance; + int iterations = maxIterations; + + while (iterations > 0 && maxChange >= tolerance) { + double r = teleProp(); + + maxChange = 0d; + for (int i = 0; i < totalVertices; i++) { + double contribution = 0d; + for (int w : adjList.get(i)) { + contribution += dampingFactor * curScore[w] / outDegree[w]; + } + + double vOldValue = curScore[i]; + double vNewValue = (influenceIndexSet.get(i) ? r : 0) + contribution; + maxChange = Math.max(maxChange, Math.abs(vNewValue - vOldValue)); + nextScore[i] = vNewValue; + } + + // progress + swapScores(); + iterations--; + } + + // remove influence factor from the scores + double r = teleProp(); + for (int i = 0; i < totalVertices; i++) { + curScore[i] -= (influenceIndexSet.get(i) ? 
r : 0); + } + } + + private void runWeighted() + { + double maxChange = tolerance; + int iterations = maxIterations; + + while (iterations > 0 && maxChange >= tolerance) { + double r = teleProp(); + + maxChange = 0d; + for (int i = 0; i < totalVertices; i++) { + double contribution = 0d; + + int[] neighbors = adjList.get(i); + double[] weights = weightsList.get(i); + for (int j = 0, getLength = neighbors.length; j < getLength; j++) { + int w = neighbors[j]; + contribution += dampingFactor * curScore[w] * weights[j] / weightSum[w]; + } + + double vOldValue = curScore[i]; + double vNewValue = (influenceIndexSet.get(i) ? r : 0) + contribution; + maxChange = Math.max(maxChange, Math.abs(vNewValue - vOldValue)); + nextScore[i] = vNewValue; + } + + // progress + swapScores(); + iterations--; + } + + // remove influence factor from the scores + double r = teleProp(); + for (int i = 0; i < totalVertices; i++) { + curScore[i] -= (influenceIndexSet.get(i) ? r : 0); + } + } + + // This is the teleportation part of the algorithm, and also what is modified to personalize the PageRank + private double teleProp() + { + double r = 0d; + for (int v = influenceIndexSet.nextSetBit(0); + v >= 0; + v = influenceIndexSet.nextSetBit(v + 1)) + { + if (outDegree[v] > 0) + r += (1d - dampingFactor); + else + r += curScore[v]; + } + return r / influenceSet.size(); + } + + private void swapScores() + { + double[] tmp = curScore; + curScore = nextScore; + nextScore = tmp; + } + + } + +} \ No newline at end of file diff --git a/code/features-index/domain-ranking/src/test/java/nu/marginalia/ranking/RankingAlgorithmWithRealDataTest.java b/code/features-index/domain-ranking/src/test/java/nu/marginalia/ranking/RankingAlgorithmWithRealDataTest.java new file mode 100644 index 00000000..cd80dede --- /dev/null +++ b/code/features-index/domain-ranking/src/test/java/nu/marginalia/ranking/RankingAlgorithmWithRealDataTest.java @@ -0,0 +1,75 @@ +package nu.marginalia.ranking; + +import 
nu.marginalia.ranking.accumulator.RankingResultListAccumulator; +import org.junit.jupiter.api.Test; + +import java.util.List; + +// Test the ranking algorithm with prod data. +class RankingAlgorithmWithRealDataTest { + + @Test + public void testRegularPR() { + if (!TestGraphSourceForLinkData.isAvailable()) { + return; + } + + var graphSource = new TestGraphSourceForLinkData(); + var results = new PageRankDomainRanker(graphSource, List.of()) + .calculate(10, RankingResultListAccumulator::new); + + for (int i = 0; i < results.size(); i++) { + System.out.println(i + " " + graphSource.getName(results.get(i))); + } + } + + @Test + public void testInvertedLinkGraph() { + if (!TestGraphSourceForInvertedLinkData.isAvailable()) { + return; + } + + var graphSource = new TestGraphSourceForInvertedLinkData(); + var results = new PageRankDomainRanker(graphSource, List.of()) + .calculate(10, RankingResultListAccumulator::new); + + for (int i = 0; i < results.size(); i++) { + System.out.println(i + " " + graphSource.getName(results.get(i))); + } + } + + @Test + public void testSimilarityPR() { + if (!TestGraphSourceForSimilarityData.isAvailable()) { + return; + } + + var graphSource = new TestGraphSourceForSimilarityData(); + var results = new PageRankDomainRanker(graphSource, List.of()) + .calculate(10, RankingResultListAccumulator::new); + + for (int i = 0; i < results.size(); i++) { + System.out.println(i + " " + graphSource.getName(results.get(i))); + } + } + + @Test + public void testSimilarityPPR() { + if (!TestGraphSourceForSimilarityData.isAvailable()) { + return; + } + + var graphSource = new TestGraphSourceForSimilarityData(); + var results = new PageRankDomainRanker(graphSource, + List.of(1476552) // wiby.me + ) + .calculate(10, RankingResultListAccumulator::new); + + for (int i = 0; i < results.size(); i++) { + System.out.println(i + " " + graphSource.getName(results.get(i))); + } + } + + + +} \ No newline at end of file diff --git 
a/code/features-index/domain-ranking/src/test/java/nu/marginalia/ranking/TestGraphSourceForInvertedLinkData.java b/code/features-index/domain-ranking/src/test/java/nu/marginalia/ranking/TestGraphSourceForInvertedLinkData.java new file mode 100644 index 00000000..e07cd176 --- /dev/null +++ b/code/features-index/domain-ranking/src/test/java/nu/marginalia/ranking/TestGraphSourceForInvertedLinkData.java @@ -0,0 +1,86 @@ +package nu.marginalia.ranking; + +import lombok.SneakyThrows; +import nu.marginalia.array.LongArrayFactory; +import nu.marginalia.ranking.data.GraphSource; +import org.apache.commons.lang3.StringUtils; +import org.jgrapht.Graph; +import org.jgrapht.graph.DefaultDirectedGraph; +import org.jgrapht.graph.DefaultEdge; + +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +public class TestGraphSourceForInvertedLinkData implements GraphSource { + private static Path domainDataPath = Paths.get("/home/vlofgren/Exports/Links/domains.export.tsv"); + private static Path[] linksDataPaths = new Path[] { + Paths.get("/home/vlofgren/Exports/Links/domain-links-1.dat"), + Paths.get("/home/vlofgren/Exports/Links/domain-links-2.dat"), + Paths.get("/home/vlofgren/Exports/Links/domain-links-3.dat"), + Paths.get("/home/vlofgren/Exports/Links/domain-links-4.dat"), + Paths.get("/home/vlofgren/Exports/Links/domain-links-5.dat"), + Paths.get("/home/vlofgren/Exports/Links/domain-links-6.dat"), + Paths.get("/home/vlofgren/Exports/Links/domain-links-7.dat"), + Paths.get("/home/vlofgren/Exports/Links/domain-links-8.dat"), + Paths.get("/home/vlofgren/Exports/Links/domain-links-9.dat") + }; + + public List domainIds(List domainNameList) { return List.of(); } + + static boolean isAvailable() { + return Files.exists(domainDataPath) && Files.exists(linksDataPaths[0]); + } + + private Map idToName = new HashMap<>(); + + public String getName(int id) { + return idToName.get(id); 
+ } + + @SneakyThrows + @Override + public Graph getGraph() { + Graph graph = new DefaultDirectedGraph<>(DefaultEdge.class); + idToName = new HashMap<>(); + + try (var stream = Files + .lines(domainDataPath)) { + + stream.skip(1) + .mapMultiToInt((line, c) -> { + String[] parts = StringUtils.split(line, '\t'); + int id = Integer.parseInt(parts[0]); + String name = parts[1]; + int node_affinity = Integer.parseInt(parts[3]); + if (node_affinity > 0) { + c.accept(id); + idToName.put(id, parts[1]); + } + }) + .forEach(graph::addVertex); + } + + for (var path : linksDataPaths) { + try (var data = LongArrayFactory.mmapForReadingConfined(path)) { + data.forEach(0, data.size(), (pos, val) -> { + + val = Long.reverseBytes(val); // data is in "java endian", LongArray is in "C endian" + + int src = (int) (val >>> 32); + int dest = (int) (val & 0xFFFF_FFFFL); + + if (graph.containsVertex(src) && graph.containsVertex(dest)) { + graph.addEdge(dest, src); + } + }); + } + } + + return graph; + } + +} diff --git a/code/features-index/domain-ranking/src/test/java/nu/marginalia/ranking/TestGraphSourceForLinkData.java b/code/features-index/domain-ranking/src/test/java/nu/marginalia/ranking/TestGraphSourceForLinkData.java new file mode 100644 index 00000000..e009f628 --- /dev/null +++ b/code/features-index/domain-ranking/src/test/java/nu/marginalia/ranking/TestGraphSourceForLinkData.java @@ -0,0 +1,86 @@ +package nu.marginalia.ranking; + +import lombok.SneakyThrows; +import nu.marginalia.array.LongArrayFactory; +import nu.marginalia.ranking.data.GraphSource; +import org.apache.commons.lang3.StringUtils; +import org.jgrapht.Graph; +import org.jgrapht.graph.DefaultDirectedGraph; +import org.jgrapht.graph.DefaultEdge; + +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +public class TestGraphSourceForLinkData implements GraphSource { + private static Path domainDataPath = 
Paths.get("/home/vlofgren/Exports/Links/domains.export.tsv"); + private static Path[] linksDataPaths = new Path[] { + Paths.get("/home/vlofgren/Exports/Links/domain-links-1.dat"), + Paths.get("/home/vlofgren/Exports/Links/domain-links-2.dat"), + Paths.get("/home/vlofgren/Exports/Links/domain-links-3.dat"), + Paths.get("/home/vlofgren/Exports/Links/domain-links-4.dat"), + Paths.get("/home/vlofgren/Exports/Links/domain-links-5.dat"), + Paths.get("/home/vlofgren/Exports/Links/domain-links-6.dat"), + Paths.get("/home/vlofgren/Exports/Links/domain-links-7.dat"), + Paths.get("/home/vlofgren/Exports/Links/domain-links-8.dat"), + Paths.get("/home/vlofgren/Exports/Links/domain-links-9.dat") + }; + + public List domainIds(List domainNameList) { return List.of(); } + + static boolean isAvailable() { + return Files.exists(domainDataPath) && Files.exists(linksDataPaths[0]); + } + + private Map idToName = new HashMap<>(); + + public String getName(int id) { + return idToName.get(id); + } + + @SneakyThrows + @Override + public Graph getGraph() { + Graph graph = new DefaultDirectedGraph<>(DefaultEdge.class); + idToName = new HashMap<>(); + + try (var stream = Files + .lines(domainDataPath)) { + + stream.skip(1) + .mapMultiToInt((line, c) -> { + String[] parts = StringUtils.split(line, '\t'); + int id = Integer.parseInt(parts[0]); + String name = parts[1]; + int node_affinity = Integer.parseInt(parts[3]); + if (node_affinity > 0) { + c.accept(id); + idToName.put(id, parts[1]); + } + }) + .forEach(graph::addVertex); + } + + for (var path : linksDataPaths) { + try (var data = LongArrayFactory.mmapForReadingConfined(path)) { + data.forEach(0, data.size(), (pos, val) -> { + + val = Long.reverseBytes(val); // data is in "java endian", LongArray is in "C endian" + + int src = (int) (val >>> 32); + int dest = (int) (val & 0xFFFF_FFFFL); + + if (graph.containsVertex(src) && graph.containsVertex(dest)) { + graph.addEdge(src, dest); + } + }); + } + } + + return graph; + } + +} diff --git 
a/code/features-index/domain-ranking/src/test/java/nu/marginalia/ranking/TestGraphSourceForSimilarityData.java b/code/features-index/domain-ranking/src/test/java/nu/marginalia/ranking/TestGraphSourceForSimilarityData.java new file mode 100644 index 00000000..47fd6e40 --- /dev/null +++ b/code/features-index/domain-ranking/src/test/java/nu/marginalia/ranking/TestGraphSourceForSimilarityData.java @@ -0,0 +1,78 @@ +package nu.marginalia.ranking; + +import lombok.SneakyThrows; +import nu.marginalia.array.LongArrayFactory; +import nu.marginalia.ranking.data.GraphSource; +import org.apache.commons.lang3.StringUtils; +import org.jgrapht.Graph; +import org.jgrapht.graph.DefaultDirectedGraph; +import org.jgrapht.graph.DefaultEdge; +import org.jgrapht.graph.DefaultUndirectedWeightedGraph; +import org.jgrapht.graph.DefaultWeightedEdge; + +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +public class TestGraphSourceForSimilarityData implements GraphSource { + private static Path domainDataPath = Paths.get("/home/vlofgren/Exports/Links/domains.export.tsv"); + private static Path similarityDataPath = Paths.get("/home/vlofgren/Exports/Links/neighbors.tsv"); + + public List domainIds(List domainNameList) { return List.of(); } + + static boolean isAvailable() { + return Files.exists(domainDataPath) && Files.exists(similarityDataPath); + } + + private Map idToName = new HashMap<>(); + + public String getName(int id) { + return idToName.get(id); + } + + @SneakyThrows + @Override + public Graph getGraph() { + Graph graph = new DefaultUndirectedWeightedGraph<>(DefaultWeightedEdge.class); + idToName = new HashMap<>(); + + try (var stream = Files + .lines(domainDataPath)) { + + stream.skip(1) + .mapMultiToInt((line, c) -> { + String[] parts = StringUtils.split(line, '\t'); + int id = Integer.parseInt(parts[0]); + String name = parts[1]; + int node_affinity = 
Integer.parseInt(parts[3]); + if (node_affinity > 0) { + c.accept(id); + idToName.put(id, name); + } + }) + .forEach(graph::addVertex); + } + + try (var stream = Files + .lines(similarityDataPath)) { + + stream.skip(1) + .forEach(line -> { + String[] parts = StringUtils.split(line, '\t'); + int src = Integer.parseInt(parts[0]); + int dest = Integer.parseInt(parts[1]); + double weight = Double.parseDouble(parts[2]); + if (graph.containsVertex(src) && graph.containsVertex(dest)) { + graph.addEdge(src, dest); + graph.setEdgeWeight(src, dest, weight); + } + }); + } + + return graph; + } + +} diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexSearchSetsService.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexSearchSetsService.java index 958421ed..690a60bc 100644 --- a/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexSearchSetsService.java +++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexSearchSetsService.java @@ -8,17 +8,15 @@ import nu.marginalia.db.DomainRankingSetsService; import nu.marginalia.db.DomainTypes; import nu.marginalia.index.IndexServicesFactory; import nu.marginalia.index.searchset.SearchSet; -import nu.marginalia.ranking.RankingAlgorithm; -import nu.marginalia.ranking.ReversePageRank; -import nu.marginalia.ranking.StandardPageRank; +import nu.marginalia.ranking.*; import nu.marginalia.ranking.accumulator.RankingResultHashMapAccumulator; import nu.marginalia.ranking.accumulator.RankingResultHashSetAccumulator; -import nu.marginalia.ranking.data.RankingDomainFetcher; -import nu.marginalia.ranking.data.RankingDomainFetcherForSimilarityData; import nu.marginalia.index.svc.searchset.RankingSearchSet; import nu.marginalia.index.svc.searchset.SearchSetAny; -import nu.marginalia.ranking.DomainRankings; import nu.marginalia.index.db.DbUpdateRanks; +import nu.marginalia.ranking.data.GraphSource; +import 
nu.marginalia.ranking.data.LinkGraphSource; +import nu.marginalia.ranking.data.SimilarityGraphSource; import nu.marginalia.service.control.ServiceEventLog; import nu.marginalia.service.control.ServiceHeartbeat; import nu.marginalia.service.module.ServiceConfiguration; @@ -27,6 +25,7 @@ import org.slf4j.LoggerFactory; import java.io.IOException; import java.sql.SQLException; +import java.util.List; import java.util.Objects; import java.util.concurrent.ConcurrentHashMap; @@ -34,13 +33,12 @@ import java.util.concurrent.ConcurrentHashMap; public class IndexSearchSetsService { private final Logger logger = LoggerFactory.getLogger(getClass()); private final DomainTypes domainTypes; - private final ServiceHeartbeat heartbeat; private final IndexServicesFactory indexServicesFactory; private final ServiceEventLog eventLog; private final DomainRankingSetsService domainRankingSetsService; private final DbUpdateRanks dbUpdateRanks; - private final RankingDomainFetcher similarityDomains; - private final RankingDomainFetcher linksDomains; + private final GraphSource similarityDomains; + private final GraphSource linksDomains; private final ConcurrentHashMap rankingSets = new ConcurrentHashMap<>(); // Below are binary indices that are used to constrain a search @@ -55,23 +53,21 @@ public class IndexSearchSetsService { @Inject public IndexSearchSetsService(DomainTypes domainTypes, ServiceConfiguration serviceConfiguration, - ServiceHeartbeat heartbeat, - RankingDomainFetcher rankingDomains, - RankingDomainFetcherForSimilarityData similarityDomains, + LinkGraphSource rankingDomains, + SimilarityGraphSource similarityDomains, IndexServicesFactory indexServicesFactory, ServiceEventLog eventLog, DomainRankingSetsService domainRankingSetsService, DbUpdateRanks dbUpdateRanks) throws IOException { this.nodeId = serviceConfiguration.node(); this.domainTypes = domainTypes; - this.heartbeat = heartbeat; this.indexServicesFactory = indexServicesFactory; this.eventLog = eventLog; 
this.domainRankingSetsService = domainRankingSetsService; this.dbUpdateRanks = dbUpdateRanks; - if (similarityDomains.hasData()) { + if (similarityDomains.isAvailable()) { this.similarityDomains = similarityDomains; this.linksDomains = rankingDomains; } @@ -145,15 +141,14 @@ public class IndexSearchSetsService { private void recalculateNornal(DomainRankingSetsService.DomainRankingSet rankingSet) { String[] domains = rankingSet.domains(); - RankingAlgorithm rankingAlgorithm = switch (rankingSet.algorithm()) { - case LINKS_PAGERANK -> new StandardPageRank(linksDomains, domains); - case LINKS_CHEIRANK -> new ReversePageRank(linksDomains, domains); - case ADJACENCY_PAGERANK -> new StandardPageRank(similarityDomains, domains); - case ADJACENCY_CHEIRANK -> new ReversePageRank(similarityDomains, domains); + GraphSource graphSource = switch (rankingSet.algorithm()) { + case LINKS_PAGERANK, LINKS_CHEIRANK -> linksDomains; + case ADJACENCY_PAGERANK, ADJACENCY_CHEIRANK -> similarityDomains; default -> throw new IllegalStateException("Unexpected value: " + rankingSet.algorithm()); }; - var data = rankingAlgorithm.pageRankWithPeripheralNodes(rankingSet.depth(), RankingResultHashSetAccumulator::new); + var data = new PageRankDomainRanker(linksDomains, linksDomains.domainIds(List.of(domains))) + .calculate(rankingSet.depth(), RankingResultHashSetAccumulator::new); var set = new RankingSearchSet(rankingSet.name(), rankingSet.fileName(indexServicesFactory.getSearchSetsBase()), data); rankingSets.put(rankingSet.name(), set); @@ -186,8 +181,8 @@ public class IndexSearchSetsService { private void updateDomainRankings(DomainRankingSetsService.DomainRankingSet rankingSet) { - var spr = new StandardPageRank(similarityDomains, rankingSet.domains()); - var ranks = spr.pageRankWithPeripheralNodes(rankingSet.depth(), () -> new RankingResultHashMapAccumulator(rankingSet.depth())); + var ranks = new PageRankDomainRanker(similarityDomains, 
similarityDomains.domainIds(List.of(rankingSet.domains()))) + .calculate(rankingSet.depth(), () -> new RankingResultHashMapAccumulator(rankingSet.depth())); synchronized (this) { domainRankings = new DomainRankings(ranks);