Merge pull request #80 from MarginaliaSearch/ranking-algorithms

Clean up domain ranking code
This commit is contained in:
Viktor 2024-02-18 09:52:34 +01:00 committed by GitHub
commit d05c916491
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
28 changed files with 1267 additions and 754 deletions

View File

@ -25,7 +25,7 @@ public class DomainRankingSetsService {
public Optional<DomainRankingSet> get(String name) throws SQLException {
try (var conn = dataSource.getConnection();
var stmt = conn.prepareStatement("""
SELECT NAME, DESCRIPTION, ALGORITHM, DEPTH, DEFINITION
SELECT NAME, DESCRIPTION, DEPTH, DEFINITION
FROM CONF_DOMAIN_RANKING_SET
WHERE NAME = ?
""")) {
@ -39,7 +39,6 @@ public class DomainRankingSetsService {
return Optional.of(new DomainRankingSet(
rs.getString("NAME"),
rs.getString("DESCRIPTION"),
DomainSetAlgorithm.valueOf(rs.getString("ALGORITHM")),
rs.getInt("DEPTH"),
rs.getString("DEFINITION")
));
@ -53,15 +52,14 @@ public class DomainRankingSetsService {
public void upsert(DomainRankingSet domainRankingSet) {
try (var conn = dataSource.getConnection();
var stmt = conn.prepareStatement("""
REPLACE INTO CONF_DOMAIN_RANKING_SET(NAME, DESCRIPTION, ALGORITHM, DEPTH, DEFINITION)
VALUES (?, ?, ?, ?, ?)
REPLACE INTO CONF_DOMAIN_RANKING_SET(NAME, DESCRIPTION, DEPTH, DEFINITION)
VALUES (?, ?, ?, ?)
"""))
{
stmt.setString(1, domainRankingSet.name());
stmt.setString(2, domainRankingSet.description());
stmt.setString(3, domainRankingSet.algorithm().name());
stmt.setInt(4, domainRankingSet.depth());
stmt.setString(5, domainRankingSet.definition());
stmt.setInt(3, domainRankingSet.depth());
stmt.setString(4, domainRankingSet.definition());
stmt.executeUpdate();
if (!conn.getAutoCommit())
@ -94,7 +92,7 @@ public class DomainRankingSetsService {
try (var conn = dataSource.getConnection();
var stmt = conn.prepareStatement("""
SELECT NAME, DESCRIPTION, ALGORITHM, DEPTH, DEFINITION
SELECT NAME, DESCRIPTION, DEPTH, DEFINITION
FROM CONF_DOMAIN_RANKING_SET
""")) {
var rs = stmt.executeQuery();
@ -105,7 +103,6 @@ public class DomainRankingSetsService {
new DomainRankingSet(
rs.getString("NAME"),
rs.getString("DESCRIPTION"),
DomainSetAlgorithm.valueOf(rs.getString("ALGORITHM")),
rs.getInt("DEPTH"),
rs.getString("DEFINITION"))
);
@ -118,31 +115,17 @@ public class DomainRankingSetsService {
}
}
public enum DomainSetAlgorithm {
/** Use link graph, do a pagerank */
LINKS_PAGERANK,
/** Use link graph, do a cheirank */
LINKS_CHEIRANK,
/** Use adjacency graph, do a pagerank */
ADJACENCY_PAGERANK,
/** Use adjacency graph, do a cheirank */
ADJACENCY_CHEIRANK,
/** For reserved names. Use special algorithm, function of name */
SPECIAL
};
/** Defines a domain ranking set, parameters for the ranking algorithms.
*
* @param name Key and name of the set
* @param description Human-readable description
* @param algorithm Algorithm to use
* @param depth Depth of the algorithm
* @param definition Definition of the set, typically a list of domains or globs for domain-names
* */
@With
public record DomainRankingSet(String name,
String description,
DomainSetAlgorithm algorithm,
int depth,
String definition)
{
@ -159,7 +142,7 @@ public class DomainRankingSetsService {
}
public boolean isSpecial() {
return algorithm() == DomainSetAlgorithm.SPECIAL;
return name().equals("BLOGS") || name().equals("NONE") || name().equals("RANK");
}
}

View File

@ -0,0 +1 @@
ALTER TABLE CONF_DOMAIN_RANKING_SET DROP COLUMN ALGORITHM;

View File

@ -56,14 +56,12 @@ class DomainRankingSetsServiceTest {
var newValue = new DomainRankingSetsService.DomainRankingSet(
"test",
"Test domain set",
DomainRankingSetsService.DomainSetAlgorithm.ADJACENCY_CHEIRANK,
10,
"test\\.nu"
);
var newValue2 = new DomainRankingSetsService.DomainRankingSet(
"test2",
"Test domain set 2",
DomainRankingSetsService.DomainSetAlgorithm.LINKS_PAGERANK,
20,
"test\\.nu 2"
);

View File

@ -20,6 +20,8 @@ dependencies {
implementation project(':code:common:service-client')
implementation project(':code:api:query-api')
implementation 'org.jgrapht:jgrapht-core:1.5.2'
implementation libs.bundles.slf4j
implementation libs.bundles.mariadb
implementation libs.guice
@ -27,8 +29,17 @@ dependencies {
implementation libs.roaringbitmap
implementation libs.trove
implementation libs.fastutil
implementation libs.hll
testImplementation project(':code:libraries:array')
testImplementation libs.commons.lang3
testImplementation libs.bundles.slf4j.test
testImplementation libs.bundles.junit
testImplementation libs.mockito
testImplementation platform('org.testcontainers:testcontainers-bom:1.17.4')
testImplementation 'org.testcontainers:mariadb:1.17.4'
testImplementation 'org.testcontainers:junit-jupiter:1.17.4'
testImplementation project(':code:libraries:test-helpers')
}

View File

@ -1,19 +1,34 @@
# Domain Ranking
Contains domain ranking algorithms.
Contains domain ranking algorithms. The domain ranking algorithms are based on
the JGraphT library.
Two principal algorithms are available: the standard PageRank algorithm
and Personalized PageRank. Each is available for two graphs: the link graph,
and a similarity graph where each edge corresponds to the similarity between
the sets of incident links to two domains, with their cosine similarity acting as
the weight of the edge.
With the standard PageRank algorithm, the similarity graph does not produce
anything useful, but something magical happens when you apply Personalized PageRank
to this graph. It turns into a very good "vibe"-sensitive ranking algorithm.
It's unclear if this is a well-known result, but it's a very interesting one
for creating a ranking algorithm that is focused on a particular segment of the web.
## Central Classes
### Algorithms
* [RankingAlgorithm](src/main/java/nu/marginalia/ranking/RankingAlgorithm.java)
* [StandardPageRank](src/main/java/nu/marginalia/ranking/StandardPageRank.java)
* [ReversePageRank](src/main/java/nu/marginalia/ranking/ReversePageRank.java) "CheiRank"
* [PageRankDomainRanker](src/main/java/nu/marginalia/ranking/PageRankDomainRanker.java) - Ranks domains using the
PageRank or Personalized PageRank algorithm depending on whether a list of influence domains is provided.
### Data sources
* [RankingDomainFetcher](src/main/java/nu/marginalia/ranking/data/RankingDomainFetcher.java) fetches link data.
* [RankingDomainFetcherForSimilarityData](src/main/java/nu/marginalia/ranking/data/RankingDomainFetcherForSimilarityData.java) fetches website similarity data.
* [LinkGraphSource](src/main/java/nu/marginalia/ranking/data/LinkGraphSource.java) - fetches the link graph
* [InvertedLinkGraphSource](src/main/java/nu/marginalia/ranking/data/InvertedLinkGraphSource.java) - fetches the inverted link graph
* [SimilarityGraphSource](src/main/java/nu/marginalia/ranking/data/SimilarityGraphSource.java) - fetches the similarity graph from the database
Note that the similarity graph needs to be precomputed and stored in the database for
the similarity graph source to be available.
## See Also

View File

@ -0,0 +1,60 @@
package nu.marginalia.ranking;
import gnu.trove.list.TIntList;
import gnu.trove.list.array.TIntArrayList;
import nu.marginalia.ranking.accumulator.RankingResultAccumulator;
import nu.marginalia.ranking.data.GraphSource;
import nu.marginalia.ranking.jgrapht.PersonalizedPageRank;
import org.jgrapht.Graph;
import org.jgrapht.alg.interfaces.VertexScoringAlgorithm;
import org.jgrapht.alg.scoring.PageRank;
import java.util.Comparator;
import java.util.List;
import java.util.Map;
import java.util.function.Supplier;
/** Ranks domains by scoring the vertices of a graph obtained from a {@link GraphSource}.
 * <p>
 * If a non-empty influence set is supplied, the scores are computed with
 * {@link PersonalizedPageRank}; otherwise JGraphT's regular {@link PageRank} is used.
 */
public class PageRankDomainRanker implements RankingAlgorithm {
    private final List<Integer> influenceSet;
    private final Graph<Integer, ?> graph;

    /**
     * @param source       provider of the graph; {@code getGraph()} is invoked once, here
     * @param influenceSet domain ids to personalize the ranking with; may be null or empty,
     *                     in which case the standard algorithm is used
     */
    public PageRankDomainRanker(GraphSource source,
                                List<Integer> influenceSet)
    {
        this.influenceSet = influenceSet;
        this.graph = source.getGraph();
    }

    /** Creates a ranker whose influence set is given as domain names (or patterns),
     * which are translated into domain ids via {@link GraphSource#domainIds(List)}.
     */
    public static PageRankDomainRanker forDomainNames(GraphSource source,
                                                      List<String> influenceSet)
    {
        List<Integer> influenceIds = source.domainIds(influenceSet);
        return new PageRankDomainRanker(source, influenceIds);
    }

    /** Scores every vertex, then feeds the {@code resultCount} best-scoring domain ids to a
     * freshly supplied accumulator together with their position (0 = highest score).
     *
     * @return the value produced by the accumulator's {@code get()}
     */
    @Override
    public <T> T calculate(int resultCount, Supplier<RankingResultAccumulator<T>> accumulatorP) {
        VertexScoringAlgorithm<Integer, Double> scorer = createScorer();

        TIntList topDomains = new TIntArrayList(resultCount);

        // Negating the score makes the ascending sort yield the highest scores first
        Comparator<Map.Entry<Integer, Double>> highestScoreFirst =
                Comparator.comparingDouble(entry -> -entry.getValue());

        scorer.getScores()
                .entrySet()
                .stream()
                .sorted(highestScoreFirst)
                .limit(resultCount)
                .mapToInt(Map.Entry::getKey)
                .forEach(topDomains::add);

        RankingResultAccumulator<T> accumulator = accumulatorP.get();

        int position = 0;
        while (position < topDomains.size()) {
            accumulator.add(topDomains.get(position), position);
            position++;
        }

        return accumulator.get();
    }

    /** Picks the personalized algorithm when there is an influence set, else plain PageRank. */
    private VertexScoringAlgorithm<Integer, Double> createScorer() {
        boolean personalized = influenceSet != null && !influenceSet.isEmpty();

        if (personalized) {
            return new PersonalizedPageRank<>(graph, influenceSet);
        }
        return new PageRank<>(graph);
    }
}

View File

@ -1,281 +1,15 @@
package nu.marginalia.ranking;
import gnu.trove.list.array.TIntArrayList;
import gnu.trove.map.hash.TIntIntHashMap;
import gnu.trove.map.hash.TIntObjectHashMap;
import it.unimi.dsi.fastutil.ints.IntArrays;
import nu.marginalia.ranking.accumulator.RankingResultAccumulator;
import nu.marginalia.ranking.data.RankingDomainFetcher;
import nu.marginalia.ranking.data.RankingDomainData;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
import java.util.function.Supplier;
import static java.lang.Math.min;
public abstract class RankingAlgorithm {
protected final TIntObjectHashMap<RankingDomainData> domainsById = new TIntObjectHashMap<>();
protected final TIntIntHashMap domainIndexToId = new TIntIntHashMap();
protected final TIntIntHashMap domainIdToIndex = new TIntIntHashMap();
protected TIntArrayList[] linkDataSrc2Dest;
protected TIntArrayList[] linkDataDest2Src;
public final Set<String> originDomains = new HashSet<>();
public final Set<Integer> originDomainIds = new HashSet<>();
private int maxKnownUrls = Integer.MAX_VALUE;
private final Logger logger = LoggerFactory.getLogger(getClass());
private final RankingDomainFetcher domains;
public RankingAlgorithm(RankingDomainFetcher domains, String... origins) {
this.domains = domains;
originDomains.addAll(Arrays.asList(origins));
domains.getDomains(domainData -> {
int id = domainData.id;
domainsById.put(id, domainData);
domainIndexToId.put(domainIndexToId.size(), id);
domainIdToIndex.put(id, domainIdToIndex.size());
});
linkDataSrc2Dest = new TIntArrayList[domainIndexToId.size()];
linkDataDest2Src = new TIntArrayList[domainIndexToId.size()];
domains.eachDomainLink((src, dst) -> {
if (src == dst) return;
if (domainsById.contains(src) && domainsById.contains(dst)) {
int srcIdx = domainIdToIndex.get(src);
int dstIdx = domainIdToIndex.get(domainsById.get(dst).resolveAlias());
if (linkDataSrc2Dest[srcIdx] == null) {
linkDataSrc2Dest[srcIdx] = new TIntArrayList();
}
linkDataSrc2Dest[srcIdx].add(dstIdx);
if (linkDataDest2Src[dstIdx] == null) {
linkDataDest2Src[dstIdx] = new TIntArrayList();
}
linkDataDest2Src[dstIdx].add(srcIdx);
}
});
for (var namePattern : this.originDomains) {
domains.domainsByPattern(namePattern, i -> {
int ival = domainIdToIndex.get(i);
if (ival != domainIdToIndex.getNoEntryValue() || domainIndexToId.get(0) == i) {
originDomainIds.add(ival);
}
else {
logger.debug("No value for {}", i);
}
});
}
logger.info("Origin Domains: {}", originDomainIds.size());
}
public RankingDomainData getDomainData(int id) {
return domainsById.get(id);
}
public void addPeripheralNodes() {
int newNodesIdxCutoff = domainIdToIndex.size();
logger.info("Inserting peripheral nodes");
domains.getPeripheralDomains(domainData -> {
int id = domainData.id;
if (domainsById.put(id, domainData) == null) { // true if id was not already present
domainIndexToId.put(domainIndexToId.size(), id);
domainIdToIndex.put(id, domainIdToIndex.size());
}
});
linkDataSrc2Dest = Arrays.copyOf(linkDataSrc2Dest, domainIndexToId.size());
linkDataDest2Src = Arrays.copyOf(linkDataDest2Src, domainIndexToId.size());
domains.eachDomainLink((src, dst) -> {
if (src == dst) return;
if (domainsById.contains(src) && domainsById.contains(dst)) {
int srcIdx = domainIdToIndex.get(src);
int dstIdx = domainIdToIndex.get(domainsById.get(dst).resolveAlias());
// This looks like a bug, but it improves the results
if (srcIdx < newNodesIdxCutoff || dstIdx < newNodesIdxCutoff)
return;
if (linkDataSrc2Dest[srcIdx] == null) {
linkDataSrc2Dest[srcIdx] = new TIntArrayList();
}
linkDataSrc2Dest[srcIdx].add(dstIdx);
if (linkDataDest2Src[dstIdx] == null) {
linkDataDest2Src[dstIdx] = new TIntArrayList();
}
linkDataDest2Src[dstIdx].add(srcIdx);
}
});
logger.info("Peripheral nodes inserted {} -> {}", newNodesIdxCutoff, domainIdToIndex.size());
}
public int size() {
return domainsById.size();
}
public <T> T pageRank(int resultCount, Supplier<RankingResultAccumulator<T>> accumulatorP) {
RankVector rank = new RankVector(1.d / domainsById.size());
int iter_max = 100;
for (int i = 0; i < iter_max; i++) {
RankVector newRank = createNewRankVector(rank);
double oldNorm = rank.norm();
double newNorm = newRank.norm();
double dNorm = oldNorm - newNorm;
if (i < iter_max-1) {
adjustRankVector(newRank, dNorm, oldNorm);
}
rank = newRank;
}
return rank.getRanking(resultCount, accumulatorP).get();
}
public <T> T pageRankWithPeripheralNodes(int resultCount, Supplier<RankingResultAccumulator<T>> accumulatorP) {
RankVector rank = new RankVector(1.d / domainsById.size());
int iter_max = 100;
for (int i = 0; i < iter_max; i++) {
if (i == iter_max-1) {
addPeripheralNodes();
}
RankVector newRank = createNewRankVector(rank);
double oldNorm = rank.norm();
double newNorm = newRank.norm();
double dNorm = oldNorm - newNorm;
if (i < iter_max-1) {
adjustRankVector(newRank, dNorm, oldNorm);
}
rank = newRank;
}
logger.info("PRWPN iteration done");
return rank.getRanking(resultCount, accumulatorP).get();
}
abstract void adjustRankVector(RankVector vector, double dNorm, double oldNorm);
abstract RankVector createNewRankVector(RankVector rank);
public boolean includeInRanking(RankingDomainData data) {
if (data.isAlias())
return false;
if (data.isSpecial())
return false;
if (data.isSocialMedia())
return false;
if (data.knownUrls > maxKnownUrls)
return false;
return true;
}
public void setMaxKnownUrls(int maxKnownUrls) {
this.maxKnownUrls = maxKnownUrls;
}
public class RankVector {
private final double[] rank;
public RankVector(double defaultValue) {
rank = new double[domainIndexToId.size()];
if (defaultValue != 0.) {
Arrays.fill(rank, defaultValue);
}
}
public void set(int id, double value) {
rank[id] = value;
}
public void increment(int id, double value) {
rank[id] += value;
}
public double get(int id) {
if (id >= rank.length) return 0.;
return rank[id];
}
public double norm() {
double v = 0.;
for (double value : rank) {
v += Math.abs(value);
}
return v;
}
public double norm(RankVector other) {
double v = 0.;
for (int i = 0; i < rank.length; i++) {
v += Math.abs(rank[i] - other.get(i));
}
return v;
}
public <T> RankingResultAccumulator<T> getRanking(int numResults, Supplier<RankingResultAccumulator<T>> accumulatorP) {
if (numResults <= 0) {
numResults = domainIdToIndex.size();
}
numResults = min(numResults, min(domainIdToIndex.size(), rank.length));
int[] nodes = sortOrder(rank);
var accumulator = accumulatorP.get();
for (int i = 0; i < numResults; i++) {
int id = domainIndexToId.get(nodes[i]);
if (includeInRanking(domainsById.get(id)))
accumulator.add(id, i);
}
return accumulator;
}
private static int[] sortOrder(double[] values) {
int[] ret = new int[values.length];
Arrays.setAll(ret, i->i);
IntArrays.quickSort(ret, (i,j) -> (int) Math.signum(values[j] - values[i]));
return ret;
}
}
/** A domain ranking algorithm that reports its result through a caller-supplied accumulator. */
public interface RankingAlgorithm {
/** Calculate domain rankings.
*
* @param resultCount the number of best-ranked results to report
* @param accumulatorP supplier of the accumulator used to store the results
* @param <T> the type of result produced by the accumulator
* @return the calculated result; in PageRankDomainRanker this is the value
*         returned by the accumulator's get() method
*/
<T> T calculate(int resultCount, Supplier<RankingResultAccumulator<T>> accumulatorP);
}

View File

@ -1,42 +0,0 @@
package nu.marginalia.ranking;
import nu.marginalia.ranking.data.RankingDomainFetcher;
/** PageRank variant that propagates rank against the link direction:
 * a domain receives rank from the domains it links to.
 */
public class ReversePageRank extends RankingAlgorithm {

    public ReversePageRank(RankingDomainFetcher domains, String... origins) {
        super(domains, origins);
    }

    /** Performs one iteration, producing a new vector from {@code rank}. Each contribution is
     * divided by the number of inbound links of the linked domain, and the damped sum
     * is normalized by the norm of the previous vector.
     */
    @Override
    RankVector createNewRankVector(RankVector rank) {
        final double previousNorm = rank.norm();
        final RankVector updated = new RankVector(0);
        final int domainCount = domainIndexToId.size();

        for (int idx = 0; idx < domainCount; idx++) {
            double accumulated = outboundContribution(rank, idx);
            updated.set(idx, 0.85*accumulated/previousNorm);
        }

        return updated;
    }

    /** Sums the rank of every domain that {@code idx} links to, each divided by
     * the number of domains linking to that target.
     */
    private double outboundContribution(RankVector rank, int idx) {
        var outbound = linkDataSrc2Dest[idx];
        double sum = 0;

        if (outbound == null) {
            return sum;
        }

        for (int k = 0; k < outbound.size(); k++) {
            int target = outbound.getQuick(k);
            var inboundOfTarget = linkDataDest2Src[target];

            sum += rank.get(target) / inboundOfTarget.size();
        }

        return sum;
    }

    /** Distributes a total weight of 1.0 evenly over the origin domains. */
    @Override
    void adjustRankVector(RankVector vector, double dNorm, double oldNorm) {
        for (int originId : originDomainIds) {
            vector.increment(originId, 1.0 / originDomainIds.size());
        }
    }
}

View File

@ -1,50 +0,0 @@
package nu.marginalia.ranking;
import nu.marginalia.ranking.data.RankingDomainFetcher;
/** PageRank iteration where a domain receives rank from the domains linking to it. */
public class StandardPageRank extends RankingAlgorithm {

    public StandardPageRank(RankingDomainFetcher domains, String... origins) {
        super(domains, origins);
    }

    /** Performs one iteration: every domain gets 0.85 times the sum of the rank
     * of each inbound neighbor divided by that neighbor's outbound link count.
     */
    @Override
    RankVector createNewRankVector(RankVector rank) {
        final RankVector updated = new RankVector(0);
        final int domainCount = domainIndexToId.size();

        for (int idx = 0; idx < domainCount; idx++) {
            updated.set(idx, 0.85 * inboundContribution(rank, idx));
        }

        return updated;
    }

    /** Sums the rank flowing into {@code idx} from each domain that links to it. */
    private double inboundContribution(RankVector rank, int idx) {
        var inbound = linkDataDest2Src[idx];
        double sum = 0;

        if (inbound == null) {
            return sum;
        }

        for (int k = 0; k < inbound.size(); k++) {
            int linkingDomain = inbound.getQuick(k);
            var outboundOfLinker = linkDataSrc2Dest[linkingDomain];

            // A missing outbound list is treated as a single link
            int outDegree = (outboundOfLinker == null) ? 1 : outboundOfLinker.size();

            sum += rank.get(linkingDomain) / outDegree;
        }

        return sum;
    }

    /** Distributes a total weight of 0.15 evenly over the origin domains. */
    @Override
    void adjustRankVector(RankVector vector, double dNorm, double oldNorm) {
        for (int originId : originDomainIds) {
            vector.increment(originId, 0.15 / originDomainIds.size() );
        }
    }
}

View File

@ -0,0 +1,63 @@
package nu.marginalia.ranking.data;
import com.zaxxer.hikari.HikariDataSource;
import org.jgrapht.Graph;
import java.sql.SQLException;
import java.util.*;
/** Base class for graph sources backed by the database; provides the shared
 * vertex loading and the domain-name lookup, leaving graph construction to subclasses.
 */
public abstract class AbstractGraphSource implements GraphSource {
    protected final HikariDataSource dataSource;

    protected AbstractGraphSource(HikariDataSource dataSource) {
        this.dataSource = dataSource;
    }

    @Override
    public abstract Graph<Integer, ?> getGraph();

    /** Adds the id of every EC_DOMAIN row with NODE_AFFINITY &gt; 0 as a vertex of the graph.
     *
     * @throws SQLException if the query fails
     */
    protected void addVertices(Graph<Integer, ?> graph) throws SQLException {
        try (var connection = dataSource.getConnection();
             var query = connection.prepareStatement("""
                     SELECT ID
                     FROM EC_DOMAIN
                     WHERE NODE_AFFINITY > 0
                     """);
             var rows = query.executeQuery())
        {
            while (rows.next()) {
                int domainId = rows.getInt(1);
                graph.addVertex(domainId);
            }
        }
    }

    /** Looks up the ids of all domains whose name matches any of the given
     * names, each of which is used as a SQL LIKE pattern.
     *
     * @return the distinct matching ids in ascending order
     * @throws RuntimeException wrapping any SQLException raised by the lookup
     */
    @Override
    public List<Integer> domainIds(List<String> domainNameList) {
        // A sorted set both de-duplicates overlapping matches and orders the ids
        SortedSet<Integer> matchingIds = new TreeSet<>();

        try (var connection = dataSource.getConnection();
             var query = connection.prepareStatement("""
                     SELECT ID
                     FROM EC_DOMAIN
                     WHERE DOMAIN_NAME LIKE ?
                     """))
        {
            for (String namePattern : domainNameList) {
                query.setString(1, namePattern);

                try (var rows = query.executeQuery()) {
                    while (rows.next()) {
                        matchingIds.add(rows.getInt(1));
                    }
                }
            }
        }
        catch (SQLException ex) {
            throw new RuntimeException(ex);
        }

        return new ArrayList<>(matchingIds);
    }
}

View File

@ -0,0 +1,23 @@
package nu.marginalia.ranking.data;
import org.jgrapht.Graph;
import java.util.List;
/** A source for the link graph (or pseudo-link graph)
 * to use when ranking domains. Vertices are integer domain ids. */
public interface GraphSource {
/** Construct the graph.
 *
 * @return a graph whose vertices are domain ids
 */
Graph<Integer, ?> getGraph();
/** Return a list of domain ids for the given domain names.
 * The function will also accept SQL-style wildcards,
 * e.g. "%marginalia.nu" will match "marginalia.nu" and "memex.marginalia.nu".
 * <p>
 * If multiple wildcards are provided, and overlapping domains are matched,
 * they will be included only once. The returned list will be sorted in
 * numerical order of the domain IDs.
 *
 * @param domainNameList domain names or SQL-style wildcard patterns
 * @return the distinct ids of the matching domains, in ascending order
 */
List<Integer> domainIds(List<String> domainNameList);
}

View File

@ -0,0 +1,49 @@
package nu.marginalia.ranking.data;
import com.google.inject.Inject;
import com.zaxxer.hikari.HikariDataSource;
import lombok.SneakyThrows;
import nu.marginalia.query.client.QueryClient;
import org.jgrapht.Graph;
import org.jgrapht.graph.DefaultDirectedGraph;
import org.jgrapht.graph.DefaultEdge;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.List;
/** A source for the inverted link graph,
 * which is the same as the regular link graph except
 * that the direction of every link has been inverted. */
public class InvertedLinkGraphSource extends AbstractGraphSource {
    private final QueryClient queryClient;

    @Inject
    public InvertedLinkGraphSource(HikariDataSource dataSource, QueryClient queryClient) {
        super(dataSource);
        this.queryClient = queryClient;
    }

    /** Builds a directed graph with one edge dest -&gt; source for every domain link
     * reported by the query client, skipping links where either end is not a vertex.
     */
    @SneakyThrows
    @Override
    public Graph<Integer, ?> getGraph() {
        Graph<Integer, ?> invertedGraph = new DefaultDirectedGraph<>(DefaultEdge.class);

        addVertices(invertedGraph);

        for (var links = queryClient.getAllDomainLinks().iterator(); links.advance(); ) {
            boolean bothEndsKnown = invertedGraph.containsVertex(links.dest())
                                 && invertedGraph.containsVertex(links.source());

            if (bothEndsKnown) {
                // Edge points from the link's destination back to its source
                invertedGraph.addEdge(links.dest(), links.source());
            }
        }

        return invertedGraph;
    }
}

View File

@ -0,0 +1,43 @@
package nu.marginalia.ranking.data;
import com.google.inject.Inject;
import com.zaxxer.hikari.HikariDataSource;
import lombok.SneakyThrows;
import nu.marginalia.query.client.QueryClient;
import org.jgrapht.Graph;
import org.jgrapht.graph.DefaultDirectedGraph;
import org.jgrapht.graph.DefaultEdge;
/** A source for the regular link graph, where edges follow the direction of the links. */
public class LinkGraphSource extends AbstractGraphSource {
    private final QueryClient queryClient;

    @Inject
    public LinkGraphSource(HikariDataSource dataSource, QueryClient queryClient) {
        super(dataSource);
        this.queryClient = queryClient;
    }

    /** Builds a directed graph with one edge source -&gt; dest for every domain link
     * reported by the query client, skipping links where either end is not a vertex.
     */
    @SneakyThrows
    @Override
    public Graph<Integer, ?> getGraph() {
        Graph<Integer, ?> linkGraph = new DefaultDirectedGraph<>(DefaultEdge.class);

        addVertices(linkGraph);

        for (var links = queryClient.getAllDomainLinks().iterator(); links.advance(); ) {
            boolean bothEndsKnown = linkGraph.containsVertex(links.dest())
                                 && linkGraph.containsVertex(links.source());

            if (bothEndsKnown) {
                linkGraph.addEdge(links.source(), links.dest());
            }
        }

        return linkGraph;
    }
}

View File

@ -1,32 +0,0 @@
package nu.marginalia.ranking.data;
import lombok.AllArgsConstructor;
import lombok.Data;
import nu.marginalia.model.crawl.DomainIndexingState;
/** Per-domain data used by the ranking algorithms. */
@Data
@AllArgsConstructor
public class RankingDomainData {
    public final int id;
    public final String name;
    private int alias;
    public DomainIndexingState state;
    public final int knownUrls;

    /** Returns the alias if one is set (non-zero), otherwise the domain's own id. */
    public int resolveAlias() {
        return (alias != 0) ? alias : id;
    }

    /** True when an alias is set, i.e. the alias field is non-zero. */
    public boolean isAlias() {
        return 0 != alias;
    }

    /** True when the indexing state is SPECIAL. */
    public boolean isSpecial() {
        return state == DomainIndexingState.SPECIAL;
    }

    /** True when the indexing state is SOCIAL_MEDIA. */
    public boolean isSocialMedia() {
        return state == DomainIndexingState.SOCIAL_MEDIA;
    }
}

View File

@ -1,138 +0,0 @@
package nu.marginalia.ranking.data;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import com.zaxxer.hikari.HikariDataSource;
import nu.marginalia.db.DomainBlacklistImpl;
import nu.marginalia.model.crawl.DomainIndexingState;
import nu.marginalia.query.client.QueryClient;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.sql.SQLException;
import java.util.function.Consumer;
import java.util.function.IntConsumer;
/** Fetches domain and link data for the ranking algorithms from the database
 * and the query client. Domains flagged by the blacklist are not reported.
 */
@Singleton
public class RankingDomainFetcher {
    protected final HikariDataSource dataSource;
    private final QueryClient queryClient;
    protected final DomainBlacklistImpl blacklist;
    protected final Logger logger = LoggerFactory.getLogger(getClass());

    /** When true, domain names are fetched; otherwise an empty string is used in their place. */
    protected boolean getNames = false;

    @Inject
    public RankingDomainFetcher(HikariDataSource dataSource,
                                QueryClient queryClient,
                                DomainBlacklistImpl blacklist) {
        this.dataSource = dataSource;
        this.queryClient = queryClient;
        this.blacklist = blacklist;
    }

    /** Makes subsequent domain fetches include the domain names. */
    public void retainNames() {
        this.getNames = true;
    }

    /** Passes every non-blacklisted domain with NODE_AFFINITY &gt; 0 to the consumer. */
    public void getDomains(Consumer<RankingDomainData> consumer) {
        String query;

        if (getNames) {
            query = """
                    SELECT EC_DOMAIN.ID,DOMAIN_NAME,DOMAIN_ALIAS,STATE,KNOWN_URLS
                    FROM EC_DOMAIN
                    INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
                    WHERE NODE_AFFINITY>0
                    GROUP BY EC_DOMAIN.ID
                    """;
        }
        else {
            query = """
                    SELECT EC_DOMAIN.ID,\"\",DOMAIN_ALIAS,STATE,KNOWN_URLS
                    FROM EC_DOMAIN
                    INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
                    WHERE NODE_AFFINITY>0
                    GROUP BY EC_DOMAIN.ID
                    """;
        }

        getDomains(query, consumer);
    }

    /** Passes every non-blacklisted peripheral domain to the consumer, selected by
     * the INDEXED, IS_ALIVE, VISITED_URLS, KNOWN_URLS and GOOD_URLS criteria of the query below.
     */
    public void getPeripheralDomains(Consumer<RankingDomainData> consumer) {
        String query;

        if (getNames) {
            query = """
                    SELECT EC_DOMAIN.ID,DOMAIN_NAME,DOMAIN_ALIAS,STATE,KNOWN_URLS
                    FROM EC_DOMAIN
                    INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
                    WHERE ((INDEXED>1 AND IS_ALIVE)
                    OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0))
                    GROUP BY EC_DOMAIN.ID
                    """;
        }
        else {
            query = """
                    SELECT EC_DOMAIN.ID,\"\",DOMAIN_ALIAS,STATE,KNOWN_URLS
                    FROM EC_DOMAIN
                    INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
                    WHERE ((INDEXED>1 AND IS_ALIVE)
                    OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0))
                    GROUP BY EC_DOMAIN.ID
                    """;
        }

        getDomains(query, consumer);
    }

    /** Runs the given query and reports each non-blacklisted row as a {@link RankingDomainData}.
     * The query must select id, name, alias, state and known URL count, in that order.
     * SQL errors are logged rather than propagated.
     */
    protected void getDomains(String query, Consumer<RankingDomainData> consumer) {
        try (var conn = dataSource.getConnection();
             var stmt = conn.prepareStatement(query))
        {
            stmt.setFetchSize(10000);

            // The result set is closed explicitly instead of relying on the statement's close
            try (var rsp = stmt.executeQuery()) {
                while (rsp.next()) {
                    int id = rsp.getInt(1);

                    if (blacklist.isBlacklisted(id)) {
                        continue;
                    }

                    consumer.accept(
                            new RankingDomainData(id,
                                    rsp.getString(2),
                                    rsp.getInt(3),
                                    DomainIndexingState.valueOf(rsp.getString(4)),
                                    rsp.getInt(5)));
                }
            }
        }
        catch (SQLException ex) {
            logger.error("Failed to fetch domains", ex);
        }
    }

    /** Passes the (source, dest) pair of every domain link from the query client to the consumer. */
    public void eachDomainLink(DomainLinkConsumer consumer) {
        var allLinks = queryClient.getAllDomainLinks();
        var iter = allLinks.iterator();

        while (iter.advance()) {
            consumer.accept(iter.source(), iter.dest());
        }
    }

    /** Passes the id of every domain whose name matches the SQL LIKE pattern to the consumer.
     * <p>
     * The pattern is bound as a statement parameter rather than concatenated into the
     * SQL text, so quotes in the pattern can neither break nor alter the query.
     * SQL errors are logged rather than propagated.
     */
    public void domainsByPattern(String pattern, IntConsumer idConsumer) {
        try (var conn = dataSource.getConnection();
             var stmt = conn.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME LIKE ?"))
        {
            stmt.setString(1, pattern);

            try (var rsp = stmt.executeQuery()) {
                while (rsp.next()) {
                    idConsumer.accept(rsp.getInt(1));
                }
            }
        }
        catch (SQLException ex) {
            logger.error("Failed to fetch domains by pattern", ex);
        }
    }

    /** Receiver of domain links, expressed as a pair of domain ids. */
    @FunctionalInterface
    public interface DomainLinkConsumer {
        void accept(int from, int to);
    }
}

View File

@ -1,93 +0,0 @@
package nu.marginalia.ranking.data;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import com.zaxxer.hikari.HikariDataSource;
import nu.marginalia.db.DomainBlacklistImpl;
import nu.marginalia.query.client.QueryClient;
import org.slf4j.LoggerFactory;
import java.sql.SQLException;
import java.util.function.Consumer;
@Singleton
public class RankingDomainFetcherForSimilarityData extends RankingDomainFetcher {
final boolean hasData;
@Inject
public RankingDomainFetcherForSimilarityData(HikariDataSource dataSource, QueryClient queryClient, DomainBlacklistImpl blacklist) {
super(dataSource, queryClient, blacklist);
hasData = isDomainNeighborTablePopulated(dataSource);
}
private static boolean isDomainNeighborTablePopulated(HikariDataSource dataSource) {
try (var conn = dataSource.getConnection();
var stmt = conn.createStatement();
var rs = stmt.executeQuery("SELECT DOMAIN_ID FROM EC_DOMAIN_NEIGHBORS_2 LIMIT 1")) {
return rs.next();
}
catch (SQLException ex) {
LoggerFactory
.getLogger(RankingDomainFetcherForSimilarityData.class)
.error("Failed to get count from EC_DOMAIN_NEIGHBORS_2", ex);
return false;
}
}
public boolean hasData() {
return hasData;
}
public void eachDomainLink(DomainLinkConsumer consumer) {
try (var conn = dataSource.getConnection();
var stmt = conn.prepareStatement("SELECT DOMAIN_ID, NEIGHBOR_ID, RELATEDNESS FROM EC_DOMAIN_NEIGHBORS_2"))
{
stmt.setFetchSize(10000);
var rsp = stmt.executeQuery();
while (rsp.next()) {
int src = rsp.getInt(1);
int dst = rsp.getInt(2);
// these "links" are bidi
consumer.accept(src, dst);
consumer.accept(dst, src);
}
}
catch (SQLException ex) {
logger.error("Failed to fetch domain links", ex);
}
}
public void getDomains(Consumer<RankingDomainData> consumer) {
String query;
if (getNames) {
query =
"""
SELECT EC_DOMAIN.ID,DOMAIN_NAME,DOMAIN_ALIAS,STATE,COALESCE(KNOWN_URLS, 0)
FROM EC_DOMAIN
INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
GROUP BY EC_DOMAIN.ID
""";
}
else {
query =
"""
SELECT EC_DOMAIN.ID,"",DOMAIN_ALIAS,STATE,COALESCE(KNOWN_URLS, 0)
FROM EC_DOMAIN
INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
GROUP BY EC_DOMAIN.ID
""";
}
getDomains(query, consumer);
}
public void getPeripheralDomains(Consumer<RankingDomainData> consumer) {
// This is not relevant for this variant of pagerank since it is bidirectional
}
}

View File

@ -0,0 +1,65 @@
package nu.marginalia.ranking.data;
import com.google.inject.Inject;
import com.zaxxer.hikari.HikariDataSource;
import lombok.SneakyThrows;
import org.jgrapht.Graph;
import org.jgrapht.graph.DefaultUndirectedWeightedGraph;
import org.jgrapht.graph.DefaultWeightedEdge;
import java.sql.SQLException;
/** A source for the similarity graph, stored in EC_DOMAIN_NEIGHBORS_2,
 * which contains the cosine similarity of the incident link vectors in the link graph.
 */
public class SimilarityGraphSource extends AbstractGraphSource {
    @Inject
    public SimilarityGraphSource(HikariDataSource dataSource) {
        super(dataSource);
    }

    /** Check if the data source is available.
     *
     * @return true if EC_DOMAIN_NEIGHBORS_2 holds at least one row; false if it is
     *         empty or if the probe query fails
     */
    public boolean isAvailable() {
        try (var conn = dataSource.getConnection();
             var stmt = conn.prepareStatement("""
                     SELECT *
                     FROM EC_DOMAIN_NEIGHBORS_2
                     LIMIT 1
                     """);
             var rs = stmt.executeQuery())
        {
            return rs.next();
        }
        catch (SQLException ex) {
            // Best-effort probe: a failing query is reported as "not available"
            return false;
        }
    }

    /** Builds an undirected weighted graph with one edge per row of EC_DOMAIN_NEIGHBORS_2,
     * weighted by the row's RELATEDNESS.
     * <p>
     * Rows referring to a domain that is not a vertex of the graph are skipped, in the same
     * way as in {@link LinkGraphSource}; adding an edge to a missing vertex would otherwise
     * make the graph throw an IllegalArgumentException.
     */
    @SneakyThrows
    @Override
    public Graph<Integer, ?> getGraph() {
        Graph<Integer, ?> graph = new DefaultUndirectedWeightedGraph<>(DefaultWeightedEdge.class);

        addVertices(graph);

        try (var conn = dataSource.getConnection();
             var stmt = conn.prepareStatement("""
                     SELECT DOMAIN_ID, NEIGHBOR_ID, RELATEDNESS
                     FROM EC_DOMAIN_NEIGHBORS_2
                     """);
             var rs = stmt.executeQuery())
        {
            while (rs.next()) {
                int src = rs.getInt(1);
                int dest = rs.getInt(2);
                double weight = rs.getDouble(3);

                // addVertices() only loads a subset of EC_DOMAIN, so either end may be absent
                if (!graph.containsVertex(src) || !graph.containsVertex(dest)) {
                    continue;
                }

                graph.addEdge(src, dest);
                graph.setEdgeWeight(src, dest, weight);
            }
        }

        return graph;
    }
}

View File

@ -0,0 +1,375 @@
package nu.marginalia.ranking.jgrapht;
/*
* (C) Copyright 2016-2023, by Dimitrios Michail and Contributors.
*
*
* JGraphT : a free Java graph-theory library
*
* See the CONTRIBUTORS.md file distributed with this work for additional
* information regarding copyright ownership.
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0, or the
* GNU Lesser General Public License v2.1 or later
* which is available at
* http://www.gnu.org/licenses/old-licenses/lgpl-2.1-standalone.html.
*
* SPDX-License-Identifier: EPL-2.0 OR LGPL-2.1-or-later
*/
/* (modified by @vlofgren to add personalization) */
import org.jgrapht.*;
import org.jgrapht.alg.interfaces.*;
import java.util.*;
/**
 * Personalized PageRank for JGraphT graphs, derived from JGraphT's PageRank implementation.
 *
 * <p>
 * The teleportation term is only granted to the vertices of the influence set, and it is
 * subtracted from those vertices' scores again once the iteration has finished. Scores are
 * computed lazily on the first call to {@link #getScores()} and cached afterwards.
 *
 * @param <V> the graph vertex type
 * @param <E> the graph edge type
 */
public class PersonalizedPageRank<V, E>
        implements VertexScoringAlgorithm<V, Double>
{
    /**
     * Default number of maximum iterations.
     */
    public static final int MAX_ITERATIONS_DEFAULT = 100;

    /**
     * Default value for the tolerance. The calculation will stop if the PageRank values change
     * by less than this value between two iterations.
     */
    public static final double TOLERANCE_DEFAULT = 0.0001;

    /**
     * Damping factor default value.
     */
    public static final double DAMPING_FACTOR_DEFAULT = 0.85d;

    /**
     * The input graph
     */
    private final Graph<V, E> graph;

    /**
     * The vertices that receive the teleportation term
     */
    private final Collection<V> influenceSet;

    /**
     * The damping factor
     */
    private final double dampingFactor;

    /**
     * Maximum iterations to run
     */
    private final int maxIterations;

    /**
     * The calculation will stop if the PageRank values change by less than this value between
     * two iterations
     */
    private final double tolerance;

    /**
     * The cached result; null until {@link #getScores()} has been called
     */
    private Map<V, Double> scores;

    /**
     * Create an instance of Personalized PageRank using the default damping factor, iteration
     * limit and tolerance. The scores are computed on the first call to {@link #getScores()}.
     *
     * @param graph the input graph
     * @param influenceSet the set of vertices to personalize the Personalized PageRank calculation
     */
    public PersonalizedPageRank(Graph<V, E> graph, Collection<V> influenceSet)
    {
        this(graph, influenceSet, DAMPING_FACTOR_DEFAULT, MAX_ITERATIONS_DEFAULT, TOLERANCE_DEFAULT);
    }

    /**
     * Create an instance of Personalized PageRank using the default iteration limit and
     * tolerance. The scores are computed on the first call to {@link #getScores()}.
     *
     * @param graph the input graph
     * @param influenceSet the set of vertices to personalize the Personalized PageRank calculation
     * @param dampingFactor the damping factor
     */
    public PersonalizedPageRank(Graph<V, E> graph, Collection<V> influenceSet, double dampingFactor)
    {
        this(graph, influenceSet, dampingFactor, MAX_ITERATIONS_DEFAULT, TOLERANCE_DEFAULT);
    }

    /**
     * Create an instance of Personalized PageRank using the default tolerance. The scores are
     * computed on the first call to {@link #getScores()}.
     *
     * @param graph the input graph
     * @param influenceSet the set of vertices to personalize the Personalized PageRank calculation
     * @param dampingFactor the damping factor
     * @param maxIterations the maximum number of iterations to perform
     */
    public PersonalizedPageRank(Graph<V, E> graph, Collection<V> influenceSet, double dampingFactor, int maxIterations)
    {
        this(graph, influenceSet, dampingFactor, maxIterations, TOLERANCE_DEFAULT);
    }

    /**
     * Create an instance of Personalized PageRank. The scores are computed on the first call to
     * {@link #getScores()}.
     *
     * @param graph the input graph
     * @param influenceSet the set of vertices to personalize the Personalized PageRank calculation
     * @param dampingFactor the damping factor
     * @param maxIterations the maximum number of iterations to perform
     * @param tolerance the calculation will stop if the Personalized PageRank values change by
     *        less than this value between two iterations
     * @throws NullPointerException if the graph or the influence set is null
     * @throws IllegalArgumentException if maxIterations is not positive, if dampingFactor is
     *         outside [0, 1], or if tolerance is not positive
     */
    public PersonalizedPageRank(Graph<V, E> graph, Collection<V> influenceSet, double dampingFactor, int maxIterations, double tolerance)
    {
        // Fail here rather than with an anonymous NullPointerException during the lazy computation
        this.graph = Objects.requireNonNull(graph, "graph");
        this.influenceSet = Objects.requireNonNull(influenceSet, "influenceSet");

        if (maxIterations <= 0) {
            throw new IllegalArgumentException("Maximum iterations must be positive");
        }
        this.maxIterations = maxIterations;

        if (dampingFactor < 0.0 || dampingFactor > 1.0) {
            throw new IllegalArgumentException("Damping factor not valid");
        }
        this.dampingFactor = dampingFactor;

        if (tolerance <= 0.0) {
            throw new IllegalArgumentException("Tolerance not valid, must be positive");
        }
        this.tolerance = tolerance;
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public Map<V, Double> getScores()
    {
        if (scores == null) {
            scores = Collections.unmodifiableMap(new Algorithm().getScores());
        }
        return scores;
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public Double getVertexScore(V v)
    {
        if (!graph.containsVertex(v)) {
            throw new IllegalArgumentException("Cannot return score of unknown vertex");
        }
        return getScores().get(v);
    }

    /**
     * The actual implementation.
     *
     * <p>
     * We use this pattern with the inner class in order to be able to cache the result but also
     * allow the garbage collector to acquire all auxiliary memory used during the execution of the
     * algorithm.
     *
     * @author Dimitrios Michail
     */
    private class Algorithm
    {
        private int totalVertices;
        private boolean isWeighted;

        private Map<V, Integer> vertexIndexMap;
        private V[] vertexMap;

        private double[] weightSum;
        private double[] curScore;
        private double[] nextScore;
        private int[] outDegree;
        private ArrayList<int[]> adjList;
        private ArrayList<double[]> weightsList;
        private BitSet influenceIndexSet;

        /**
         * Snapshots the graph into index-based arrays: assigns each vertex an index in [0,n),
         * records out-degrees, marks the influence vertices, and builds the in-neighbour
         * adjacency lists (plus edge weights for weighted graphs).
         */
        @SuppressWarnings("unchecked")
        public Algorithm()
        {
            this.totalVertices = graph.vertexSet().size();
            this.isWeighted = graph.getType().isWeighted();

            /*
             * Initialize score, map vertices to [0,n) and pre-compute degrees and adjacency lists
             */
            this.curScore = new double[totalVertices];
            this.nextScore = new double[totalVertices];
            this.vertexIndexMap = new HashMap<>();
            this.vertexMap = (V[]) new Object[totalVertices];
            this.outDegree = new int[totalVertices];
            this.adjList = new ArrayList<>(totalVertices);
            this.influenceIndexSet = new BitSet(totalVertices);

            // Membership is tested once per vertex below. On a non-Set collection (e.g. a List)
            // each test would be a linear scan, so copy such collections into a hash set first.
            Collection<V> influenceLookup =
                    (influenceSet instanceof Set) ? influenceSet : new HashSet<V>(influenceSet);

            double initScore = 1.0d / totalVertices;
            int i = 0;
            for (V v : graph.vertexSet()) {
                vertexIndexMap.put(v, i);
                vertexMap[i] = v;
                outDegree[i] = graph.outDegreeOf(v);
                curScore[i] = initScore;

                if (influenceLookup.contains(v)) {
                    influenceIndexSet.set(i);
                }

                i++;
            }

            if (isWeighted) {
                this.weightSum = new double[totalVertices];
                this.weightsList = new ArrayList<>(totalVertices);

                for (i = 0; i < totalVertices; i++) {
                    V v = vertexMap[i];
                    int inDegree = graph.inDegreeOf(v);
                    int[] inNeighbors = new int[inDegree];
                    double[] edgeWeights = new double[inDegree];

                    int j = 0;
                    for (E e : graph.incomingEdgesOf(v)) {
                        V w = Graphs.getOppositeVertex(graph, e, v);
                        Integer mappedVertexId = vertexIndexMap.get(w);
                        inNeighbors[j] = mappedVertexId;
                        double edgeWeight = graph.getEdgeWeight(e);
                        edgeWeights[j] += edgeWeight;
                        // total weight credited to the in-neighbour w; used to normalise
                        // w's contribution in runWeighted()
                        weightSum[mappedVertexId] += edgeWeight;
                        j++;
                    }
                    weightsList.add(edgeWeights);
                    adjList.add(inNeighbors);
                }
            } else {
                for (i = 0; i < totalVertices; i++) {
                    V v = vertexMap[i];
                    int[] inNeighbors = new int[graph.inDegreeOf(v)];
                    int j = 0;
                    for (E e : graph.incomingEdgesOf(v)) {
                        V w = Graphs.getOppositeVertex(graph, e, v);
                        inNeighbors[j++] = vertexIndexMap.get(w);
                    }
                    adjList.add(inNeighbors);
                }
            }
        }

        /**
         * Runs the iteration matching the graph type and returns the scores keyed by vertex.
         *
         * @return a new, modifiable map from each vertex to its score
         */
        public Map<V, Double> getScores()
        {
            // compute
            if (isWeighted) {
                runWeighted();
            } else {
                run();
            }

            // make results user friendly
            Map<V, Double> scores = new HashMap<>();
            for (int i = 0; i < totalVertices; i++) {
                V v = vertexMap[i];
                scores.put(v, curScore[i]);
            }
            return scores;
        }

        /**
         * Unweighted iteration: each in-neighbour contributes its damped score divided by its
         * out-degree. Stops after maxIterations rounds or once the largest per-vertex change
         * drops below the tolerance.
         */
        private void run()
        {
            double maxChange = tolerance;
            int iterations = maxIterations;

            while (iterations > 0 && maxChange >= tolerance) {
                double r = teleProp();

                maxChange = 0d;
                for (int i = 0; i < totalVertices; i++) {
                    double contribution = 0d;
                    for (int w : adjList.get(i)) {
                        contribution += dampingFactor * curScore[w] / outDegree[w];
                    }

                    double vOldValue = curScore[i];
                    double vNewValue = (influenceIndexSet.get(i) ? r : 0) + contribution;
                    maxChange = Math.max(maxChange, Math.abs(vNewValue - vOldValue));
                    nextScore[i] = vNewValue;
                }

                // progress
                swapScores();
                iterations--;
            }

            removeInfluenceFactor();
        }

        /**
         * Weighted iteration: each in-neighbour contributes its damped score scaled by the edge
         * weight relative to that neighbour's accumulated weight sum. Termination is the same
         * as for {@link #run()}.
         */
        private void runWeighted()
        {
            double maxChange = tolerance;
            int iterations = maxIterations;

            while (iterations > 0 && maxChange >= tolerance) {
                double r = teleProp();

                maxChange = 0d;
                for (int i = 0; i < totalVertices; i++) {
                    double contribution = 0d;

                    int[] neighbors = adjList.get(i);
                    double[] weights = weightsList.get(i);
                    for (int j = 0, getLength = neighbors.length; j < getLength; j++) {
                        int w = neighbors[j];
                        contribution += dampingFactor * curScore[w] * weights[j] / weightSum[w];
                    }

                    double vOldValue = curScore[i];
                    double vNewValue = (influenceIndexSet.get(i) ? r : 0) + contribution;
                    maxChange = Math.max(maxChange, Math.abs(vNewValue - vOldValue));
                    nextScore[i] = vNewValue;
                }

                // progress
                swapScores();
                iterations--;
            }

            removeInfluenceFactor();
        }

        /**
         * Subtracts the current teleportation term from the score of every influence vertex,
         * so the reported scores do not include the influence factor.
         */
        private void removeInfluenceFactor()
        {
            double r = teleProp();
            for (int i = 0; i < totalVertices; i++) {
                curScore[i] -= (influenceIndexSet.get(i) ? r : 0);
            }
        }

        /**
         * This is the teleportation part of the algorithm, and also what is modified to
         * personalize the PageRank.
         *
         * <p>
         * Sums over the influence vertices present in the graph: (1 - dampingFactor) for a vertex
         * with outgoing edges, otherwise the vertex's current score. The sum is divided by the
         * size of the influence collection as it was supplied by the caller.
         *
         * @return the teleportation term granted to each influence vertex
         */
        private double teleProp()
        {
            double r = 0d;
            for (int v = influenceIndexSet.nextSetBit(0);
                 v >= 0;
                 v = influenceIndexSet.nextSetBit(v + 1))
            {
                if (outDegree[v] > 0)
                    r += (1d - dampingFactor);
                else
                    r += curScore[v];
            }
            return r / influenceSet.size();
        }

        /**
         * Makes the freshly computed scores current; the old array is reused as scratch space.
         */
        private void swapScores()
        {
            double[] tmp = curScore;
            curScore = nextScore;
            nextScore = tmp;
        }
    }
}

View File

@ -0,0 +1,78 @@
package nu.marginalia.ranking;
import nu.marginalia.ranking.accumulator.RankingResultListAccumulator;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import java.util.List;
// Test the ranking algorithm with prod data. Will not run if the data is not available.
// It's not feasible to include the data in the git repo, as it's ~6 GB of data.
@Disabled
class RankingAlgorithmWithRealDataTest {

    /** Plain PageRank over the exported link graph; prints the top results. */
    @Test
    public void testRegularPR() {
        if (TestGraphSourceForLinkData.isAvailable()) {
            var linkData = new TestGraphSourceForLinkData();
            var ranked = new PageRankDomainRanker(linkData, List.of())
                    .calculate(10, RankingResultListAccumulator::new);

            for (int position = 0; position < ranked.size(); position++) {
                System.out.println(position + " " + linkData.getName(ranked.get(position)));
            }
        }
    }

    /** Plain PageRank over the exported link graph with every edge reversed; prints the top results. */
    @Test
    public void testInvertedLinkGraph() {
        if (TestGraphSourceForInvertedLinkData.isAvailable()) {
            var invertedLinkData = new TestGraphSourceForInvertedLinkData();
            var ranked = new PageRankDomainRanker(invertedLinkData, List.of())
                    .calculate(10, RankingResultListAccumulator::new);

            for (int position = 0; position < ranked.size(); position++) {
                System.out.println(position + " " + invertedLinkData.getName(ranked.get(position)));
            }
        }
    }

    /** Plain PageRank over the exported similarity graph; prints the top results. */
    @Test
    public void testSimilarityPR() {
        if (TestGraphSourceForSimilarityData.isAvailable()) {
            var similarityData = new TestGraphSourceForSimilarityData();
            var ranked = new PageRankDomainRanker(similarityData, List.of())
                    .calculate(10, RankingResultListAccumulator::new);

            for (int position = 0; position < ranked.size(); position++) {
                System.out.println(position + " " + similarityData.getName(ranked.get(position)));
            }
        }
    }

    /** PageRank over the exported similarity graph, personalized on a single domain id; prints the top results. */
    @Test
    public void testSimilarityPPR() {
        if (TestGraphSourceForSimilarityData.isAvailable()) {
            var similarityData = new TestGraphSourceForSimilarityData();
            var ranked = new PageRankDomainRanker(similarityData,
                    List.of(1476552) // wiby.me
            )
                    .calculate(10, RankingResultListAccumulator::new);

            for (int position = 0; position < ranked.size(); position++) {
                System.out.println(position + " " + similarityData.getName(ranked.get(position)));
            }
        }
    }
}

View File

@ -0,0 +1,161 @@
package nu.marginalia.ranking;
import com.zaxxer.hikari.HikariConfig;
import com.zaxxer.hikari.HikariDataSource;
import nu.marginalia.query.client.QueryClient;
import nu.marginalia.ranking.data.InvertedLinkGraphSource;
import nu.marginalia.ranking.data.LinkGraphSource;
import nu.marginalia.ranking.data.SimilarityGraphSource;
import nu.marginalia.test.TestMigrationLoader;
import org.jgrapht.Graph;
import org.jgrapht.graph.DefaultWeightedEdge;
import org.junit.jupiter.api.*;
import org.junit.jupiter.api.parallel.Execution;
import org.mockito.Mockito;
import org.testcontainers.containers.MariaDBContainer;
import org.testcontainers.junit.jupiter.Container;
import org.testcontainers.junit.jupiter.Testcontainers;
import java.sql.SQLException;
import java.util.List;
import static org.junit.Assert.assertEquals;
import static org.junit.jupiter.api.parallel.ExecutionMode.SAME_THREAD;
import static org.mockito.Mockito.when;
/**
 * Verifies the graph sources used by the ranking algorithms against a MariaDB test container
 * seeded with four domains under marginalia.nu. Link data is supplied through a mocked
 * {@link QueryClient}; similarity data is inserted into EC_DOMAIN_NEIGHBORS_2.
 */
@Tag("slow")
@Testcontainers
@Execution(SAME_THREAD)
public class RankingAlgorithmsContainerTest {
    @Container
    static MariaDBContainer<?> mariaDBContainer = new MariaDBContainer<>("mariadb")
            .withDatabaseName("WMSA_prod")
            .withUsername("wmsa")
            .withPassword("wmsa")
            .withNetworkAliases("mariadb");

    static HikariDataSource dataSource;

    QueryClient queryClient;
    QueryClient.AllLinks allLinks;

    /** Opens the connection pool, applies the migrations and inserts the test domains. */
    @BeforeAll
    public static void setup() {
        HikariConfig config = new HikariConfig();
        config.setJdbcUrl(mariaDBContainer.getJdbcUrl());
        config.setUsername("wmsa");
        config.setPassword("wmsa");

        dataSource = new HikariDataSource(config);

        TestMigrationLoader.flywayMigration(dataSource);

        try (var conn = dataSource.getConnection();
             var stmt = conn.createStatement()) {
            stmt.executeUpdate("""
                    INSERT INTO EC_DOMAIN(DOMAIN_NAME, DOMAIN_TOP, NODE_AFFINITY)
                    VALUES ('memex.marginalia.nu', 'marginalia.nu', 1),
                    ('search.marginalia.nu', 'marginalia.nu', 1),
                    ('encyclopedia.marginalia.nu', 'marginalia.nu', 1),
                    ('marginalia.nu', 'marginalia.nu', 1);
                    """);
        } catch (SQLException e) {
            throw new RuntimeException(e);
        }
    }

    /** Closes the connection pool so it does not outlive the test class. */
    @AfterAll
    public static void tearDown() {
        if (dataSource != null) {
            dataSource.close();
        }
    }

    /** Gives every test a fresh mocked link set and an empty similarity table. */
    @BeforeEach
    public void setupQueryClient() {
        queryClient = Mockito.mock(QueryClient.class);
        allLinks = new QueryClient.AllLinks();
        when(queryClient.getAllDomainLinks()).thenReturn(allLinks);

        try (var conn = dataSource.getConnection();
             var stmt = conn.createStatement()) {
            stmt.executeUpdate("TRUNCATE TABLE EC_DOMAIN_NEIGHBORS_2");
        } catch (SQLException e) {
            throw new RuntimeException(e);
        }
    }

    /** Inserts one similarity row (source, dest, similarity) into EC_DOMAIN_NEIGHBORS_2. */
    private void addSimilarity(int source, int dest, double similarity) {
        try (var conn = dataSource.getConnection();
             var stmt = conn.prepareStatement("""
                     INSERT INTO EC_DOMAIN_NEIGHBORS_2(DOMAIN_ID, NEIGHBOR_ID, RELATEDNESS)
                     VALUES (?, ?, ?)
                     """)) {
            stmt.setInt(1, source);
            stmt.setInt(2, dest);
            stmt.setDouble(3, similarity);
            stmt.executeUpdate();
        } catch (SQLException e) {
            throw new RuntimeException(e);
        }
    }

    @Test
    public void testGetDomains() {
        // should all be the same, doesn't matter which one we use
        var source = new LinkGraphSource(dataSource, queryClient);

        Assertions.assertEquals(List.of(1),
                source.domainIds(List.of("memex.marginalia.nu")));

        // Verify globbing
        Assertions.assertEquals(List.of(1,2,3),
                source.domainIds(List.of("%.marginalia.nu")));
    }

    @Test
    public void testLinkGraphSource() {
        allLinks.add(1, 3);

        var graph = new LinkGraphSource(dataSource, queryClient).getGraph();

        Assertions.assertTrue(graph.containsVertex(1));
        Assertions.assertTrue(graph.containsVertex(2));
        Assertions.assertTrue(graph.containsVertex(3));

        // only the 1 -> 3 link was added, and the link graph is directed
        Assertions.assertTrue(graph.containsEdge(1, 3));
        Assertions.assertFalse(graph.containsEdge(3, 1));
        Assertions.assertFalse(graph.containsEdge(2, 3));
        Assertions.assertFalse(graph.containsEdge(3, 2));
    }

    @Test
    public void testInvertedLinkGraphSource() {
        allLinks.add(1, 3);

        var graph = new InvertedLinkGraphSource(dataSource, queryClient).getGraph();

        Assertions.assertTrue(graph.containsVertex(1));
        Assertions.assertTrue(graph.containsVertex(2));
        Assertions.assertTrue(graph.containsVertex(3));

        // the 1 -> 3 link must show up reversed, as 3 -> 1
        Assertions.assertTrue(graph.containsEdge(3, 1));
        Assertions.assertFalse(graph.containsEdge(1, 3));
        Assertions.assertFalse(graph.containsEdge(2, 3));
        Assertions.assertFalse(graph.containsEdge(3, 2));
    }

    @Test
    @SuppressWarnings("unchecked")
    public void testSimilarityGraphSource() {
        addSimilarity(1, 3, 0.5);

        var graph = (Graph<Integer, DefaultWeightedEdge>) new SimilarityGraphSource(dataSource).getGraph();

        Assertions.assertTrue(graph.containsVertex(1));
        Assertions.assertTrue(graph.containsVertex(2));
        Assertions.assertTrue(graph.containsVertex(3));

        // the similarity edge is expected to be reachable in both directions
        Assertions.assertTrue(graph.containsEdge(3, 1));
        Assertions.assertTrue(graph.containsEdge(1, 3));

        // expected value first, so a failure message reports the right numbers
        Assertions.assertEquals(0.5, graph.getEdgeWeight(graph.getEdge(1, 3)), 0.0001);

        Assertions.assertFalse(graph.containsEdge(1, 2));
        Assertions.assertFalse(graph.containsEdge(2, 3));
    }
}

View File

@ -0,0 +1,86 @@
package nu.marginalia.ranking;
import lombok.SneakyThrows;
import nu.marginalia.array.LongArrayFactory;
import nu.marginalia.ranking.data.GraphSource;
import org.apache.commons.lang3.StringUtils;
import org.jgrapht.Graph;
import org.jgrapht.graph.DefaultDirectedGraph;
import org.jgrapht.graph.DefaultEdge;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
 * Test-only {@link GraphSource} that builds the inverted link graph (every link is added as
 * destination -> source) from export files at fixed paths on the local disk.
 */
public class TestGraphSourceForInvertedLinkData implements GraphSource {
    private static final Path domainDataPath = Paths.get("/home/vlofgren/Exports/Links/domains.export.tsv");
    private static final Path[] linksDataPaths = new Path[] {
            Paths.get("/home/vlofgren/Exports/Links/domain-links-1.dat"),
            Paths.get("/home/vlofgren/Exports/Links/domain-links-2.dat"),
            Paths.get("/home/vlofgren/Exports/Links/domain-links-3.dat"),
            Paths.get("/home/vlofgren/Exports/Links/domain-links-4.dat"),
            Paths.get("/home/vlofgren/Exports/Links/domain-links-5.dat"),
            Paths.get("/home/vlofgren/Exports/Links/domain-links-6.dat"),
            Paths.get("/home/vlofgren/Exports/Links/domain-links-7.dat"),
            Paths.get("/home/vlofgren/Exports/Links/domain-links-8.dat"),
            Paths.get("/home/vlofgren/Exports/Links/domain-links-9.dat")
    };

    /** Name lookup is not supported by this test source; always returns an empty list. */
    public List<Integer> domainIds(List<String> domainNameList) { return List.of(); }

    /** @return true if the domain export and the first link data file exist */
    static boolean isAvailable() {
        return Files.exists(domainDataPath) && Files.exists(linksDataPaths[0]);
    }

    // Rebuilt on every getGraph() call
    private Map<Integer, String> idToName = new HashMap<>();

    /** @return the domain name recorded for the id by the last getGraph() call, or null */
    public String getName(int id) {
        return idToName.get(id);
    }

    /**
     * Reads the domain export and the link data files and assembles the inverted graph.
     *
     * @return a directed graph with one vertex per domain with positive node affinity, and an
     *         edge destination -> source for each link whose endpoints are both vertices
     */
    @SneakyThrows
    @Override
    public Graph<Integer, ?> getGraph() {
        Graph<Integer, ?> graph = new DefaultDirectedGraph<>(DefaultEdge.class);
        idToName = new HashMap<>();

        try (var stream = Files
                .lines(domainDataPath)) {

            // the first line is skipped; columns used: 0 = id, 1 = name, 3 = node affinity
            stream.skip(1)
                    .mapMultiToInt((line, c) -> {
                        String[] parts = StringUtils.split(line, '\t');
                        int id = Integer.parseInt(parts[0]);
                        String name = parts[1];
                        int node_affinity = Integer.parseInt(parts[3]);
                        if (node_affinity > 0) {
                            c.accept(id);
                            idToName.put(id, name);
                        }
                    })
                    .forEach(graph::addVertex);
        }

        for (var path : linksDataPaths) {
            try (var data = LongArrayFactory.mmapForReadingConfined(path)) {
                data.forEach(0, data.size(), (pos, val) -> {

                    val = Long.reverseBytes(val); // data is in "java endian", LongArray is in "C endian"

                    // upper 32 bits hold the source id, lower 32 bits the destination id
                    int src = (int) (val >>> 32);
                    int dest = (int) (val & 0xFFFF_FFFFL);

                    if (graph.containsVertex(src) && graph.containsVertex(dest)) {
                        // inverted: the edge runs from the link's destination to its source
                        graph.addEdge(dest, src);
                    }
                });
            }
        }

        return graph;
    }
}

View File

@ -0,0 +1,86 @@
package nu.marginalia.ranking;
import lombok.SneakyThrows;
import nu.marginalia.array.LongArrayFactory;
import nu.marginalia.ranking.data.GraphSource;
import org.apache.commons.lang3.StringUtils;
import org.jgrapht.Graph;
import org.jgrapht.graph.DefaultDirectedGraph;
import org.jgrapht.graph.DefaultEdge;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
 * Test-only {@link GraphSource} that builds the link graph (every link is added as
 * source -> destination) from export files at fixed paths on the local disk.
 */
public class TestGraphSourceForLinkData implements GraphSource {
    private static final Path domainDataPath = Paths.get("/home/vlofgren/Exports/Links/domains.export.tsv");
    private static final Path[] linksDataPaths = new Path[] {
            Paths.get("/home/vlofgren/Exports/Links/domain-links-1.dat"),
            Paths.get("/home/vlofgren/Exports/Links/domain-links-2.dat"),
            Paths.get("/home/vlofgren/Exports/Links/domain-links-3.dat"),
            Paths.get("/home/vlofgren/Exports/Links/domain-links-4.dat"),
            Paths.get("/home/vlofgren/Exports/Links/domain-links-5.dat"),
            Paths.get("/home/vlofgren/Exports/Links/domain-links-6.dat"),
            Paths.get("/home/vlofgren/Exports/Links/domain-links-7.dat"),
            Paths.get("/home/vlofgren/Exports/Links/domain-links-8.dat"),
            Paths.get("/home/vlofgren/Exports/Links/domain-links-9.dat")
    };

    /** Name lookup is not supported by this test source; always returns an empty list. */
    public List<Integer> domainIds(List<String> domainNameList) { return List.of(); }

    /** @return true if the domain export and the first link data file exist */
    static boolean isAvailable() {
        return Files.exists(domainDataPath) && Files.exists(linksDataPaths[0]);
    }

    // Rebuilt on every getGraph() call
    private Map<Integer, String> idToName = new HashMap<>();

    /** @return the domain name recorded for the id by the last getGraph() call, or null */
    public String getName(int id) {
        return idToName.get(id);
    }

    /**
     * Reads the domain export and the link data files and assembles the link graph.
     *
     * @return a directed graph with one vertex per domain with positive node affinity, and an
     *         edge source -> destination for each link whose endpoints are both vertices
     */
    @SneakyThrows
    @Override
    public Graph<Integer, ?> getGraph() {
        Graph<Integer, ?> graph = new DefaultDirectedGraph<>(DefaultEdge.class);
        idToName = new HashMap<>();

        try (var stream = Files
                .lines(domainDataPath)) {

            // the first line is skipped; columns used: 0 = id, 1 = name, 3 = node affinity
            stream.skip(1)
                    .mapMultiToInt((line, c) -> {
                        String[] parts = StringUtils.split(line, '\t');
                        int id = Integer.parseInt(parts[0]);
                        String name = parts[1];
                        int node_affinity = Integer.parseInt(parts[3]);
                        if (node_affinity > 0) {
                            c.accept(id);
                            idToName.put(id, name);
                        }
                    })
                    .forEach(graph::addVertex);
        }

        for (var path : linksDataPaths) {
            try (var data = LongArrayFactory.mmapForReadingConfined(path)) {
                data.forEach(0, data.size(), (pos, val) -> {

                    val = Long.reverseBytes(val); // data is in "java endian", LongArray is in "C endian"

                    // upper 32 bits hold the source id, lower 32 bits the destination id
                    int src = (int) (val >>> 32);
                    int dest = (int) (val & 0xFFFF_FFFFL);

                    if (graph.containsVertex(src) && graph.containsVertex(dest)) {
                        graph.addEdge(src, dest);
                    }
                });
            }
        }

        return graph;
    }
}

View File

@ -0,0 +1,78 @@
package nu.marginalia.ranking;
import lombok.SneakyThrows;
import nu.marginalia.array.LongArrayFactory;
import nu.marginalia.ranking.data.GraphSource;
import org.apache.commons.lang3.StringUtils;
import org.jgrapht.Graph;
import org.jgrapht.graph.DefaultDirectedGraph;
import org.jgrapht.graph.DefaultEdge;
import org.jgrapht.graph.DefaultUndirectedWeightedGraph;
import org.jgrapht.graph.DefaultWeightedEdge;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
 * Test-only {@link GraphSource} that builds an undirected, weighted similarity graph from
 * export files at fixed paths on the local disk.
 */
public class TestGraphSourceForSimilarityData implements GraphSource {
    private static Path domainDataPath = Paths.get("/home/vlofgren/Exports/Links/domains.export.tsv");
    private static Path similarityDataPath = Paths.get("/home/vlofgren/Exports/Links/neighbors.tsv");

    // Rebuilt on every getGraph() call
    private Map<Integer, String> idToName = new HashMap<>();

    /** Name lookup is not supported by this test source; always returns an empty list. */
    public List<Integer> domainIds(List<String> domainNameList) {
        return List.of();
    }

    /** @return true if both the domain export and the similarity export exist */
    static boolean isAvailable() {
        if (!Files.exists(domainDataPath)) {
            return false;
        }
        return Files.exists(similarityDataPath);
    }

    /** @return the domain name recorded for the id by the last getGraph() call, or null */
    public String getName(int id) {
        return idToName.get(id);
    }

    /**
     * Reads both export files and assembles the similarity graph.
     *
     * @return an undirected weighted graph with one vertex per domain with positive node
     *         affinity, and a weighted edge per similarity row whose endpoints are both vertices
     */
    @SneakyThrows
    @Override
    public Graph<Integer, ?> getGraph() {
        Graph<Integer, ?> similarityGraph = new DefaultUndirectedWeightedGraph<>(DefaultWeightedEdge.class);
        idToName = new HashMap<>();

        // Domains: first line skipped; columns used: 0 = id, 1 = name, 3 = node affinity
        try (var domainRows = Files.lines(domainDataPath)) {
            domainRows.skip(1).forEach(row -> {
                String[] columns = StringUtils.split(row, '\t');
                int domainId = Integer.parseInt(columns[0]);
                String domainName = columns[1];
                int nodeAffinity = Integer.parseInt(columns[3]);

                if (nodeAffinity <= 0) {
                    return;
                }

                similarityGraph.addVertex(domainId);
                idToName.put(domainId, domainName);
            });
        }

        // Similarities: first line skipped; columns used: 0 = source, 1 = destination, 2 = weight
        try (var similarityRows = Files.lines(similarityDataPath)) {
            similarityRows.skip(1).forEach(row -> {
                String[] columns = StringUtils.split(row, '\t');
                int from = Integer.parseInt(columns[0]);
                int to = Integer.parseInt(columns[1]);
                double relatedness = Double.parseDouble(columns[2]);

                boolean bothKnown = similarityGraph.containsVertex(from)
                        && similarityGraph.containsVertex(to);
                if (!bothKnown) {
                    return;
                }

                similarityGraph.addEdge(from, to);
                similarityGraph.setEdgeWeight(from, to, relatedness);
            });
        }

        return similarityGraph;
    }
}

View File

@ -15,15 +15,12 @@ import java.sql.SQLException;
import java.util.Map;
public class ControlDomainRankingSetsService {
private final HikariDataSource dataSource;
private final ControlRendererFactory rendererFactory;
private final DomainRankingSetsService domainRankingSetsService;
@Inject
public ControlDomainRankingSetsService(HikariDataSource dataSource,
ControlRendererFactory rendererFactory,
public ControlDomainRankingSetsService(ControlRendererFactory rendererFactory,
DomainRankingSetsService domainRankingSetsService) {
this.dataSource = dataSource;
this.rendererFactory = rendererFactory;
this.domainRankingSetsService = domainRankingSetsService;
}
@ -47,7 +44,6 @@ public class ControlDomainRankingSetsService {
domainRankingSetsService.upsert(new DomainRankingSetsService.DomainRankingSet(
id,
request.queryParams("description"),
DomainRankingSetsService.DomainSetAlgorithm.valueOf(request.queryParams("algorithm")),
Integer.parseInt(request.queryParams("depth")),
request.queryParams("definition")
));
@ -77,7 +73,6 @@ public class ControlDomainRankingSetsService {
domainRankingSetsService.upsert(new DomainRankingSetsService.DomainRankingSet(
request.queryParams("name").toUpperCase(),
request.queryParams("description"),
DomainRankingSetsService.DomainSetAlgorithm.valueOf(request.queryParams("algorithm")),
Integer.parseInt(request.queryParams("depth")),
request.queryParams("definition")
));
@ -95,17 +90,6 @@ public class ControlDomainRankingSetsService {
}
private Object rankingSetModel(Request request, Response response) throws SQLException {
var model = domainRankingSetsService.get(request.params("id")).orElseThrow();
return Map.of("rankingSet", model,
"selectedAlgo", Map.of(
"special", model.algorithm() == DomainRankingSetsService.DomainSetAlgorithm.SPECIAL,
"adjacency_cheirank", model.algorithm() == DomainRankingSetsService.DomainSetAlgorithm.ADJACENCY_CHEIRANK,
"adjacency_pagerank", model.algorithm() == DomainRankingSetsService.DomainSetAlgorithm.ADJACENCY_PAGERANK,
"links_cheirank", model.algorithm() == DomainRankingSetsService.DomainSetAlgorithm.LINKS_CHEIRANK,
"links_pagerank", model.algorithm() == DomainRankingSetsService.DomainSetAlgorithm.LINKS_PAGERANK)
);
return Map.of("rankingSet", model);
}
}

View File

@ -16,14 +16,12 @@
<tr>
<th>Name</th>
<th>Description</th>
<th>Algorithm</th>
<th>Depth</th>
</tr>
{{#each rankingSets}}
<tr>
<td><a href="/domain-ranking-sets/{{name}}">{{name}}</a></td>
<td>{{description}}</td>
<td>{{algorithm}}</td>
<td>{{depth}}</td>
</tr>
{{/each}}

View File

@ -21,23 +21,6 @@
</div>
</td>
</tr>
<tr>
<th><label for="algorithm">Algorithm</label></th>
<td>
<select id="algorithm" name="algorithm">
<option value="LINKS_PAGERANK">LINKS_PAGERANK</option>
<option value="LINKS_CHEIRANK">LINKS_CHEIRANK</option>
<option value="ADJACENCY_PAGERANK">ADJACENCY_PAGERANK</option>
<option value="ADJACENCY_CHEIRANK">ADJACENCY_CHEIRANK</option>
</select>
<div>
<small class="text-muted">
The algorithm used to rank the domains. The LINKS algorithms use the link graph, and the ADJACENCY
algorithms use the adjacency graph. CheiRank is a variant of PageRank that uses the reversed graph.
</small>
</div>
</td>
</tr>
<tr>
<th><label for="description">Description</label></th>
<td>
@ -61,8 +44,12 @@
<textarea name="definition" id="definition" rows="10" style="width: 100%">{{definition}}</textarea>
<div>
<small class="text-muted">A list of domain names, one per line, possibly globbed with SQL-style '%' wildcards.
These are used as the origin point for the Personalized PageRank algorithm, and will be considered
the central points of the link or adjacency graph. If no domains are specified, the entire domain space is used, as per the PageRank paper.
If provided, these are used as the origin point for the Personalized PageRank algorithm, and will be considered
the central points of the link or adjacency graph. If no domains are specified, the entire domain space is used,
as per the PageRank paper.
<br><br>
If similarity data is available and domains are specified, the similarity data is used as basis for the ranking
calculation instead, providing a much more coherent ranking.
</small>
</div>
</td></tr>

View File

@ -22,27 +22,6 @@
</div>
</td>
</tr>
<tr>
<th><label for="algorithm">Algorithm</label></th>
<td>
{{#if special}}<input type="hidden" name="algorithm" value="{{algorithm}}" />{{/if}}
<select id="algorithm" name="algorithm" {{#if special}}disabled{{/if}}>
{{#with algorithm}}
<option value="SPECIAL" disabled {{#if selectedAlgo.special}}selected{{/if}}>SPECIAL</option>
<option value="LINKS_PAGERANK" {{#if selectedAlgo.links_pagerank}}selected{{/if}}>LINKS_PAGERANK</option>
<option value="LINKS_CHEIRANK" {{#if selectedAlgo.links_cheirank}}selected{{/if}}>LINKS_CHEIRANK</option>
<option value="ADJACENCY_PAGERANK" {{#if selectedAlgo.adjacency_pagerank}}selected{{/if}}>ADJACENCY_PAGERANK</option>
<option value="ADJACENCY_CHEIRANK" {{#if selectedAlgo.adjacency_cheirank}}selected{{/if}}>ADJACENCY_CHEIRANK</option>
{{/with}}
</select>
<div>
<small class="text-muted">
The algorithm used to rank the domains. The LINKS algorithms use the link graph, and the ADJACENCY
algorithms use the adjacency graph. CheiRank is a variant of PageRank that uses the reversed graph.
</small>
</div>
</td>
</tr>
<tr>
<th><label for="description">Description</label></th>
<td>
@ -67,8 +46,12 @@
<textarea name="definition" id="definition" rows="10" style="width: 100%">{{definition}}</textarea>
<div>
<small class="text-muted">A list of domain names, one per line, possibly globbed with SQL-style '%' wildcards.
These are used as the origin point for the Personalized PageRank algorithm, and will be considered
the central points of the link or adjacency graph. If no domains are specified, the entire domain space is used, as per the PageRank paper.
If provided, these are used as the origin point for the Personalized PageRank algorithm, and will be considered
the central points of the link or adjacency graph. If no domains are specified, the entire domain space is used,
as per the PageRank paper.
<br><br>
If similarity data is available and domains are specified, the similarity data is used as basis for the ranking
calculation instead, providing a much more coherent ranking.
</small>
</div>
</td></tr>

View File

@ -8,25 +8,23 @@ import nu.marginalia.db.DomainRankingSetsService;
import nu.marginalia.db.DomainTypes;
import nu.marginalia.index.IndexServicesFactory;
import nu.marginalia.index.searchset.SearchSet;
import nu.marginalia.ranking.RankingAlgorithm;
import nu.marginalia.ranking.ReversePageRank;
import nu.marginalia.ranking.StandardPageRank;
import nu.marginalia.ranking.*;
import nu.marginalia.ranking.accumulator.RankingResultHashMapAccumulator;
import nu.marginalia.ranking.accumulator.RankingResultHashSetAccumulator;
import nu.marginalia.ranking.data.RankingDomainFetcher;
import nu.marginalia.ranking.data.RankingDomainFetcherForSimilarityData;
import nu.marginalia.index.svc.searchset.RankingSearchSet;
import nu.marginalia.index.svc.searchset.SearchSetAny;
import nu.marginalia.ranking.DomainRankings;
import nu.marginalia.index.db.DbUpdateRanks;
import nu.marginalia.ranking.data.GraphSource;
import nu.marginalia.ranking.data.LinkGraphSource;
import nu.marginalia.ranking.data.SimilarityGraphSource;
import nu.marginalia.service.control.ServiceEventLog;
import nu.marginalia.service.control.ServiceHeartbeat;
import nu.marginalia.service.module.ServiceConfiguration;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.sql.SQLException;
import java.util.List;
import java.util.Objects;
import java.util.concurrent.ConcurrentHashMap;
@ -34,13 +32,12 @@ import java.util.concurrent.ConcurrentHashMap;
public class IndexSearchSetsService {
private final Logger logger = LoggerFactory.getLogger(getClass());
private final DomainTypes domainTypes;
private final ServiceHeartbeat heartbeat;
private final IndexServicesFactory indexServicesFactory;
private final ServiceEventLog eventLog;
private final DomainRankingSetsService domainRankingSetsService;
private final DbUpdateRanks dbUpdateRanks;
private final RankingDomainFetcher similarityDomains;
private final RankingDomainFetcher linksDomains;
private final GraphSource similarityDomains;
private final GraphSource linksDomains;
private final ConcurrentHashMap<String, SearchSet> rankingSets = new ConcurrentHashMap<>();
// Below are binary indices that are used to constrain a search
@ -55,23 +52,21 @@ public class IndexSearchSetsService {
@Inject
public IndexSearchSetsService(DomainTypes domainTypes,
ServiceConfiguration serviceConfiguration,
ServiceHeartbeat heartbeat,
RankingDomainFetcher rankingDomains,
RankingDomainFetcherForSimilarityData similarityDomains,
LinkGraphSource rankingDomains,
SimilarityGraphSource similarityDomains,
IndexServicesFactory indexServicesFactory,
ServiceEventLog eventLog,
DomainRankingSetsService domainRankingSetsService,
DbUpdateRanks dbUpdateRanks) throws IOException {
this.nodeId = serviceConfiguration.node();
this.domainTypes = domainTypes;
this.heartbeat = heartbeat;
this.indexServicesFactory = indexServicesFactory;
this.eventLog = eventLog;
this.domainRankingSetsService = domainRankingSetsService;
this.dbUpdateRanks = dbUpdateRanks;
if (similarityDomains.hasData()) {
if (similarityDomains.isAvailable()) {
this.similarityDomains = similarityDomains;
this.linksDomains = rankingDomains;
}
@ -126,13 +121,13 @@ public class IndexSearchSetsService {
}
try {
if (DomainRankingSetsService.DomainSetAlgorithm.SPECIAL.equals(rankingSet.algorithm())) {
if (rankingSet.isSpecial()) {
switch (rankingSet.name()) {
case "BLOGS" -> recalculateBlogsSet(rankingSet);
case "NONE" -> {} // No-op
}
} else {
recalculateNornal(rankingSet);
recalculateNormal(rankingSet);
}
}
catch (Exception ex) {
@ -142,18 +137,18 @@ public class IndexSearchSetsService {
}
}
private void recalculateNornal(DomainRankingSetsService.DomainRankingSet rankingSet) {
String[] domains = rankingSet.domains();
private void recalculateNormal(DomainRankingSetsService.DomainRankingSet rankingSet) {
List<String> domains = List.of(rankingSet.domains());
RankingAlgorithm rankingAlgorithm = switch (rankingSet.algorithm()) {
case LINKS_PAGERANK -> new StandardPageRank(linksDomains, domains);
case LINKS_CHEIRANK -> new ReversePageRank(linksDomains, domains);
case ADJACENCY_PAGERANK -> new StandardPageRank(similarityDomains, domains);
case ADJACENCY_CHEIRANK -> new ReversePageRank(similarityDomains, domains);
default -> throw new IllegalStateException("Unexpected value: " + rankingSet.algorithm());
};
GraphSource source;
var data = rankingAlgorithm.pageRankWithPeripheralNodes(rankingSet.depth(), RankingResultHashSetAccumulator::new);
// Similarity ranking does not behave well with an empty set of domains
if (domains.isEmpty()) source = linksDomains;
else source = similarityDomains;
var data = PageRankDomainRanker
.forDomainNames(source, domains)
.calculate(rankingSet.depth(), RankingResultHashSetAccumulator::new);
var set = new RankingSearchSet(rankingSet.name(), rankingSet.fileName(indexServicesFactory.getSearchSetsBase()), data);
rankingSets.put(rankingSet.name(), set);
@ -185,9 +180,21 @@ public class IndexSearchSetsService {
}
private void updateDomainRankings(DomainRankingSetsService.DomainRankingSet rankingSet) {
List<String> domains = List.of(rankingSet.domains());
var spr = new StandardPageRank(similarityDomains, rankingSet.domains());
var ranks = spr.pageRankWithPeripheralNodes(rankingSet.depth(), () -> new RankingResultHashMapAccumulator(rankingSet.depth()));
final GraphSource source;
if (domains.isEmpty()) {
// Similarity ranking does not behave well with an empty set of domains
source = linksDomains;
}
else {
source = similarityDomains;
}
var ranks = PageRankDomainRanker
.forDomainNames(source, domains)
.calculate(rankingSet.depth(), () -> new RankingResultHashMapAccumulator(rankingSet.depth()));
synchronized (this) {
domainRankings = new DomainRankings(ranks);