Merge pull request #80 from MarginaliaSearch/ranking-algorithms
Clean up domain ranking code
This commit is contained in:
commit
d05c916491
@ -25,7 +25,7 @@ public class DomainRankingSetsService {
|
||||
public Optional<DomainRankingSet> get(String name) throws SQLException {
|
||||
try (var conn = dataSource.getConnection();
|
||||
var stmt = conn.prepareStatement("""
|
||||
SELECT NAME, DESCRIPTION, ALGORITHM, DEPTH, DEFINITION
|
||||
SELECT NAME, DESCRIPTION, DEPTH, DEFINITION
|
||||
FROM CONF_DOMAIN_RANKING_SET
|
||||
WHERE NAME = ?
|
||||
""")) {
|
||||
@ -39,7 +39,6 @@ public class DomainRankingSetsService {
|
||||
return Optional.of(new DomainRankingSet(
|
||||
rs.getString("NAME"),
|
||||
rs.getString("DESCRIPTION"),
|
||||
DomainSetAlgorithm.valueOf(rs.getString("ALGORITHM")),
|
||||
rs.getInt("DEPTH"),
|
||||
rs.getString("DEFINITION")
|
||||
));
|
||||
@ -53,15 +52,14 @@ public class DomainRankingSetsService {
|
||||
public void upsert(DomainRankingSet domainRankingSet) {
|
||||
try (var conn = dataSource.getConnection();
|
||||
var stmt = conn.prepareStatement("""
|
||||
REPLACE INTO CONF_DOMAIN_RANKING_SET(NAME, DESCRIPTION, ALGORITHM, DEPTH, DEFINITION)
|
||||
VALUES (?, ?, ?, ?, ?)
|
||||
REPLACE INTO CONF_DOMAIN_RANKING_SET(NAME, DESCRIPTION, DEPTH, DEFINITION)
|
||||
VALUES (?, ?, ?, ?)
|
||||
"""))
|
||||
{
|
||||
stmt.setString(1, domainRankingSet.name());
|
||||
stmt.setString(2, domainRankingSet.description());
|
||||
stmt.setString(3, domainRankingSet.algorithm().name());
|
||||
stmt.setInt(4, domainRankingSet.depth());
|
||||
stmt.setString(5, domainRankingSet.definition());
|
||||
stmt.setInt(3, domainRankingSet.depth());
|
||||
stmt.setString(4, domainRankingSet.definition());
|
||||
stmt.executeUpdate();
|
||||
|
||||
if (!conn.getAutoCommit())
|
||||
@ -94,7 +92,7 @@ public class DomainRankingSetsService {
|
||||
|
||||
try (var conn = dataSource.getConnection();
|
||||
var stmt = conn.prepareStatement("""
|
||||
SELECT NAME, DESCRIPTION, ALGORITHM, DEPTH, DEFINITION
|
||||
SELECT NAME, DESCRIPTION, DEPTH, DEFINITION
|
||||
FROM CONF_DOMAIN_RANKING_SET
|
||||
""")) {
|
||||
var rs = stmt.executeQuery();
|
||||
@ -105,7 +103,6 @@ public class DomainRankingSetsService {
|
||||
new DomainRankingSet(
|
||||
rs.getString("NAME"),
|
||||
rs.getString("DESCRIPTION"),
|
||||
DomainSetAlgorithm.valueOf(rs.getString("ALGORITHM")),
|
||||
rs.getInt("DEPTH"),
|
||||
rs.getString("DEFINITION"))
|
||||
);
|
||||
@ -118,31 +115,17 @@ public class DomainRankingSetsService {
|
||||
}
|
||||
}
|
||||
|
||||
public enum DomainSetAlgorithm {
|
||||
/** Use link graph, do a pagerank */
|
||||
LINKS_PAGERANK,
|
||||
/** Use link graph, do a cheirank */
|
||||
LINKS_CHEIRANK,
|
||||
/** Use adjacency graph, do a pagerank */
|
||||
ADJACENCY_PAGERANK,
|
||||
/** Use adjacency graph, do a cheirank */
|
||||
ADJACENCY_CHEIRANK,
|
||||
/** For reserved names. Use special algorithm, function of name */
|
||||
SPECIAL
|
||||
};
|
||||
|
||||
/** Defines a domain ranking set, parameters for the ranking algorithms.
|
||||
*
|
||||
* @param name Key and name of the set
|
||||
* @param description Human-readable description
|
||||
* @param algorithm Algorithm to use
|
||||
* @param depth Depth of the algorithm
|
||||
* @param definition Definition of the set, typically a list of domains or globs for domain-names
|
||||
* */
|
||||
@With
|
||||
public record DomainRankingSet(String name,
|
||||
String description,
|
||||
DomainSetAlgorithm algorithm,
|
||||
int depth,
|
||||
String definition)
|
||||
{
|
||||
@ -159,7 +142,7 @@ public class DomainRankingSetsService {
|
||||
}
|
||||
|
||||
public boolean isSpecial() {
|
||||
return algorithm() == DomainSetAlgorithm.SPECIAL;
|
||||
return name().equals("BLOGS") || name().equals("NONE") || name().equals("RANK");
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -0,0 +1 @@
|
||||
ALTER TABLE CONF_DOMAIN_RANKING_SET DROP COLUMN ALGORITHM;
|
@ -56,14 +56,12 @@ class DomainRankingSetsServiceTest {
|
||||
var newValue = new DomainRankingSetsService.DomainRankingSet(
|
||||
"test",
|
||||
"Test domain set",
|
||||
DomainRankingSetsService.DomainSetAlgorithm.ADJACENCY_CHEIRANK,
|
||||
10,
|
||||
"test\\.nu"
|
||||
);
|
||||
var newValue2 = new DomainRankingSetsService.DomainRankingSet(
|
||||
"test2",
|
||||
"Test domain set 2",
|
||||
DomainRankingSetsService.DomainSetAlgorithm.LINKS_PAGERANK,
|
||||
20,
|
||||
"test\\.nu 2"
|
||||
);
|
||||
|
@ -20,6 +20,8 @@ dependencies {
|
||||
implementation project(':code:common:service-client')
|
||||
implementation project(':code:api:query-api')
|
||||
|
||||
implementation 'org.jgrapht:jgrapht-core:1.5.2'
|
||||
|
||||
implementation libs.bundles.slf4j
|
||||
implementation libs.bundles.mariadb
|
||||
implementation libs.guice
|
||||
@ -27,8 +29,17 @@ dependencies {
|
||||
implementation libs.roaringbitmap
|
||||
implementation libs.trove
|
||||
implementation libs.fastutil
|
||||
implementation libs.hll
|
||||
|
||||
testImplementation project(':code:libraries:array')
|
||||
|
||||
testImplementation libs.commons.lang3
|
||||
testImplementation libs.bundles.slf4j.test
|
||||
testImplementation libs.bundles.junit
|
||||
testImplementation libs.mockito
|
||||
|
||||
testImplementation platform('org.testcontainers:testcontainers-bom:1.17.4')
|
||||
testImplementation 'org.testcontainers:mariadb:1.17.4'
|
||||
testImplementation 'org.testcontainers:junit-jupiter:1.17.4'
|
||||
testImplementation project(':code:libraries:test-helpers')
|
||||
}
|
@ -1,19 +1,34 @@
|
||||
# Domain Ranking
|
||||
|
||||
Contains domain ranking algorithms.
|
||||
Contains domain ranking algorithms. The domain ranking algorithms are based on
|
||||
the JGraphT library.
|
||||
|
||||
Two principal algorithms are available, the standard PageRank algorithm,
|
||||
and personalized pagerank; each are available for two graphs, the link graph
|
||||
and a similarity graph where each edge corresponds to the similarity between
|
||||
the sets of incident links to two domains, their cosine similarity acting as
|
||||
the weight of the links.
|
||||
|
||||
With the standard PageRank algorithm, the similarity graph does not produce
|
||||
anything useful, but something magical happens when you apply Personalized PageRank
|
||||
to this graph. It turns into a very good "vibe"-sensitive ranking algorithm.
|
||||
|
||||
It's unclear if this is a well known result, but it's a very interesting one
|
||||
for creating a ranking algorithm that is focused on a particular segment of the web.
|
||||
|
||||
## Central Classes
|
||||
|
||||
### Algorithms
|
||||
* [RankingAlgorithm](src/main/java/nu/marginalia/ranking/RankingAlgorithm.java)
|
||||
* [StandardPageRank](src/main/java/nu/marginalia/ranking/StandardPageRank.java)
|
||||
* [ReversePageRank](src/main/java/nu/marginalia/ranking/ReversePageRank.java) "CheiRank"
|
||||
* [PageRankDomainRanker](src/main/java/nu/marginalia/ranking/PageRankDomainRanker.java) - Ranks domains using the
|
||||
PageRank or Personalized PageRank algorithm depending on whether a list of influence domains is provided.
|
||||
|
||||
### Data sources
|
||||
|
||||
* [RankingDomainFetcher](src/main/java/nu/marginalia/ranking/data/RankingDomainFetcher.java) fetches link data.
|
||||
* [RankingDomainFetcherForSimilarityData](src/main/java/nu/marginalia/ranking/data/RankingDomainFetcherForSimilarityData.java) fetches website similarity data.
|
||||
* [LinkGraphSource](src/main/java/nu/marginalia/ranking/data/LinkGraphSource.java) - fetches the link graph
|
||||
* [InvertedLinkGraphSource](src/main/java/nu/marginalia/ranking/data/InvertedLinkGraphSource.java) - fetches the inverted link graph
|
||||
* [SimilarityGraphSource](src/main/java/nu/marginalia/ranking/data/SimilarityGraphSource.java) - fetches the similarity graph from the database
|
||||
|
||||
Note that the similarity graph needs to be precomputed and stored in the database for
|
||||
the similarity graph source to be available.
|
||||
|
||||
## See Also
|
||||
|
||||
|
@ -0,0 +1,60 @@
|
||||
package nu.marginalia.ranking;
|
||||
|
||||
import gnu.trove.list.TIntList;
|
||||
import gnu.trove.list.array.TIntArrayList;
|
||||
import nu.marginalia.ranking.accumulator.RankingResultAccumulator;
|
||||
import nu.marginalia.ranking.data.GraphSource;
|
||||
import nu.marginalia.ranking.jgrapht.PersonalizedPageRank;
|
||||
import org.jgrapht.Graph;
|
||||
import org.jgrapht.alg.interfaces.VertexScoringAlgorithm;
|
||||
import org.jgrapht.alg.scoring.PageRank;
|
||||
|
||||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.function.Supplier;
|
||||
|
||||
public class PageRankDomainRanker implements RankingAlgorithm {
|
||||
private final List<Integer> influenceSet;
|
||||
private final Graph<Integer, ?> graph;
|
||||
|
||||
public PageRankDomainRanker(GraphSource source,
|
||||
List<Integer> influenceSet)
|
||||
{
|
||||
this.influenceSet = influenceSet;
|
||||
this.graph = source.getGraph();
|
||||
}
|
||||
|
||||
public static PageRankDomainRanker forDomainNames(GraphSource source,
|
||||
List<String> influenceSet)
|
||||
{
|
||||
return new PageRankDomainRanker(source, source.domainIds(influenceSet));
|
||||
}
|
||||
|
||||
@Override
|
||||
public <T> T calculate(int resultCount, Supplier<RankingResultAccumulator<T>> accumulatorP) {
|
||||
VertexScoringAlgorithm<Integer, Double> pageRank;
|
||||
|
||||
if (influenceSet != null && !influenceSet.isEmpty()) {
|
||||
pageRank = new PersonalizedPageRank<>(graph, influenceSet);
|
||||
}
|
||||
else {
|
||||
pageRank = new PageRank<>(graph);
|
||||
}
|
||||
|
||||
TIntList results = new TIntArrayList(resultCount);
|
||||
pageRank.getScores().entrySet()
|
||||
.stream()
|
||||
.sorted(Comparator.comparing((Map.Entry<Integer, Double> e) -> -e.getValue()))
|
||||
.limit(resultCount)
|
||||
.map(Map.Entry::getKey)
|
||||
.forEach(results::add);
|
||||
|
||||
var accumulator = accumulatorP.get();
|
||||
for (int i = 0; i < results.size(); i++) {
|
||||
accumulator.add(results.get(i), i);
|
||||
}
|
||||
return accumulator.get();
|
||||
}
|
||||
|
||||
}
|
@ -1,281 +1,15 @@
|
||||
package nu.marginalia.ranking;
|
||||
|
||||
import gnu.trove.list.array.TIntArrayList;
|
||||
import gnu.trove.map.hash.TIntIntHashMap;
|
||||
import gnu.trove.map.hash.TIntObjectHashMap;
|
||||
import it.unimi.dsi.fastutil.ints.IntArrays;
|
||||
import nu.marginalia.ranking.accumulator.RankingResultAccumulator;
|
||||
import nu.marginalia.ranking.data.RankingDomainFetcher;
|
||||
import nu.marginalia.ranking.data.RankingDomainData;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
import java.util.function.Supplier;
|
||||
|
||||
import static java.lang.Math.min;
|
||||
|
||||
public abstract class RankingAlgorithm {
|
||||
protected final TIntObjectHashMap<RankingDomainData> domainsById = new TIntObjectHashMap<>();
|
||||
protected final TIntIntHashMap domainIndexToId = new TIntIntHashMap();
|
||||
protected final TIntIntHashMap domainIdToIndex = new TIntIntHashMap();
|
||||
|
||||
protected TIntArrayList[] linkDataSrc2Dest;
|
||||
protected TIntArrayList[] linkDataDest2Src;
|
||||
|
||||
public final Set<String> originDomains = new HashSet<>();
|
||||
public final Set<Integer> originDomainIds = new HashSet<>();
|
||||
|
||||
private int maxKnownUrls = Integer.MAX_VALUE;
|
||||
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
|
||||
private final RankingDomainFetcher domains;
|
||||
|
||||
public RankingAlgorithm(RankingDomainFetcher domains, String... origins) {
|
||||
this.domains = domains;
|
||||
|
||||
originDomains.addAll(Arrays.asList(origins));
|
||||
|
||||
domains.getDomains(domainData -> {
|
||||
int id = domainData.id;
|
||||
|
||||
domainsById.put(id, domainData);
|
||||
|
||||
domainIndexToId.put(domainIndexToId.size(), id);
|
||||
domainIdToIndex.put(id, domainIdToIndex.size());
|
||||
});
|
||||
|
||||
linkDataSrc2Dest = new TIntArrayList[domainIndexToId.size()];
|
||||
linkDataDest2Src = new TIntArrayList[domainIndexToId.size()];
|
||||
|
||||
domains.eachDomainLink((src, dst) -> {
|
||||
if (src == dst) return;
|
||||
|
||||
if (domainsById.contains(src) && domainsById.contains(dst)) {
|
||||
|
||||
int srcIdx = domainIdToIndex.get(src);
|
||||
int dstIdx = domainIdToIndex.get(domainsById.get(dst).resolveAlias());
|
||||
|
||||
if (linkDataSrc2Dest[srcIdx] == null) {
|
||||
linkDataSrc2Dest[srcIdx] = new TIntArrayList();
|
||||
}
|
||||
linkDataSrc2Dest[srcIdx].add(dstIdx);
|
||||
|
||||
if (linkDataDest2Src[dstIdx] == null) {
|
||||
linkDataDest2Src[dstIdx] = new TIntArrayList();
|
||||
}
|
||||
linkDataDest2Src[dstIdx].add(srcIdx);
|
||||
}
|
||||
});
|
||||
|
||||
for (var namePattern : this.originDomains) {
|
||||
domains.domainsByPattern(namePattern, i -> {
|
||||
int ival = domainIdToIndex.get(i);
|
||||
if (ival != domainIdToIndex.getNoEntryValue() || domainIndexToId.get(0) == i) {
|
||||
originDomainIds.add(ival);
|
||||
}
|
||||
else {
|
||||
logger.debug("No value for {}", i);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
logger.info("Origin Domains: {}", originDomainIds.size());
|
||||
}
|
||||
|
||||
public RankingDomainData getDomainData(int id) {
|
||||
return domainsById.get(id);
|
||||
}
|
||||
|
||||
public void addPeripheralNodes() {
|
||||
|
||||
int newNodesIdxCutoff = domainIdToIndex.size();
|
||||
|
||||
logger.info("Inserting peripheral nodes");
|
||||
|
||||
domains.getPeripheralDomains(domainData -> {
|
||||
int id = domainData.id;
|
||||
|
||||
if (domainsById.put(id, domainData) == null) { // true if id was not already present
|
||||
domainIndexToId.put(domainIndexToId.size(), id);
|
||||
domainIdToIndex.put(id, domainIdToIndex.size());
|
||||
}
|
||||
});
|
||||
|
||||
linkDataSrc2Dest = Arrays.copyOf(linkDataSrc2Dest, domainIndexToId.size());
|
||||
linkDataDest2Src = Arrays.copyOf(linkDataDest2Src, domainIndexToId.size());
|
||||
|
||||
domains.eachDomainLink((src, dst) -> {
|
||||
if (src == dst) return;
|
||||
|
||||
if (domainsById.contains(src) && domainsById.contains(dst)) {
|
||||
int srcIdx = domainIdToIndex.get(src);
|
||||
int dstIdx = domainIdToIndex.get(domainsById.get(dst).resolveAlias());
|
||||
|
||||
// This looks like a bug, but it improves the results
|
||||
if (srcIdx < newNodesIdxCutoff || dstIdx < newNodesIdxCutoff)
|
||||
return;
|
||||
|
||||
if (linkDataSrc2Dest[srcIdx] == null) {
|
||||
linkDataSrc2Dest[srcIdx] = new TIntArrayList();
|
||||
}
|
||||
linkDataSrc2Dest[srcIdx].add(dstIdx);
|
||||
|
||||
if (linkDataDest2Src[dstIdx] == null) {
|
||||
linkDataDest2Src[dstIdx] = new TIntArrayList();
|
||||
}
|
||||
linkDataDest2Src[dstIdx].add(srcIdx);
|
||||
}
|
||||
});
|
||||
|
||||
logger.info("Peripheral nodes inserted {} -> {}", newNodesIdxCutoff, domainIdToIndex.size());
|
||||
}
|
||||
|
||||
public int size() {
|
||||
return domainsById.size();
|
||||
}
|
||||
|
||||
public <T> T pageRank(int resultCount, Supplier<RankingResultAccumulator<T>> accumulatorP) {
|
||||
RankVector rank = new RankVector(1.d / domainsById.size());
|
||||
|
||||
int iter_max = 100;
|
||||
for (int i = 0; i < iter_max; i++) {
|
||||
RankVector newRank = createNewRankVector(rank);
|
||||
|
||||
double oldNorm = rank.norm();
|
||||
double newNorm = newRank.norm();
|
||||
double dNorm = oldNorm - newNorm;
|
||||
|
||||
if (i < iter_max-1) {
|
||||
adjustRankVector(newRank, dNorm, oldNorm);
|
||||
}
|
||||
|
||||
rank = newRank;
|
||||
}
|
||||
|
||||
|
||||
return rank.getRanking(resultCount, accumulatorP).get();
|
||||
}
|
||||
|
||||
public <T> T pageRankWithPeripheralNodes(int resultCount, Supplier<RankingResultAccumulator<T>> accumulatorP) {
|
||||
RankVector rank = new RankVector(1.d / domainsById.size());
|
||||
|
||||
int iter_max = 100;
|
||||
|
||||
for (int i = 0; i < iter_max; i++) {
|
||||
if (i == iter_max-1) {
|
||||
addPeripheralNodes();
|
||||
}
|
||||
RankVector newRank = createNewRankVector(rank);
|
||||
|
||||
double oldNorm = rank.norm();
|
||||
double newNorm = newRank.norm();
|
||||
double dNorm = oldNorm - newNorm;
|
||||
|
||||
if (i < iter_max-1) {
|
||||
adjustRankVector(newRank, dNorm, oldNorm);
|
||||
}
|
||||
|
||||
rank = newRank;
|
||||
}
|
||||
|
||||
logger.info("PRWPN iteration done");
|
||||
|
||||
return rank.getRanking(resultCount, accumulatorP).get();
|
||||
}
|
||||
|
||||
abstract void adjustRankVector(RankVector vector, double dNorm, double oldNorm);
|
||||
|
||||
abstract RankVector createNewRankVector(RankVector rank);
|
||||
|
||||
public boolean includeInRanking(RankingDomainData data) {
|
||||
if (data.isAlias())
|
||||
return false;
|
||||
if (data.isSpecial())
|
||||
return false;
|
||||
if (data.isSocialMedia())
|
||||
return false;
|
||||
if (data.knownUrls > maxKnownUrls)
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
public void setMaxKnownUrls(int maxKnownUrls) {
|
||||
this.maxKnownUrls = maxKnownUrls;
|
||||
}
|
||||
public class RankVector {
|
||||
private final double[] rank;
|
||||
|
||||
public RankVector(double defaultValue) {
|
||||
rank = new double[domainIndexToId.size()];
|
||||
if (defaultValue != 0.) {
|
||||
Arrays.fill(rank, defaultValue);
|
||||
}
|
||||
}
|
||||
|
||||
public void set(int id, double value) {
|
||||
rank[id] = value;
|
||||
}
|
||||
|
||||
public void increment(int id, double value) {
|
||||
rank[id] += value;
|
||||
}
|
||||
|
||||
public double get(int id) {
|
||||
if (id >= rank.length) return 0.;
|
||||
|
||||
return rank[id];
|
||||
}
|
||||
|
||||
public double norm() {
|
||||
double v = 0.;
|
||||
for (double value : rank) {
|
||||
v += Math.abs(value);
|
||||
}
|
||||
return v;
|
||||
}
|
||||
|
||||
public double norm(RankVector other) {
|
||||
double v = 0.;
|
||||
for (int i = 0; i < rank.length; i++) {
|
||||
v += Math.abs(rank[i] - other.get(i));
|
||||
}
|
||||
return v;
|
||||
}
|
||||
|
||||
public <T> RankingResultAccumulator<T> getRanking(int numResults, Supplier<RankingResultAccumulator<T>> accumulatorP) {
|
||||
|
||||
if (numResults <= 0) {
|
||||
numResults = domainIdToIndex.size();
|
||||
}
|
||||
numResults = min(numResults, min(domainIdToIndex.size(), rank.length));
|
||||
|
||||
int[] nodes = sortOrder(rank);
|
||||
var accumulator = accumulatorP.get();
|
||||
|
||||
for (int i = 0; i < numResults; i++) {
|
||||
int id = domainIndexToId.get(nodes[i]);
|
||||
|
||||
if (includeInRanking(domainsById.get(id)))
|
||||
accumulator.add(id, i);
|
||||
}
|
||||
|
||||
return accumulator;
|
||||
}
|
||||
|
||||
private static int[] sortOrder(double[] values) {
|
||||
|
||||
int[] ret = new int[values.length];
|
||||
Arrays.setAll(ret, i->i);
|
||||
IntArrays.quickSort(ret, (i,j) -> (int) Math.signum(values[j] - values[i]));
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
public interface RankingAlgorithm {
|
||||
|
||||
/** Calculate domain rankings.
|
||||
*
|
||||
* @param resultCount update the best result count results
|
||||
* @param accumulatorP the accumulator to use to store the results
|
||||
*/
|
||||
<T> T calculate(int resultCount, Supplier<RankingResultAccumulator<T>> accumulatorP);
|
||||
}
|
||||
|
@ -1,42 +0,0 @@
|
||||
package nu.marginalia.ranking;
|
||||
|
||||
|
||||
import nu.marginalia.ranking.data.RankingDomainFetcher;
|
||||
|
||||
public class ReversePageRank extends RankingAlgorithm {
|
||||
|
||||
|
||||
public ReversePageRank(RankingDomainFetcher domains, String... origins) {
|
||||
super(domains, origins);
|
||||
}
|
||||
|
||||
@Override
|
||||
RankVector createNewRankVector(RankVector rank) {
|
||||
|
||||
double rankNorm = rank.norm();
|
||||
RankVector newRank = new RankVector(0);
|
||||
|
||||
for (int domainId = 0; domainId < domainIndexToId.size(); domainId++) {
|
||||
|
||||
var links = linkDataSrc2Dest[domainId];
|
||||
double newRankValue = 0;
|
||||
|
||||
if (links != null && links.size() > 0) {
|
||||
for (int j = 0; j < links.size(); j++) {
|
||||
var revLinks = linkDataDest2Src[links.getQuick(j)];
|
||||
newRankValue += rank.get(links.getQuick(j)) / revLinks.size();
|
||||
}
|
||||
}
|
||||
|
||||
newRank.set(domainId, 0.85*newRankValue/rankNorm);
|
||||
}
|
||||
|
||||
return newRank;
|
||||
}
|
||||
|
||||
@Override
|
||||
void adjustRankVector(RankVector vector, double dNorm, double oldNorm) {
|
||||
originDomainIds.forEach(id -> vector.increment(id, 1.0 / originDomainIds.size()));
|
||||
}
|
||||
|
||||
}
|
@ -1,50 +0,0 @@
|
||||
package nu.marginalia.ranking;
|
||||
|
||||
|
||||
import nu.marginalia.ranking.data.RankingDomainFetcher;
|
||||
|
||||
public class StandardPageRank extends RankingAlgorithm {
|
||||
|
||||
public StandardPageRank(RankingDomainFetcher domains, String... origins) {
|
||||
super(domains, origins);
|
||||
}
|
||||
|
||||
@Override
|
||||
RankVector createNewRankVector(RankVector rank) {
|
||||
RankVector newRank = new RankVector(0);
|
||||
|
||||
for (int domainId = 0; domainId < domainIndexToId.size(); domainId++) {
|
||||
|
||||
var links = linkDataDest2Src[domainId];
|
||||
double newRankValue = 0;
|
||||
|
||||
if (links != null && links.size() > 0) {
|
||||
for (int j = 0; j < links.size(); j++) {
|
||||
int linkedDomain = links.getQuick(j);
|
||||
|
||||
final int linkSize;
|
||||
var backLinks = linkDataSrc2Dest[linkedDomain];
|
||||
|
||||
if (backLinks == null) {
|
||||
linkSize = 1;
|
||||
}
|
||||
else {
|
||||
linkSize = backLinks.size();
|
||||
}
|
||||
|
||||
newRankValue += rank.get(linkedDomain) / linkSize;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
newRank.set(domainId, 0.85 * newRankValue);
|
||||
}
|
||||
return newRank;
|
||||
}
|
||||
|
||||
@Override
|
||||
void adjustRankVector(RankVector vector, double dNorm, double oldNorm) {
|
||||
originDomainIds.forEach(id -> vector.increment(id, 0.15 / originDomainIds.size() ));
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,63 @@
|
||||
package nu.marginalia.ranking.data;
|
||||
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import org.jgrapht.Graph;
|
||||
|
||||
import java.sql.SQLException;
|
||||
import java.util.*;
|
||||
|
||||
public abstract class AbstractGraphSource implements GraphSource {
|
||||
protected final HikariDataSource dataSource;
|
||||
|
||||
protected AbstractGraphSource(HikariDataSource dataSource) {
|
||||
this.dataSource = dataSource;
|
||||
}
|
||||
|
||||
@Override
|
||||
public abstract Graph<Integer, ?> getGraph();
|
||||
|
||||
/** Adds all indexed domain ids as vertices to the graph. */
|
||||
protected void addVertices(Graph<Integer, ?> graph) throws SQLException {
|
||||
try (var conn = dataSource.getConnection();
|
||||
var stmt = conn.prepareStatement("""
|
||||
SELECT ID
|
||||
FROM EC_DOMAIN
|
||||
WHERE NODE_AFFINITY > 0
|
||||
""");
|
||||
var rs = stmt.executeQuery())
|
||||
{
|
||||
while (rs.next()) {
|
||||
graph.addVertex(rs.getInt(1));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<Integer> domainIds(List<String> domainNameList) {
|
||||
try (var conn = dataSource.getConnection();
|
||||
var stmt = conn.prepareStatement("""
|
||||
SELECT ID
|
||||
FROM EC_DOMAIN
|
||||
WHERE DOMAIN_NAME LIKE ?
|
||||
"""))
|
||||
{
|
||||
Set<Integer> retSet = new HashSet<>();
|
||||
|
||||
for (String domainName : domainNameList) {
|
||||
stmt.setString(1, domainName);
|
||||
try (var rs = stmt.executeQuery()) {
|
||||
while (rs.next()) {
|
||||
retSet.add(rs.getInt(1));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
var ret = new ArrayList<>(retSet);
|
||||
ret.sort(Comparator.naturalOrder());
|
||||
return ret;
|
||||
}
|
||||
catch (SQLException ex) {
|
||||
throw new RuntimeException(ex);
|
||||
}
|
||||
}
|
||||
}
|
@ -0,0 +1,23 @@
|
||||
package nu.marginalia.ranking.data;
|
||||
|
||||
import org.jgrapht.Graph;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
/** A source for the link graph (or pseudo-link graph)
|
||||
* to use when ranking domain. */
|
||||
public interface GraphSource {
|
||||
|
||||
/** Construct the graph */
|
||||
Graph<Integer, ?> getGraph();
|
||||
|
||||
/** Return a list of domain ids for the given domain names.
|
||||
* The function will also accept SQL-style wildcards,
|
||||
* e.g. "%marginalia.nu" will match "marginalia.nu" and "memex.marginalia.nu".
|
||||
* <p></p>
|
||||
* If multiple wildcards are provided, and overlapping domains are matched,
|
||||
* they will be included only once. The returned list will be sorted in
|
||||
* numerical order of the domain IDs.
|
||||
*/
|
||||
List<Integer> domainIds(List<String> domainNameList);
|
||||
}
|
@ -0,0 +1,49 @@
|
||||
package nu.marginalia.ranking.data;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.query.client.QueryClient;
|
||||
import org.jgrapht.Graph;
|
||||
import org.jgrapht.graph.DefaultDirectedGraph;
|
||||
import org.jgrapht.graph.DefaultEdge;
|
||||
|
||||
import java.sql.SQLException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
/** A source for the inverted link graph,
|
||||
* which is the same as the regular graph except
|
||||
* the direction of the links have been inverted */
|
||||
public class InvertedLinkGraphSource extends AbstractGraphSource {
|
||||
private final QueryClient queryClient;
|
||||
|
||||
@Inject
|
||||
public InvertedLinkGraphSource(HikariDataSource dataSource, QueryClient queryClient) {
|
||||
super(dataSource);
|
||||
this.queryClient = queryClient;
|
||||
}
|
||||
@SneakyThrows
|
||||
@Override
|
||||
public Graph<Integer, ?> getGraph() {
|
||||
Graph<Integer, ?> graph = new DefaultDirectedGraph<>(DefaultEdge.class);
|
||||
|
||||
addVertices(graph);
|
||||
|
||||
var allLinks = queryClient.getAllDomainLinks();
|
||||
var iter = allLinks.iterator();
|
||||
while (iter.advance()) {
|
||||
if (!graph.containsVertex(iter.dest())) {
|
||||
continue;
|
||||
}
|
||||
if (!graph.containsVertex(iter.source())) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Invert the edge
|
||||
graph.addEdge(iter.dest(), iter.source());
|
||||
}
|
||||
|
||||
return graph;
|
||||
}
|
||||
}
|
@ -0,0 +1,43 @@
|
||||
package nu.marginalia.ranking.data;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.query.client.QueryClient;
|
||||
import org.jgrapht.Graph;
|
||||
import org.jgrapht.graph.DefaultDirectedGraph;
|
||||
import org.jgrapht.graph.DefaultEdge;
|
||||
|
||||
/** A source for the regular link graph. */
|
||||
public class LinkGraphSource extends AbstractGraphSource {
|
||||
private final QueryClient queryClient;
|
||||
|
||||
@Inject
|
||||
public LinkGraphSource(HikariDataSource dataSource, QueryClient queryClient) {
|
||||
super(dataSource);
|
||||
this.queryClient = queryClient;
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
@Override
|
||||
public Graph<Integer, ?> getGraph() {
|
||||
Graph<Integer, ?> graph = new DefaultDirectedGraph<>(DefaultEdge.class);
|
||||
|
||||
addVertices(graph);
|
||||
|
||||
var allLinks = queryClient.getAllDomainLinks();
|
||||
var iter = allLinks.iterator();
|
||||
while (iter.advance()) {
|
||||
if (!graph.containsVertex(iter.dest())) {
|
||||
continue;
|
||||
}
|
||||
if (!graph.containsVertex(iter.source())) {
|
||||
continue;
|
||||
}
|
||||
|
||||
graph.addEdge(iter.source(), iter.dest());
|
||||
}
|
||||
|
||||
return graph;
|
||||
}
|
||||
}
|
@ -1,32 +0,0 @@
|
||||
package nu.marginalia.ranking.data;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
import nu.marginalia.model.crawl.DomainIndexingState;
|
||||
|
||||
@Data
|
||||
@AllArgsConstructor
|
||||
public class RankingDomainData {
|
||||
public final int id;
|
||||
public final String name;
|
||||
private int alias;
|
||||
public DomainIndexingState state;
|
||||
public final int knownUrls;
|
||||
|
||||
public int resolveAlias() {
|
||||
if (alias == 0) return id;
|
||||
return alias;
|
||||
}
|
||||
|
||||
public boolean isAlias() {
|
||||
return alias != 0;
|
||||
}
|
||||
|
||||
public boolean isSpecial() {
|
||||
return DomainIndexingState.SPECIAL == state;
|
||||
}
|
||||
|
||||
public boolean isSocialMedia() {
|
||||
return DomainIndexingState.SOCIAL_MEDIA == state;
|
||||
}
|
||||
}
|
@ -1,138 +0,0 @@
|
||||
package nu.marginalia.ranking.data;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import nu.marginalia.db.DomainBlacklistImpl;
|
||||
import nu.marginalia.model.crawl.DomainIndexingState;
|
||||
import nu.marginalia.query.client.QueryClient;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.sql.SQLException;
|
||||
import java.util.function.Consumer;
|
||||
import java.util.function.IntConsumer;
|
||||
|
||||
@Singleton
|
||||
public class RankingDomainFetcher {
|
||||
protected final HikariDataSource dataSource;
|
||||
private final QueryClient queryClient;
|
||||
protected final DomainBlacklistImpl blacklist;
|
||||
protected final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
|
||||
protected boolean getNames = false;
|
||||
|
||||
@Inject
|
||||
public RankingDomainFetcher(HikariDataSource dataSource,
|
||||
QueryClient queryClient,
|
||||
DomainBlacklistImpl blacklist) {
|
||||
this.dataSource = dataSource;
|
||||
this.queryClient = queryClient;
|
||||
this.blacklist = blacklist;
|
||||
}
|
||||
|
||||
public void retainNames() {
|
||||
this.getNames = true;
|
||||
}
|
||||
|
||||
public void getDomains(Consumer<RankingDomainData> consumer) {
|
||||
String query;
|
||||
if (getNames) {
|
||||
query = """
|
||||
SELECT EC_DOMAIN.ID,DOMAIN_NAME,DOMAIN_ALIAS,STATE,KNOWN_URLS
|
||||
FROM EC_DOMAIN
|
||||
INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
|
||||
WHERE NODE_AFFINITY>0
|
||||
GROUP BY EC_DOMAIN.ID
|
||||
""";
|
||||
}
|
||||
else {
|
||||
query = """
|
||||
SELECT EC_DOMAIN.ID,\"\",DOMAIN_ALIAS,STATE,KNOWN_URLS
|
||||
FROM EC_DOMAIN
|
||||
INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
|
||||
WHERE NODE_AFFINITY>0
|
||||
GROUP BY EC_DOMAIN.ID
|
||||
""";
|
||||
}
|
||||
|
||||
getDomains(query, consumer);
|
||||
}
|
||||
|
||||
|
||||
public void getPeripheralDomains(Consumer<RankingDomainData> consumer) {
|
||||
String query;
|
||||
if (getNames) {
|
||||
query = """
|
||||
SELECT EC_DOMAIN.ID,DOMAIN_NAME,DOMAIN_ALIAS,STATE,KNOWN_URLS
|
||||
FROM EC_DOMAIN
|
||||
INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
|
||||
WHERE ((INDEXED>1 AND IS_ALIVE)
|
||||
OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0))
|
||||
GROUP BY EC_DOMAIN.ID
|
||||
""";
|
||||
}
|
||||
else {
|
||||
query = """
|
||||
SELECT EC_DOMAIN.ID,\"\",DOMAIN_ALIAS,STATE,KNOWN_URLS
|
||||
FROM EC_DOMAIN
|
||||
INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
|
||||
WHERE ((INDEXED>1 AND IS_ALIVE)
|
||||
OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0))
|
||||
GROUP BY EC_DOMAIN.ID
|
||||
""";
|
||||
}
|
||||
|
||||
getDomains(query, consumer);
|
||||
}
|
||||
|
||||
protected void getDomains(String query, Consumer<RankingDomainData> consumer) {
|
||||
try (var conn = dataSource.getConnection(); var stmt = conn.prepareStatement(query)) {
|
||||
stmt.setFetchSize(10000);
|
||||
var rsp = stmt.executeQuery();
|
||||
while (rsp.next()) {
|
||||
int id = rsp.getInt(1);
|
||||
if (!blacklist.isBlacklisted(id)) {
|
||||
consumer.accept(
|
||||
new RankingDomainData(id,
|
||||
rsp.getString(2),
|
||||
rsp.getInt(3),
|
||||
DomainIndexingState.valueOf(rsp.getString(4)),
|
||||
rsp.getInt(5)));
|
||||
}
|
||||
}
|
||||
}
|
||||
catch (SQLException ex) {
|
||||
logger.error("Failed to fetch domains", ex);
|
||||
}
|
||||
}
|
||||
|
||||
public void eachDomainLink(DomainLinkConsumer consumer) {
|
||||
|
||||
var allLinks = queryClient.getAllDomainLinks();
|
||||
var iter = allLinks.iterator();
|
||||
|
||||
while (iter.advance()) {
|
||||
consumer.accept(iter.source(), iter.dest());
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
public void domainsByPattern(String pattern, IntConsumer idConsumer) {
|
||||
try (var conn = dataSource.getConnection();
|
||||
var stmt = conn.createStatement()) {
|
||||
// This is sourced from a config file --v
|
||||
var rsp = stmt.executeQuery("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME LIKE '" + pattern + "'");
|
||||
while (rsp.next()) {
|
||||
idConsumer.accept(rsp.getInt(1));
|
||||
}
|
||||
}
|
||||
catch (SQLException ex) {
|
||||
logger.error("Failed to fetch domains by pattern", ex);
|
||||
}
|
||||
}
|
||||
|
||||
/** Callback for consuming a directed link between two domains,
 *  identified by their integer domain ids. */
public interface DomainLinkConsumer {
    void accept(int from, int to);
}
|
||||
}
|
@ -1,93 +0,0 @@
|
||||
package nu.marginalia.ranking.data;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import nu.marginalia.db.DomainBlacklistImpl;
|
||||
import nu.marginalia.query.client.QueryClient;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.sql.SQLException;
|
||||
import java.util.function.Consumer;
|
||||
|
||||
@Singleton
|
||||
public class RankingDomainFetcherForSimilarityData extends RankingDomainFetcher {
|
||||
final boolean hasData;
|
||||
|
||||
@Inject
|
||||
public RankingDomainFetcherForSimilarityData(HikariDataSource dataSource, QueryClient queryClient, DomainBlacklistImpl blacklist) {
|
||||
super(dataSource, queryClient, blacklist);
|
||||
|
||||
hasData = isDomainNeighborTablePopulated(dataSource);
|
||||
}
|
||||
|
||||
private static boolean isDomainNeighborTablePopulated(HikariDataSource dataSource) {
|
||||
try (var conn = dataSource.getConnection();
|
||||
var stmt = conn.createStatement();
|
||||
var rs = stmt.executeQuery("SELECT DOMAIN_ID FROM EC_DOMAIN_NEIGHBORS_2 LIMIT 1")) {
|
||||
|
||||
return rs.next();
|
||||
}
|
||||
catch (SQLException ex) {
|
||||
LoggerFactory
|
||||
.getLogger(RankingDomainFetcherForSimilarityData.class)
|
||||
.error("Failed to get count from EC_DOMAIN_NEIGHBORS_2", ex);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
public boolean hasData() {
|
||||
return hasData;
|
||||
}
|
||||
|
||||
public void eachDomainLink(DomainLinkConsumer consumer) {
|
||||
try (var conn = dataSource.getConnection();
|
||||
var stmt = conn.prepareStatement("SELECT DOMAIN_ID, NEIGHBOR_ID, RELATEDNESS FROM EC_DOMAIN_NEIGHBORS_2"))
|
||||
{
|
||||
stmt.setFetchSize(10000);
|
||||
|
||||
var rsp = stmt.executeQuery();
|
||||
|
||||
while (rsp.next()) {
|
||||
int src = rsp.getInt(1);
|
||||
int dst = rsp.getInt(2);
|
||||
|
||||
// these "links" are bidi
|
||||
consumer.accept(src, dst);
|
||||
consumer.accept(dst, src);
|
||||
}
|
||||
}
|
||||
catch (SQLException ex) {
|
||||
logger.error("Failed to fetch domain links", ex);
|
||||
}
|
||||
}
|
||||
|
||||
public void getDomains(Consumer<RankingDomainData> consumer) {
|
||||
String query;
|
||||
if (getNames) {
|
||||
query =
|
||||
"""
|
||||
SELECT EC_DOMAIN.ID,DOMAIN_NAME,DOMAIN_ALIAS,STATE,COALESCE(KNOWN_URLS, 0)
|
||||
FROM EC_DOMAIN
|
||||
INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
|
||||
GROUP BY EC_DOMAIN.ID
|
||||
""";
|
||||
}
|
||||
else {
|
||||
query =
|
||||
"""
|
||||
SELECT EC_DOMAIN.ID,"",DOMAIN_ALIAS,STATE,COALESCE(KNOWN_URLS, 0)
|
||||
FROM EC_DOMAIN
|
||||
INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
|
||||
GROUP BY EC_DOMAIN.ID
|
||||
""";
|
||||
}
|
||||
|
||||
getDomains(query, consumer);
|
||||
}
|
||||
|
||||
|
||||
public void getPeripheralDomains(Consumer<RankingDomainData> consumer) {
|
||||
// This is not relevant for this variant of pagerank since it is bidirectional
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,65 @@
|
||||
package nu.marginalia.ranking.data;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import lombok.SneakyThrows;
|
||||
import org.jgrapht.Graph;
|
||||
import org.jgrapht.graph.DefaultUndirectedWeightedGraph;
|
||||
import org.jgrapht.graph.DefaultWeightedEdge;
|
||||
|
||||
import java.sql.SQLException;
|
||||
|
||||
/** A source for the similarity graph, stored in EC_DOMAIN_NEIGHBORS_2,
|
||||
* which contains the cosine similarity of the incident link vectors in the link graph.
|
||||
* */
|
||||
public class SimilarityGraphSource extends AbstractGraphSource {
|
||||
@Inject
|
||||
public SimilarityGraphSource(HikariDataSource dataSource) {
|
||||
super(dataSource);
|
||||
}
|
||||
|
||||
/** Check if the data source is available. */
|
||||
public boolean isAvailable() {
|
||||
try (var conn = dataSource.getConnection();
|
||||
var stmt = conn.prepareStatement("""
|
||||
SELECT *
|
||||
FROM EC_DOMAIN_NEIGHBORS_2
|
||||
LIMIT 1
|
||||
""");
|
||||
var rs = stmt.executeQuery())
|
||||
{
|
||||
return rs.next();
|
||||
}
|
||||
catch (SQLException ex) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
@Override
|
||||
public Graph<Integer, ?> getGraph() {
|
||||
Graph<Integer, ?> graph = new DefaultUndirectedWeightedGraph<>(DefaultWeightedEdge.class);
|
||||
|
||||
addVertices(graph);
|
||||
|
||||
try (var conn = dataSource.getConnection()) {
|
||||
try (var stmt = conn.prepareStatement("""
|
||||
SELECT DOMAIN_ID, NEIGHBOR_ID, RELATEDNESS
|
||||
FROM EC_DOMAIN_NEIGHBORS_2
|
||||
"""))
|
||||
{
|
||||
var rs = stmt.executeQuery();
|
||||
while (rs.next()) {
|
||||
int src = rs.getInt(1);
|
||||
int dest = rs.getInt(2);
|
||||
double weight = rs.getDouble(3);
|
||||
|
||||
graph.addEdge(src, dest);
|
||||
graph.setEdgeWeight(src, dest, weight);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return graph;
|
||||
}
|
||||
}
|
@ -0,0 +1,375 @@
|
||||
package nu.marginalia.ranking.jgrapht;
|
||||
|
||||
/*
|
||||
* (C) Copyright 2016-2023, by Dimitrios Michail and Contributors.
|
||||
*
|
||||
*
|
||||
* JGraphT : a free Java graph-theory library
|
||||
*
|
||||
* See the CONTRIBUTORS.md file distributed with this work for additional
|
||||
* information regarding copyright ownership.
|
||||
*
|
||||
* This program and the accompanying materials are made available under the
|
||||
* terms of the Eclipse Public License 2.0 which is available at
|
||||
* http://www.eclipse.org/legal/epl-2.0, or the
|
||||
* GNU Lesser General Public License v2.1 or later
|
||||
* which is available at
|
||||
* http://www.gnu.org/licenses/old-licenses/lgpl-2.1-standalone.html.
|
||||
*
|
||||
* SPDX-License-Identifier: EPL-2.0 OR LGPL-2.1-or-later
|
||||
*/
|
||||
|
||||
/* (modified by @vlofgren to add personalization) */
|
||||
|
||||
import org.jgrapht.*;
|
||||
import org.jgrapht.alg.interfaces.*;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
 * Personalized PageRank vertex scoring, adapted from JGraphT's PageRank
 * implementation.  Instead of teleporting uniformly to all vertices, the random
 * surfer teleports only to the vertices in the supplied influence set, biasing
 * scores toward that set's graph neighborhood.  After convergence the
 * teleportation contribution is subtracted back out of the influence vertices'
 * scores.
 *
 * <p>Scores are computed lazily on the first call to {@link #getScores()} and
 * cached; the class is not thread-safe during that first computation.
 *
 * @param <V> the graph vertex type
 * @param <E> the graph edge type
 */
public class PersonalizedPageRank<V, E>
    implements VertexScoringAlgorithm<V, Double>
{
    /**
     * Default number of maximum iterations.
     */
    public static final int MAX_ITERATIONS_DEFAULT = 100;

    /**
     * Default value for the tolerance. The calculation will stop if the difference of PageRank
     * values between iterations change less than this value.
     */
    public static final double TOLERANCE_DEFAULT = 0.0001;

    /**
     * Damping factor default value.
     */
    public static final double DAMPING_FACTOR_DEFAULT = 0.85d;

    /**
     * The input graph
     */
    private final Graph<V, E> graph;
    // The set of vertices the random surfer teleports to; this is what
    // personalizes the ranking.  NOTE(review): assumed non-empty -- see teleProp().
    private final Collection<V> influenceSet;

    /**
     * The damping factor
     */
    private final double dampingFactor;

    /**
     * Maximum iterations to run
     */
    private final int maxIterations;

    /**
     * The calculation will stop if the difference of PageRank values between iterations change less
     * than this value
     */
    private final double tolerance;

    /**
     * The result (lazily computed, then cached as an unmodifiable map)
     */
    private Map<V, Double> scores;

    /**
     * Create and execute an instance of Personalized PageRank.
     *
     * @param graph the input graph
     * @param influenceSet the set of vertices to personalize the Personalized PageRank calculation
     */
    public PersonalizedPageRank(Graph<V, E> graph, Collection<V> influenceSet)
    {
        this(graph, influenceSet, DAMPING_FACTOR_DEFAULT, MAX_ITERATIONS_DEFAULT, TOLERANCE_DEFAULT);
    }

    /**
     * Create and execute an instance of Personalized PageRank.
     *
     * @param graph the input graph
     * @param influenceSet the set of vertices to personalize the Personalized PageRank calculation
     * @param dampingFactor the damping factor
     */
    public PersonalizedPageRank(Graph<V, E> graph, Collection<V> influenceSet, double dampingFactor)
    {
        this(graph, influenceSet, dampingFactor, MAX_ITERATIONS_DEFAULT, TOLERANCE_DEFAULT);
    }

    /**
     * Create and execute an instance of Personalized PageRank.
     *
     * @param graph the input graph
     * @param influenceSet the set of vertices to personalize the Personalized PageRank calculation
     * @param dampingFactor the damping factor
     * @param maxIterations the maximum number of iterations to perform
     */
    public PersonalizedPageRank(Graph<V, E> graph, Collection<V> influenceSet, double dampingFactor, int maxIterations)
    {
        this(graph, influenceSet, dampingFactor, maxIterations, TOLERANCE_DEFAULT);
    }

    /**
     * Create and execute an instance of Personalized PageRank.
     *
     * @param graph the input graph
     * @param influenceSet the set of vertices to personalize the Personalized PageRank calculation
     * @param dampingFactor the damping factor
     * @param maxIterations the maximum number of iterations to perform
     * @param tolerance the calculation will stop if the difference of Personalized PageRank values between
     *        iterations change less than this value
     * @throws IllegalArgumentException if maxIterations is not positive, dampingFactor is not
     *         within [0,1], or tolerance is not positive
     */
    public PersonalizedPageRank(Graph<V, E> graph, Collection<V> influenceSet, double dampingFactor, int maxIterations, double tolerance)
    {
        this.graph = graph;
        this.influenceSet = influenceSet;

        if (maxIterations <= 0) {
            throw new IllegalArgumentException("Maximum iterations must be positive");
        }
        this.maxIterations = maxIterations;

        if (dampingFactor < 0.0 || dampingFactor > 1.0) {
            throw new IllegalArgumentException("Damping factor not valid");
        }
        this.dampingFactor = dampingFactor;

        if (tolerance <= 0.0) {
            throw new IllegalArgumentException("Tolerance not valid, must be positive");
        }
        this.tolerance = tolerance;
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public Map<V, Double> getScores()
    {
        // Lazily compute on first access; Algorithm holds all working memory so
        // it can be garbage-collected once the result map is extracted.
        if (scores == null) {
            scores = Collections.unmodifiableMap(new Algorithm().getScores());
        }
        return scores;
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public Double getVertexScore(V v)
    {
        if (!graph.containsVertex(v)) {
            throw new IllegalArgumentException("Cannot return score of unknown vertex");
        }
        return getScores().get(v);
    }

    /**
     * The actual implementation.
     *
     * <p>
     * We use this pattern with the inner class in order to be able to cache the result but also
     * allow the garbage collector to acquire all auxiliary memory used during the execution of the
     * algorithm.
     *
     * @author Dimitrios Michail
     */
    private class Algorithm
    {
        private int totalVertices;
        private boolean isWeighted;

        // Vertices are mapped to dense indices [0, totalVertices) so all state
        // can live in flat arrays.
        private Map<V, Integer> vertexIndexMap;
        private V[] vertexMap;

        private double[] weightSum;   // per-vertex sum of incoming edge weights (weighted mode only)
        private double[] curScore;    // scores from the previous iteration
        private double[] nextScore;   // scores being computed this iteration
        private int[] outDegree;
        private ArrayList<int[]> adjList;       // per-vertex incoming neighbor indices
        private ArrayList<double[]> weightsList; // per-vertex incoming edge weights (weighted mode only)
        private BitSet influenceIndexSet;       // indices of vertices in the influence set
        @SuppressWarnings("unchecked")
        public Algorithm()
        {
            this.totalVertices = graph.vertexSet().size();
            this.isWeighted = graph.getType().isWeighted();

            /*
             * Initialize score, map vertices to [0,n) and pre-compute degrees and adjacency lists
             */
            this.curScore = new double[totalVertices];
            this.nextScore = new double[totalVertices];
            this.vertexIndexMap = new HashMap<>();
            this.vertexMap = (V[]) new Object[totalVertices];
            this.outDegree = new int[totalVertices];
            this.adjList = new ArrayList<>(totalVertices);
            this.influenceIndexSet = new BitSet(totalVertices);

            // Initial score is uniform over all vertices
            double initScore = 1.0d / totalVertices;
            int i = 0;
            for (V v : graph.vertexSet()) {
                vertexIndexMap.put(v, i);
                vertexMap[i] = v;
                outDegree[i] = graph.outDegreeOf(v);
                curScore[i] = initScore;

                if (influenceSet.contains(v)) {
                    influenceIndexSet.set(i);
                }

                i++;
            }

            if (isWeighted) {
                this.weightSum = new double[totalVertices];
                this.weightsList = new ArrayList<>(totalVertices);

                for (i = 0; i < totalVertices; i++) {
                    V v = vertexMap[i];
                    int[] inNeighbors = new int[graph.inDegreeOf(v)];
                    double[] edgeWeights = new double[graph.inDegreeOf(v)];

                    int j = 0;
                    for (E e : graph.incomingEdgesOf(v)) {
                        V w = Graphs.getOppositeVertex(graph, e, v);
                        Integer mappedVertexId = vertexIndexMap.get(w);
                        inNeighbors[j] = mappedVertexId;
                        double edgeWeight = graph.getEdgeWeight(e);
                        edgeWeights[j] += edgeWeight;
                        weightSum[mappedVertexId] += edgeWeight;
                        j++;
                    }
                    weightsList.add(edgeWeights);
                    adjList.add(inNeighbors);
                }
            } else {
                for (i = 0; i < totalVertices; i++) {
                    V v = vertexMap[i];
                    int[] inNeighbors = new int[graph.inDegreeOf(v)];
                    int j = 0;
                    for (E e : graph.incomingEdgesOf(v)) {
                        V w = Graphs.getOppositeVertex(graph, e, v);
                        inNeighbors[j++] = vertexIndexMap.get(w);
                    }
                    adjList.add(inNeighbors);
                }
            }
        }

        public Map<V, Double> getScores()
        {
            // compute
            if (isWeighted) {
                runWeighted();
            } else {
                run();
            }

            // make results user friendly
            Map<V, Double> scores = new HashMap<>();
            for (int i = 0; i < totalVertices; i++) {
                V v = vertexMap[i];
                scores.put(v, curScore[i]);
            }
            return scores;
        }

        // Power iteration for unweighted graphs: each vertex's next score is the
        // personalized teleport term plus the damped sum of its in-neighbors'
        // scores divided by their out-degrees.
        private void run()
        {
            double maxChange = tolerance;
            int iterations = maxIterations;

            while (iterations > 0 && maxChange >= tolerance) {
                double r = teleProp();

                maxChange = 0d;
                for (int i = 0; i < totalVertices; i++) {
                    double contribution = 0d;
                    for (int w : adjList.get(i)) {
                        contribution += dampingFactor * curScore[w] / outDegree[w];
                    }

                    double vOldValue = curScore[i];
                    // Only influence-set vertices receive the teleportation mass
                    double vNewValue = (influenceIndexSet.get(i) ? r : 0) + contribution;
                    maxChange = Math.max(maxChange, Math.abs(vNewValue - vOldValue));
                    nextScore[i] = vNewValue;
                }

                // progress
                swapScores();
                iterations--;
            }

            // remove influence factor from the scores
            double r = teleProp();
            for (int i = 0; i < totalVertices; i++) {
                curScore[i] -= (influenceIndexSet.get(i) ? r : 0);
            }
        }

        // Same power iteration as run(), but in-neighbor contributions are
        // scaled by edge weight relative to the neighbor's total outgoing weight.
        private void runWeighted()
        {
            double maxChange = tolerance;
            int iterations = maxIterations;

            while (iterations > 0 && maxChange >= tolerance) {
                double r = teleProp();

                maxChange = 0d;
                for (int i = 0; i < totalVertices; i++) {
                    double contribution = 0d;

                    int[] neighbors = adjList.get(i);
                    double[] weights = weightsList.get(i);
                    for (int j = 0, getLength = neighbors.length; j < getLength; j++) {
                        int w = neighbors[j];
                        contribution += dampingFactor * curScore[w] * weights[j] / weightSum[w];
                    }

                    double vOldValue = curScore[i];
                    double vNewValue = (influenceIndexSet.get(i) ? r : 0) + contribution;
                    maxChange = Math.max(maxChange, Math.abs(vNewValue - vOldValue));
                    nextScore[i] = vNewValue;
                }

                // progress
                swapScores();
                iterations--;
            }

            // remove influence factor from the scores
            double r = teleProp();
            for (int i = 0; i < totalVertices; i++) {
                curScore[i] -= (influenceIndexSet.get(i) ? r : 0);
            }
        }

        // This is the teleportation part of the algorithm, and also what is modified to personalize the PageRank
        // NOTE(review): if influenceSet is empty this divides 0.0 by 0, yielding
        // NaN scores -- callers are assumed to pass a non-empty set; confirm.
        private double teleProp()
        {
            double r = 0d;
            for (int v = influenceIndexSet.nextSetBit(0);
                 v >= 0;
                 v = influenceIndexSet.nextSetBit(v + 1))
            {
                // Dangling influence vertices keep their own score instead of
                // leaking teleportation mass
                if (outDegree[v] > 0)
                    r += (1d - dampingFactor);
                else
                    r += curScore[v];
            }
            return r / influenceSet.size();
        }

        // Swap the current/next score buffers to advance one iteration without
        // allocating
        private void swapScores()
        {
            double[] tmp = curScore;
            curScore = nextScore;
            nextScore = tmp;
        }

    }

}
|
@ -0,0 +1,78 @@
|
||||
package nu.marginalia.ranking;
|
||||
|
||||
import nu.marginalia.ranking.accumulator.RankingResultListAccumulator;
|
||||
import org.junit.jupiter.api.Disabled;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
// Test the ranking algorithm with prod data. Will not run if the data is not available.
// It's not feasible to include the data in the git repo, as it's ~6 GB of data.
@Disabled
class RankingAlgorithmWithRealDataTest {

    /** Plain PageRank over the exported link graph; prints the top 10 domains. */
    @Test
    public void testRegularPR() {
        if (!TestGraphSourceForLinkData.isAvailable()) {
            return; // exported data not present on this machine
        }

        var graphSource = new TestGraphSourceForLinkData();
        var results = new PageRankDomainRanker(graphSource, List.of())
                .calculate(10, RankingResultListAccumulator::new);

        for (int i = 0; i < results.size(); i++) {
            System.out.println(i + " " + graphSource.getName(results.get(i)));
        }
    }

    /** PageRank over the link graph with all edges reversed. */
    @Test
    public void testInvertedLinkGraph() {
        if (!TestGraphSourceForInvertedLinkData.isAvailable()) {
            return; // exported data not present on this machine
        }

        var graphSource = new TestGraphSourceForInvertedLinkData();
        var results = new PageRankDomainRanker(graphSource, List.of())
                .calculate(10, RankingResultListAccumulator::new);

        for (int i = 0; i < results.size(); i++) {
            System.out.println(i + " " + graphSource.getName(results.get(i)));
        }
    }

    /** PageRank over the domain-similarity (cosine relatedness) graph. */
    @Test
    public void testSimilarityPR() {
        if (!TestGraphSourceForSimilarityData.isAvailable()) {
            return; // exported data not present on this machine
        }

        var graphSource = new TestGraphSourceForSimilarityData();
        var results = new PageRankDomainRanker(graphSource, List.of())
                .calculate(10, RankingResultListAccumulator::new);

        for (int i = 0; i < results.size(); i++) {
            System.out.println(i + " " + graphSource.getName(results.get(i)));
        }
    }

    /** Personalized PageRank over the similarity graph, seeded with one domain. */
    @Test
    public void testSimilarityPPR() {
        if (!TestGraphSourceForSimilarityData.isAvailable()) {
            return; // exported data not present on this machine
        }

        var graphSource = new TestGraphSourceForSimilarityData();
        var results = new PageRankDomainRanker(graphSource,
                List.of(1476552) // wiby.me
        )
                .calculate(10, RankingResultListAccumulator::new);

        for (int i = 0; i < results.size(); i++) {
            System.out.println(i + " " + graphSource.getName(results.get(i)));
        }
    }

}
|
@ -0,0 +1,161 @@
|
||||
package nu.marginalia.ranking;
|
||||
|
||||
|
||||
import com.zaxxer.hikari.HikariConfig;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import nu.marginalia.query.client.QueryClient;
|
||||
import nu.marginalia.ranking.data.InvertedLinkGraphSource;
|
||||
import nu.marginalia.ranking.data.LinkGraphSource;
|
||||
import nu.marginalia.ranking.data.SimilarityGraphSource;
|
||||
import nu.marginalia.test.TestMigrationLoader;
|
||||
import org.jgrapht.Graph;
|
||||
import org.jgrapht.graph.DefaultWeightedEdge;
|
||||
import org.junit.jupiter.api.*;
|
||||
import org.junit.jupiter.api.parallel.Execution;
|
||||
import org.mockito.Mockito;
|
||||
import org.testcontainers.containers.MariaDBContainer;
|
||||
import org.testcontainers.junit.jupiter.Container;
|
||||
import org.testcontainers.junit.jupiter.Testcontainers;
|
||||
|
||||
import java.sql.SQLException;
|
||||
import java.util.List;
|
||||
|
||||
import static org.junit.Assert.assertEquals;
|
||||
import static org.junit.jupiter.api.parallel.ExecutionMode.SAME_THREAD;
|
||||
import static org.mockito.Mockito.when;
|
||||
|
||||
/** Integration tests for the graph sources, run against a throwaway MariaDB
 *  testcontainer migrated with the production schema.  Domain links come from
 *  a mocked QueryClient; similarity data is inserted directly into
 *  EC_DOMAIN_NEIGHBORS_2. */
@Tag("slow")
@Testcontainers
@Execution(SAME_THREAD)
public class RankingAlgorithmsContainerTest {
    @Container
    static MariaDBContainer<?> mariaDBContainer = new MariaDBContainer<>("mariadb")
            .withDatabaseName("WMSA_prod")
            .withUsername("wmsa")
            .withPassword("wmsa")
            .withNetworkAliases("mariadb");

    static HikariDataSource dataSource;

    QueryClient queryClient;
    QueryClient.AllLinks allLinks;

    /** Migrate the container database and seed four test domains (ids 1..4). */
    @BeforeAll
    public static void setup() {
        HikariConfig config = new HikariConfig();
        config.setJdbcUrl(mariaDBContainer.getJdbcUrl());
        config.setUsername("wmsa");
        config.setPassword("wmsa");

        dataSource = new HikariDataSource(config);
        TestMigrationLoader.flywayMigration(dataSource);

        try (var conn = dataSource.getConnection();
             var stmt = conn.createStatement()) {
            stmt.executeUpdate("""
                    INSERT INTO EC_DOMAIN(DOMAIN_NAME, DOMAIN_TOP, NODE_AFFINITY)
                    VALUES ('memex.marginalia.nu', 'marginalia.nu', 1),
                            ('search.marginalia.nu', 'marginalia.nu', 1),
                            ('encyclopedia.marginalia.nu', 'marginalia.nu', 1),
                            ('marginalia.nu', 'marginalia.nu', 1);
                    """);
        } catch (SQLException e) {
            throw new RuntimeException(e);
        }

    }

    /** Fresh mock link set and empty similarity table before each test. */
    @BeforeEach
    public void setupQueryClient() {
        queryClient = Mockito.mock(QueryClient.class);
        allLinks = new QueryClient.AllLinks();
        when(queryClient.getAllDomainLinks()).thenReturn(allLinks);

        try (var conn = dataSource.getConnection();
             var stmt = conn.createStatement()) {
            stmt.executeUpdate("TRUNCATE TABLE EC_DOMAIN_NEIGHBORS_2");
        } catch (SQLException e) {
            throw new RuntimeException(e);
        }
    }

    /** Insert one similarity row (source, dest, relatedness) for the current test. */
    private void addSimilarity(int source, int dest, double similarity) {
        try (var conn = dataSource.getConnection();
             var stmt = conn.prepareStatement("""
                     INSERT INTO EC_DOMAIN_NEIGHBORS_2(DOMAIN_ID, NEIGHBOR_ID, RELATEDNESS)
                     VALUES (?, ?, ?)
                     """)) {
            stmt.setInt(1, source);
            stmt.setInt(2, dest);
            stmt.setDouble(3, similarity);
            stmt.executeUpdate();

        } catch (SQLException e) {
            throw new RuntimeException(e);
        }
    }

    @Test
    public void testGetDomains() {
        // should all be the same, doesn't matter which one we use
        var source = new LinkGraphSource(dataSource, queryClient);

        Assertions.assertEquals(List.of(1),
                source.domainIds(List.of("memex.marginalia.nu")));

        // Verify globbing
        Assertions.assertEquals(List.of(1,2,3),
                source.domainIds(List.of("%.marginalia.nu")));
    }

    /** Link graph edges keep the original link direction. */
    @Test
    public void testLinkGraphSource() {
        allLinks.add(1, 3);

        var graph = new LinkGraphSource(dataSource, queryClient).getGraph();

        Assertions.assertTrue(graph.containsVertex(1));
        Assertions.assertTrue(graph.containsVertex(2));
        Assertions.assertTrue(graph.containsVertex(3));

        Assertions.assertTrue(graph.containsEdge(1, 3));

        Assertions.assertFalse(graph.containsEdge(3, 1));
        Assertions.assertFalse(graph.containsEdge(2, 3));
        Assertions.assertFalse(graph.containsEdge(3, 2));
    }
    /** Inverted link graph flips each link's direction. */
    @Test
    public void testInvertedLinkGraphSource() {
        allLinks.add(1, 3);

        var graph = new InvertedLinkGraphSource(dataSource, queryClient).getGraph();

        Assertions.assertTrue(graph.containsVertex(1));
        Assertions.assertTrue(graph.containsVertex(2));
        Assertions.assertTrue(graph.containsVertex(3));

        Assertions.assertTrue(graph.containsEdge(3, 1));

        Assertions.assertFalse(graph.containsEdge(1, 3));
        Assertions.assertFalse(graph.containsEdge(2, 3));
        Assertions.assertFalse(graph.containsEdge(3, 2));
    }
    /** Similarity graph is undirected and carries the relatedness as edge weight. */
    @Test
    @SuppressWarnings("unchecked")
    public void testSimilarityGraphSource() {

        addSimilarity(1, 3, 0.5);

        var graph = (Graph<Integer, DefaultWeightedEdge>) new SimilarityGraphSource(dataSource).getGraph();

        Assertions.assertTrue(graph.containsVertex(1));
        Assertions.assertTrue(graph.containsVertex(2));
        Assertions.assertTrue(graph.containsVertex(3));

        Assertions.assertTrue(graph.containsEdge(3, 1));
        Assertions.assertTrue(graph.containsEdge(1, 3));
        Assertions.assertEquals(graph.getEdgeWeight(graph.getEdge(1, 3)), 0.5, 0.0001);

        Assertions.assertFalse(graph.containsEdge(1, 2));
        Assertions.assertFalse(graph.containsEdge(2, 3));
    }
}
|
@ -0,0 +1,86 @@
|
||||
package nu.marginalia.ranking;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.array.LongArrayFactory;
|
||||
import nu.marginalia.ranking.data.GraphSource;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.jgrapht.Graph;
|
||||
import org.jgrapht.graph.DefaultDirectedGraph;
|
||||
import org.jgrapht.graph.DefaultEdge;
|
||||
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.Paths;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
public class TestGraphSourceForInvertedLinkData implements GraphSource {
|
||||
private static Path domainDataPath = Paths.get("/home/vlofgren/Exports/Links/domains.export.tsv");
|
||||
private static Path[] linksDataPaths = new Path[] {
|
||||
Paths.get("/home/vlofgren/Exports/Links/domain-links-1.dat"),
|
||||
Paths.get("/home/vlofgren/Exports/Links/domain-links-2.dat"),
|
||||
Paths.get("/home/vlofgren/Exports/Links/domain-links-3.dat"),
|
||||
Paths.get("/home/vlofgren/Exports/Links/domain-links-4.dat"),
|
||||
Paths.get("/home/vlofgren/Exports/Links/domain-links-5.dat"),
|
||||
Paths.get("/home/vlofgren/Exports/Links/domain-links-6.dat"),
|
||||
Paths.get("/home/vlofgren/Exports/Links/domain-links-7.dat"),
|
||||
Paths.get("/home/vlofgren/Exports/Links/domain-links-8.dat"),
|
||||
Paths.get("/home/vlofgren/Exports/Links/domain-links-9.dat")
|
||||
};
|
||||
|
||||
public List<Integer> domainIds(List<String> domainNameList) { return List.of(); }
|
||||
|
||||
static boolean isAvailable() {
|
||||
return Files.exists(domainDataPath) && Files.exists(linksDataPaths[0]);
|
||||
}
|
||||
|
||||
private Map<Integer, String> idToName = new HashMap<>();
|
||||
|
||||
public String getName(int id) {
|
||||
return idToName.get(id);
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
@Override
|
||||
public Graph<Integer, ?> getGraph() {
|
||||
Graph<Integer, ?> graph = new DefaultDirectedGraph<>(DefaultEdge.class);
|
||||
idToName = new HashMap<>();
|
||||
|
||||
try (var stream = Files
|
||||
.lines(domainDataPath)) {
|
||||
|
||||
stream.skip(1)
|
||||
.mapMultiToInt((line, c) -> {
|
||||
String[] parts = StringUtils.split(line, '\t');
|
||||
int id = Integer.parseInt(parts[0]);
|
||||
String name = parts[1];
|
||||
int node_affinity = Integer.parseInt(parts[3]);
|
||||
if (node_affinity > 0) {
|
||||
c.accept(id);
|
||||
idToName.put(id, parts[1]);
|
||||
}
|
||||
})
|
||||
.forEach(graph::addVertex);
|
||||
}
|
||||
|
||||
for (var path : linksDataPaths) {
|
||||
try (var data = LongArrayFactory.mmapForReadingConfined(path)) {
|
||||
data.forEach(0, data.size(), (pos, val) -> {
|
||||
|
||||
val = Long.reverseBytes(val); // data is in "java endian", LongArray is in "C endian"
|
||||
|
||||
int src = (int) (val >>> 32);
|
||||
int dest = (int) (val & 0xFFFF_FFFFL);
|
||||
|
||||
if (graph.containsVertex(src) && graph.containsVertex(dest)) {
|
||||
graph.addEdge(dest, src);
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
return graph;
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,86 @@
|
||||
package nu.marginalia.ranking;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.array.LongArrayFactory;
|
||||
import nu.marginalia.ranking.data.GraphSource;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.jgrapht.Graph;
|
||||
import org.jgrapht.graph.DefaultDirectedGraph;
|
||||
import org.jgrapht.graph.DefaultEdge;
|
||||
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.Paths;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
public class TestGraphSourceForLinkData implements GraphSource {
|
||||
private static Path domainDataPath = Paths.get("/home/vlofgren/Exports/Links/domains.export.tsv");
|
||||
private static Path[] linksDataPaths = new Path[] {
|
||||
Paths.get("/home/vlofgren/Exports/Links/domain-links-1.dat"),
|
||||
Paths.get("/home/vlofgren/Exports/Links/domain-links-2.dat"),
|
||||
Paths.get("/home/vlofgren/Exports/Links/domain-links-3.dat"),
|
||||
Paths.get("/home/vlofgren/Exports/Links/domain-links-4.dat"),
|
||||
Paths.get("/home/vlofgren/Exports/Links/domain-links-5.dat"),
|
||||
Paths.get("/home/vlofgren/Exports/Links/domain-links-6.dat"),
|
||||
Paths.get("/home/vlofgren/Exports/Links/domain-links-7.dat"),
|
||||
Paths.get("/home/vlofgren/Exports/Links/domain-links-8.dat"),
|
||||
Paths.get("/home/vlofgren/Exports/Links/domain-links-9.dat")
|
||||
};
|
||||
|
||||
public List<Integer> domainIds(List<String> domainNameList) { return List.of(); }
|
||||
|
||||
static boolean isAvailable() {
|
||||
return Files.exists(domainDataPath) && Files.exists(linksDataPaths[0]);
|
||||
}
|
||||
|
||||
private Map<Integer, String> idToName = new HashMap<>();
|
||||
|
||||
public String getName(int id) {
|
||||
return idToName.get(id);
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
@Override
|
||||
public Graph<Integer, ?> getGraph() {
|
||||
Graph<Integer, ?> graph = new DefaultDirectedGraph<>(DefaultEdge.class);
|
||||
idToName = new HashMap<>();
|
||||
|
||||
try (var stream = Files
|
||||
.lines(domainDataPath)) {
|
||||
|
||||
stream.skip(1)
|
||||
.mapMultiToInt((line, c) -> {
|
||||
String[] parts = StringUtils.split(line, '\t');
|
||||
int id = Integer.parseInt(parts[0]);
|
||||
String name = parts[1];
|
||||
int node_affinity = Integer.parseInt(parts[3]);
|
||||
if (node_affinity > 0) {
|
||||
c.accept(id);
|
||||
idToName.put(id, parts[1]);
|
||||
}
|
||||
})
|
||||
.forEach(graph::addVertex);
|
||||
}
|
||||
|
||||
for (var path : linksDataPaths) {
|
||||
try (var data = LongArrayFactory.mmapForReadingConfined(path)) {
|
||||
data.forEach(0, data.size(), (pos, val) -> {
|
||||
|
||||
val = Long.reverseBytes(val); // data is in "java endian", LongArray is in "C endian"
|
||||
|
||||
int src = (int) (val >>> 32);
|
||||
int dest = (int) (val & 0xFFFF_FFFFL);
|
||||
|
||||
if (graph.containsVertex(src) && graph.containsVertex(dest)) {
|
||||
graph.addEdge(src, dest);
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
return graph;
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,78 @@
|
||||
package nu.marginalia.ranking;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.array.LongArrayFactory;
|
||||
import nu.marginalia.ranking.data.GraphSource;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.jgrapht.Graph;
|
||||
import org.jgrapht.graph.DefaultDirectedGraph;
|
||||
import org.jgrapht.graph.DefaultEdge;
|
||||
import org.jgrapht.graph.DefaultUndirectedWeightedGraph;
|
||||
import org.jgrapht.graph.DefaultWeightedEdge;
|
||||
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.Paths;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
public class TestGraphSourceForSimilarityData implements GraphSource {
|
||||
private static Path domainDataPath = Paths.get("/home/vlofgren/Exports/Links/domains.export.tsv");
|
||||
private static Path similarityDataPath = Paths.get("/home/vlofgren/Exports/Links/neighbors.tsv");
|
||||
|
||||
public List<Integer> domainIds(List<String> domainNameList) { return List.of(); }
|
||||
|
||||
static boolean isAvailable() {
|
||||
return Files.exists(domainDataPath) && Files.exists(similarityDataPath);
|
||||
}
|
||||
|
||||
private Map<Integer, String> idToName = new HashMap<>();
|
||||
|
||||
public String getName(int id) {
|
||||
return idToName.get(id);
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
@Override
|
||||
public Graph<Integer, ?> getGraph() {
|
||||
Graph<Integer, ?> graph = new DefaultUndirectedWeightedGraph<>(DefaultWeightedEdge.class);
|
||||
idToName = new HashMap<>();
|
||||
|
||||
try (var stream = Files
|
||||
.lines(domainDataPath)) {
|
||||
|
||||
stream.skip(1)
|
||||
.mapMultiToInt((line, c) -> {
|
||||
String[] parts = StringUtils.split(line, '\t');
|
||||
int id = Integer.parseInt(parts[0]);
|
||||
String name = parts[1];
|
||||
int node_affinity = Integer.parseInt(parts[3]);
|
||||
if (node_affinity > 0) {
|
||||
c.accept(id);
|
||||
idToName.put(id, name);
|
||||
}
|
||||
})
|
||||
.forEach(graph::addVertex);
|
||||
}
|
||||
|
||||
try (var stream = Files
|
||||
.lines(similarityDataPath)) {
|
||||
|
||||
stream.skip(1)
|
||||
.forEach(line -> {
|
||||
String[] parts = StringUtils.split(line, '\t');
|
||||
int src = Integer.parseInt(parts[0]);
|
||||
int dest = Integer.parseInt(parts[1]);
|
||||
double weight = Double.parseDouble(parts[2]);
|
||||
if (graph.containsVertex(src) && graph.containsVertex(dest)) {
|
||||
graph.addEdge(src, dest);
|
||||
graph.setEdgeWeight(src, dest, weight);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
return graph;
|
||||
}
|
||||
|
||||
}
|
@ -15,15 +15,12 @@ import java.sql.SQLException;
|
||||
import java.util.Map;
|
||||
|
||||
public class ControlDomainRankingSetsService {
|
||||
private final HikariDataSource dataSource;
|
||||
private final ControlRendererFactory rendererFactory;
|
||||
private final DomainRankingSetsService domainRankingSetsService;
|
||||
|
||||
@Inject
|
||||
public ControlDomainRankingSetsService(HikariDataSource dataSource,
|
||||
ControlRendererFactory rendererFactory,
|
||||
public ControlDomainRankingSetsService(ControlRendererFactory rendererFactory,
|
||||
DomainRankingSetsService domainRankingSetsService) {
|
||||
this.dataSource = dataSource;
|
||||
this.rendererFactory = rendererFactory;
|
||||
this.domainRankingSetsService = domainRankingSetsService;
|
||||
}
|
||||
@ -47,7 +44,6 @@ public class ControlDomainRankingSetsService {
|
||||
domainRankingSetsService.upsert(new DomainRankingSetsService.DomainRankingSet(
|
||||
id,
|
||||
request.queryParams("description"),
|
||||
DomainRankingSetsService.DomainSetAlgorithm.valueOf(request.queryParams("algorithm")),
|
||||
Integer.parseInt(request.queryParams("depth")),
|
||||
request.queryParams("definition")
|
||||
));
|
||||
@ -77,7 +73,6 @@ public class ControlDomainRankingSetsService {
|
||||
domainRankingSetsService.upsert(new DomainRankingSetsService.DomainRankingSet(
|
||||
request.queryParams("name").toUpperCase(),
|
||||
request.queryParams("description"),
|
||||
DomainRankingSetsService.DomainSetAlgorithm.valueOf(request.queryParams("algorithm")),
|
||||
Integer.parseInt(request.queryParams("depth")),
|
||||
request.queryParams("definition")
|
||||
));
|
||||
@ -95,17 +90,6 @@ public class ControlDomainRankingSetsService {
|
||||
}
|
||||
private Object rankingSetModel(Request request, Response response) throws SQLException {
|
||||
var model = domainRankingSetsService.get(request.params("id")).orElseThrow();
|
||||
return Map.of("rankingSet", model,
|
||||
"selectedAlgo", Map.of(
|
||||
"special", model.algorithm() == DomainRankingSetsService.DomainSetAlgorithm.SPECIAL,
|
||||
"adjacency_cheirank", model.algorithm() == DomainRankingSetsService.DomainSetAlgorithm.ADJACENCY_CHEIRANK,
|
||||
"adjacency_pagerank", model.algorithm() == DomainRankingSetsService.DomainSetAlgorithm.ADJACENCY_PAGERANK,
|
||||
"links_cheirank", model.algorithm() == DomainRankingSetsService.DomainSetAlgorithm.LINKS_CHEIRANK,
|
||||
"links_pagerank", model.algorithm() == DomainRankingSetsService.DomainSetAlgorithm.LINKS_PAGERANK)
|
||||
);
|
||||
|
||||
|
||||
|
||||
|
||||
return Map.of("rankingSet", model);
|
||||
}
|
||||
}
|
||||
|
@ -16,14 +16,12 @@
|
||||
<tr>
|
||||
<th>Name</th>
|
||||
<th>Description</th>
|
||||
<th>Algorithm</th>
|
||||
<th>Depth</th>
|
||||
</tr>
|
||||
{{#each rankingSets}}
|
||||
<tr>
|
||||
<td><a href="/domain-ranking-sets/{{name}}">{{name}}</td></td>
|
||||
<td>{{description}}</td>
|
||||
<td>{{algorithm}}</td>
|
||||
<td>{{depth}}</td>
|
||||
</tr>
|
||||
{{/each}}
|
||||
|
@ -21,23 +21,6 @@
|
||||
</div>
|
||||
</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<th><label for="algorithm">Algorithm</label></th>
|
||||
<td>
|
||||
<select id="algorithm" name="algorithm">
|
||||
<option value="LINKS_PAGERANK">LINKS_PAGERANK</option>
|
||||
<option value="LINKS_CHEIRANK">LINKS_CHEIRANK</option>
|
||||
<option value="ADJACENCY_PAGERANK">ADJACENCY_PAGERANK</option>
|
||||
<option value="ADJACENCY_CHEIRANK">ADJACENCY_CHEIRANK</option>
|
||||
</select>
|
||||
<div>
|
||||
<small class="text-muted">
|
||||
The algorithm used to rank the domains. The LINKS algorithms use the link graph, and the ADJACENCY
|
||||
algorithms use the adjacency graph. CheiRank is a variant of PageRank that uses the reversed graph.
|
||||
</small>
|
||||
</div>
|
||||
</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<th><label for="description">Description</label></th>
|
||||
<td>
|
||||
@ -61,8 +44,12 @@
|
||||
<textarea name="definition" id="definition" rows="10" style="width: 100%">{{definition}}</textarea>
|
||||
<div>
|
||||
<small class="text-muted">A list of domain names, one per line, possibly globbed with SQL-style '%' wildcards.
|
||||
These are used as the origin point for the Personalized PageRank algorithm, and will be considered
|
||||
the central points of the link or adjacency graph. If no domains are specified, the entire domain space is used, as per the PageRank paper.
|
||||
If provided, these are used as the origin point for the Personalized PageRank algorithm, and will be considered
|
||||
the central points of the link or adjacency graph. If no domains are specified, the entire domain space is used,
|
||||
as per the PageRank paper.
|
||||
<br><br>
|
||||
If similarity data is available and domains are specified, the similarity data is used as basis for the ranking
|
||||
calculation instead, providing a much more coherent ranking.
|
||||
</small>
|
||||
</div>
|
||||
</td></tr>
|
||||
|
@ -22,27 +22,6 @@
|
||||
</div>
|
||||
</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<th><label for="algorithm">Algorithm</label></th>
|
||||
<td>
|
||||
{{#if special}}<input type="hidden" name="algorithm" value="{{algorithm}}" />{{/if}}
|
||||
<select id="algorithm" name="algorithm" {{#if special}}disabled{{/if}}>
|
||||
{{#with algorithm}}
|
||||
<option value="SPECIAL" disabled {{#if selectedAlgo.special}}selected{{/if}}>SPECIAL</option>
|
||||
<option value="LINKS_PAGERANK" {{#if selectedAlgo.links_pagerank}}selected{{/if}}>LINKS_PAGERANK</option>
|
||||
<option value="LINKS_CHEIRANK" {{#if selectedAlgo.links_cheirank}}selected{{/if}}>LINKS_CHEIRANK</option>
|
||||
<option value="ADJACENCY_PAGERANK" {{#if selectedAlgo.adjacency_pagerank}}selected{{/if}}>ADJACENCY_PAGERANK</option>
|
||||
<option value="ADJACENCY_CHEIRANK" {{#if selectedAlgo.adjacency_cheirank}}selected{{/if}}>ADJACENCY_CHEIRANK</option>
|
||||
{{/with}}
|
||||
</select>
|
||||
<div>
|
||||
<small class="text-muted">
|
||||
The algorithm used to rank the domains. The LINKS algorithms use the link graph, and the ADJACENCY
|
||||
algorithms use the adjacency graph. CheiRank is a variant of PageRank that uses the reversed graph.
|
||||
</small>
|
||||
</div>
|
||||
</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<th><label for="description">Description</label></th>
|
||||
<td>
|
||||
@ -67,8 +46,12 @@
|
||||
<textarea name="definition" id="definition" rows="10" style="width: 100%">{{definition}}</textarea>
|
||||
<div>
|
||||
<small class="text-muted">A list of domain names, one per line, possibly globbed with SQL-style '%' wildcards.
|
||||
These are used as the origin point for the Personalized PageRank algorithm, and will be considered
|
||||
the central points of the link or adjacency graph. If no domains are specified, the entire domain space is used, as per the PageRank paper.
|
||||
If provided, these are used as the origin point for the Personalized PageRank algorithm, and will be considered
|
||||
the central points of the link or adjacency graph. If no domains are specified, the entire domain space is used,
|
||||
as per the PageRank paper.
|
||||
<br><br>
|
||||
If similarity data is available and domains are specified, the similarity data is used as basis for the ranking
|
||||
calculation instead, providing a much more coherent ranking.
|
||||
</small>
|
||||
</div>
|
||||
</td></tr>
|
||||
|
@ -8,25 +8,23 @@ import nu.marginalia.db.DomainRankingSetsService;
|
||||
import nu.marginalia.db.DomainTypes;
|
||||
import nu.marginalia.index.IndexServicesFactory;
|
||||
import nu.marginalia.index.searchset.SearchSet;
|
||||
import nu.marginalia.ranking.RankingAlgorithm;
|
||||
import nu.marginalia.ranking.ReversePageRank;
|
||||
import nu.marginalia.ranking.StandardPageRank;
|
||||
import nu.marginalia.ranking.*;
|
||||
import nu.marginalia.ranking.accumulator.RankingResultHashMapAccumulator;
|
||||
import nu.marginalia.ranking.accumulator.RankingResultHashSetAccumulator;
|
||||
import nu.marginalia.ranking.data.RankingDomainFetcher;
|
||||
import nu.marginalia.ranking.data.RankingDomainFetcherForSimilarityData;
|
||||
import nu.marginalia.index.svc.searchset.RankingSearchSet;
|
||||
import nu.marginalia.index.svc.searchset.SearchSetAny;
|
||||
import nu.marginalia.ranking.DomainRankings;
|
||||
import nu.marginalia.index.db.DbUpdateRanks;
|
||||
import nu.marginalia.ranking.data.GraphSource;
|
||||
import nu.marginalia.ranking.data.LinkGraphSource;
|
||||
import nu.marginalia.ranking.data.SimilarityGraphSource;
|
||||
import nu.marginalia.service.control.ServiceEventLog;
|
||||
import nu.marginalia.service.control.ServiceHeartbeat;
|
||||
import nu.marginalia.service.module.ServiceConfiguration;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.sql.SQLException;
|
||||
import java.util.List;
|
||||
import java.util.Objects;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
|
||||
@ -34,13 +32,12 @@ import java.util.concurrent.ConcurrentHashMap;
|
||||
public class IndexSearchSetsService {
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
private final DomainTypes domainTypes;
|
||||
private final ServiceHeartbeat heartbeat;
|
||||
private final IndexServicesFactory indexServicesFactory;
|
||||
private final ServiceEventLog eventLog;
|
||||
private final DomainRankingSetsService domainRankingSetsService;
|
||||
private final DbUpdateRanks dbUpdateRanks;
|
||||
private final RankingDomainFetcher similarityDomains;
|
||||
private final RankingDomainFetcher linksDomains;
|
||||
private final GraphSource similarityDomains;
|
||||
private final GraphSource linksDomains;
|
||||
|
||||
private final ConcurrentHashMap<String, SearchSet> rankingSets = new ConcurrentHashMap<>();
|
||||
// Below are binary indices that are used to constrain a search
|
||||
@ -55,23 +52,21 @@ public class IndexSearchSetsService {
|
||||
@Inject
|
||||
public IndexSearchSetsService(DomainTypes domainTypes,
|
||||
ServiceConfiguration serviceConfiguration,
|
||||
ServiceHeartbeat heartbeat,
|
||||
RankingDomainFetcher rankingDomains,
|
||||
RankingDomainFetcherForSimilarityData similarityDomains,
|
||||
LinkGraphSource rankingDomains,
|
||||
SimilarityGraphSource similarityDomains,
|
||||
IndexServicesFactory indexServicesFactory,
|
||||
ServiceEventLog eventLog,
|
||||
DomainRankingSetsService domainRankingSetsService,
|
||||
DbUpdateRanks dbUpdateRanks) throws IOException {
|
||||
this.nodeId = serviceConfiguration.node();
|
||||
this.domainTypes = domainTypes;
|
||||
this.heartbeat = heartbeat;
|
||||
this.indexServicesFactory = indexServicesFactory;
|
||||
this.eventLog = eventLog;
|
||||
this.domainRankingSetsService = domainRankingSetsService;
|
||||
|
||||
this.dbUpdateRanks = dbUpdateRanks;
|
||||
|
||||
if (similarityDomains.hasData()) {
|
||||
if (similarityDomains.isAvailable()) {
|
||||
this.similarityDomains = similarityDomains;
|
||||
this.linksDomains = rankingDomains;
|
||||
}
|
||||
@ -126,13 +121,13 @@ public class IndexSearchSetsService {
|
||||
}
|
||||
|
||||
try {
|
||||
if (DomainRankingSetsService.DomainSetAlgorithm.SPECIAL.equals(rankingSet.algorithm())) {
|
||||
if (rankingSet.isSpecial()) {
|
||||
switch (rankingSet.name()) {
|
||||
case "BLOGS" -> recalculateBlogsSet(rankingSet);
|
||||
case "NONE" -> {} // No-op
|
||||
}
|
||||
} else {
|
||||
recalculateNornal(rankingSet);
|
||||
recalculateNormal(rankingSet);
|
||||
}
|
||||
}
|
||||
catch (Exception ex) {
|
||||
@ -142,18 +137,18 @@ public class IndexSearchSetsService {
|
||||
}
|
||||
}
|
||||
|
||||
private void recalculateNornal(DomainRankingSetsService.DomainRankingSet rankingSet) {
|
||||
String[] domains = rankingSet.domains();
|
||||
private void recalculateNormal(DomainRankingSetsService.DomainRankingSet rankingSet) {
|
||||
List<String> domains = List.of(rankingSet.domains());
|
||||
|
||||
RankingAlgorithm rankingAlgorithm = switch (rankingSet.algorithm()) {
|
||||
case LINKS_PAGERANK -> new StandardPageRank(linksDomains, domains);
|
||||
case LINKS_CHEIRANK -> new ReversePageRank(linksDomains, domains);
|
||||
case ADJACENCY_PAGERANK -> new StandardPageRank(similarityDomains, domains);
|
||||
case ADJACENCY_CHEIRANK -> new ReversePageRank(similarityDomains, domains);
|
||||
default -> throw new IllegalStateException("Unexpected value: " + rankingSet.algorithm());
|
||||
};
|
||||
GraphSource source;
|
||||
|
||||
var data = rankingAlgorithm.pageRankWithPeripheralNodes(rankingSet.depth(), RankingResultHashSetAccumulator::new);
|
||||
// Similarity ranking does not behave well with an empty set of domains
|
||||
if (domains.isEmpty()) source = linksDomains;
|
||||
else source = similarityDomains;
|
||||
|
||||
var data = PageRankDomainRanker
|
||||
.forDomainNames(source, domains)
|
||||
.calculate(rankingSet.depth(), RankingResultHashSetAccumulator::new);
|
||||
|
||||
var set = new RankingSearchSet(rankingSet.name(), rankingSet.fileName(indexServicesFactory.getSearchSetsBase()), data);
|
||||
rankingSets.put(rankingSet.name(), set);
|
||||
@ -185,9 +180,21 @@ public class IndexSearchSetsService {
|
||||
}
|
||||
|
||||
private void updateDomainRankings(DomainRankingSetsService.DomainRankingSet rankingSet) {
|
||||
List<String> domains = List.of(rankingSet.domains());
|
||||
|
||||
var spr = new StandardPageRank(similarityDomains, rankingSet.domains());
|
||||
var ranks = spr.pageRankWithPeripheralNodes(rankingSet.depth(), () -> new RankingResultHashMapAccumulator(rankingSet.depth()));
|
||||
final GraphSource source;
|
||||
|
||||
if (domains.isEmpty()) {
|
||||
// Similarity ranking does not behave well with an empty set of domains
|
||||
source = linksDomains;
|
||||
}
|
||||
else {
|
||||
source = similarityDomains;
|
||||
}
|
||||
|
||||
var ranks = PageRankDomainRanker
|
||||
.forDomainNames(source, domains)
|
||||
.calculate(rankingSet.depth(), () -> new RankingResultHashMapAccumulator(rankingSet.depth()));
|
||||
|
||||
synchronized (this) {
|
||||
domainRankings = new DomainRankings(ranks);
|
||||
|
Loading…
Reference in New Issue
Block a user