(domain-ranking) Clean up domain ranking
The domain ranking code was admittedly a bit of a clown fiesta; at the same time buggy, fragile and inscrutable. Migrating over to use JGraphT to store the link graph when doing rankings, and using their PageRank implementation. Also added a modified version that does PersonalizedPageRank.
This commit is contained in:
parent
a175b36382
commit
64acdb5f2a
@ -20,6 +20,8 @@ dependencies {
|
|||||||
implementation project(':code:common:service-client')
|
implementation project(':code:common:service-client')
|
||||||
implementation project(':code:api:query-api')
|
implementation project(':code:api:query-api')
|
||||||
|
|
||||||
|
implementation 'org.jgrapht:jgrapht-core:1.5.2'
|
||||||
|
|
||||||
implementation libs.bundles.slf4j
|
implementation libs.bundles.slf4j
|
||||||
implementation libs.bundles.mariadb
|
implementation libs.bundles.mariadb
|
||||||
implementation libs.guice
|
implementation libs.guice
|
||||||
@ -27,8 +29,16 @@ dependencies {
|
|||||||
implementation libs.roaringbitmap
|
implementation libs.roaringbitmap
|
||||||
implementation libs.trove
|
implementation libs.trove
|
||||||
implementation libs.fastutil
|
implementation libs.fastutil
|
||||||
|
implementation libs.hll
|
||||||
|
|
||||||
|
testImplementation project(':code:libraries:array')
|
||||||
|
testImplementation libs.commons.lang3
|
||||||
testImplementation libs.bundles.slf4j.test
|
testImplementation libs.bundles.slf4j.test
|
||||||
testImplementation libs.bundles.junit
|
testImplementation libs.bundles.junit
|
||||||
testImplementation libs.mockito
|
testImplementation libs.mockito
|
||||||
}
|
}
|
||||||
|
|
||||||
|
test {
|
||||||
|
minHeapSize = "128m" // initial heap size
|
||||||
|
maxHeapSize = "20G" // maximum heap size
|
||||||
|
}
|
@ -1,19 +1,34 @@
|
|||||||
# Domain Ranking
|
# Domain Ranking
|
||||||
|
|
||||||
Contains domain ranking algorithms.
|
Contains domain ranking algorithms. The domain ranking algorithms are based on
|
||||||
|
the JGraphT library.
|
||||||
|
|
||||||
|
Two principal algorithms are available, the standard PageRank algorithm,
|
||||||
|
and personalized pagerank; each are available for two graphs, the link graph
|
||||||
|
and a similarity graph where each edge corresponds to the similarity between
|
||||||
|
the sets of incident links to two domains, their cosine similarity acting as
|
||||||
|
the weight of the links.
|
||||||
|
|
||||||
|
With the standard PageRank algorithm, the similarity graph does not produce
|
||||||
|
anything useful, but something magical happens when you apply Personalized PageRank
|
||||||
|
to this graph. It turns into a very good "vibe"-sensitive ranking algorithm.
|
||||||
|
|
||||||
|
It's unclear if this is a well known result, but it's a very interesting one
|
||||||
|
for creating a ranking algorithm that is focused on a particular segment of the web.
|
||||||
|
|
||||||
## Central Classes
|
## Central Classes
|
||||||
|
|
||||||
### Algorithms
|
* [PageRankDomainRanker](src/main/java/nu/marginalia/ranking/PageRankDomainRanker.java) - Ranks domains using the
|
||||||
* [RankingAlgorithm](src/main/java/nu/marginalia/ranking/RankingAlgorithm.java)
|
PageRank or Personalized PageRank algorithm depending on whether a list of influence domains is provided.
|
||||||
* [StandardPageRank](src/main/java/nu/marginalia/ranking/StandardPageRank.java)
|
|
||||||
* [ReversePageRank](src/main/java/nu/marginalia/ranking/ReversePageRank.java) "CheiRank"
|
|
||||||
|
|
||||||
### Data sources
|
### Data sources
|
||||||
|
|
||||||
* [RankingDomainFetcher](src/main/java/nu/marginalia/ranking/data/RankingDomainFetcher.java) fetches link data.
|
* [LinkGraphSource](src/main/java/nu/marginalia/ranking/data/LinkGraphSource.java) - fetches the link graph
|
||||||
* [RankingDomainFetcherForSimilarityData](src/main/java/nu/marginalia/ranking/data/RankingDomainFetcherForSimilarityData.java) fetches website similarity data.
|
* [InvertedLinkGraphSource](src/main/java/nu/marginalia/ranking/data/InvertedLinkGraphSource.java) - fetches the inverted link graph
|
||||||
|
* [SimilarityGraphSource](src/main/java/nu/marginalia/ranking/data/SimilarityGraphSource.java) - fetches the similarity graph from the database
|
||||||
|
|
||||||
|
Note that the similarity graph needs to be precomputed and stored in the database for
|
||||||
|
the similarity graph source to be available.
|
||||||
|
|
||||||
## See Also
|
## See Also
|
||||||
|
|
||||||
|
@ -0,0 +1,54 @@
|
|||||||
|
package nu.marginalia.ranking;
|
||||||
|
|
||||||
|
import gnu.trove.list.TIntList;
|
||||||
|
import gnu.trove.list.array.TIntArrayList;
|
||||||
|
import nu.marginalia.ranking.accumulator.RankingResultAccumulator;
|
||||||
|
import nu.marginalia.ranking.data.GraphSource;
|
||||||
|
import nu.marginalia.ranking.jgrapht.PersonalizedPageRank;
|
||||||
|
import org.jgrapht.Graph;
|
||||||
|
import org.jgrapht.alg.interfaces.VertexScoringAlgorithm;
|
||||||
|
import org.jgrapht.alg.scoring.PageRank;
|
||||||
|
|
||||||
|
import java.util.Comparator;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.function.Supplier;
|
||||||
|
|
||||||
|
public class PageRankDomainRanker implements RankingAlgorithm {
|
||||||
|
private final List<Integer> influenceSet;
|
||||||
|
private final Graph<Integer, ?> graph;
|
||||||
|
|
||||||
|
public PageRankDomainRanker(GraphSource source,
|
||||||
|
List<Integer> influenceSet)
|
||||||
|
{
|
||||||
|
this.influenceSet = influenceSet;
|
||||||
|
this.graph = source.getGraph();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public <T> T calculate(int resultCount, Supplier<RankingResultAccumulator<T>> accumulatorP) {
|
||||||
|
VertexScoringAlgorithm<Integer, Double> pageRank;
|
||||||
|
|
||||||
|
if (influenceSet != null && !influenceSet.isEmpty()) {
|
||||||
|
pageRank = new PersonalizedPageRank<>(graph, influenceSet);
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
pageRank = new PageRank<>(graph);
|
||||||
|
}
|
||||||
|
|
||||||
|
TIntList results = new TIntArrayList(resultCount);
|
||||||
|
pageRank.getScores().entrySet()
|
||||||
|
.stream()
|
||||||
|
.sorted(Comparator.comparing((Map.Entry<Integer, Double> e) -> -e.getValue()))
|
||||||
|
.limit(resultCount)
|
||||||
|
.map(Map.Entry::getKey)
|
||||||
|
.forEach(results::add);
|
||||||
|
|
||||||
|
var accumulator = accumulatorP.get();
|
||||||
|
for (int i = 0; i < results.size(); i++) {
|
||||||
|
accumulator.add(results.get(i), i);
|
||||||
|
}
|
||||||
|
return accumulator.get();
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
@ -1,281 +1,15 @@
|
|||||||
package nu.marginalia.ranking;
|
package nu.marginalia.ranking;
|
||||||
|
|
||||||
import gnu.trove.list.array.TIntArrayList;
|
|
||||||
import gnu.trove.map.hash.TIntIntHashMap;
|
|
||||||
import gnu.trove.map.hash.TIntObjectHashMap;
|
|
||||||
import it.unimi.dsi.fastutil.ints.IntArrays;
|
|
||||||
import nu.marginalia.ranking.accumulator.RankingResultAccumulator;
|
import nu.marginalia.ranking.accumulator.RankingResultAccumulator;
|
||||||
import nu.marginalia.ranking.data.RankingDomainFetcher;
|
|
||||||
import nu.marginalia.ranking.data.RankingDomainData;
|
|
||||||
import org.slf4j.Logger;
|
|
||||||
import org.slf4j.LoggerFactory;
|
|
||||||
|
|
||||||
import java.util.Arrays;
|
|
||||||
import java.util.HashSet;
|
|
||||||
import java.util.Set;
|
|
||||||
import java.util.function.Supplier;
|
import java.util.function.Supplier;
|
||||||
|
|
||||||
import static java.lang.Math.min;
|
public interface RankingAlgorithm {
|
||||||
|
|
||||||
public abstract class RankingAlgorithm {
|
/** Calculate domain rankings.
|
||||||
protected final TIntObjectHashMap<RankingDomainData> domainsById = new TIntObjectHashMap<>();
|
*
|
||||||
protected final TIntIntHashMap domainIndexToId = new TIntIntHashMap();
|
* @param resultCount update the best result count results
|
||||||
protected final TIntIntHashMap domainIdToIndex = new TIntIntHashMap();
|
* @param accumulatorP the accumulator to use to store the results
|
||||||
|
*/
|
||||||
protected TIntArrayList[] linkDataSrc2Dest;
|
<T> T calculate(int resultCount, Supplier<RankingResultAccumulator<T>> accumulatorP);
|
||||||
protected TIntArrayList[] linkDataDest2Src;
|
|
||||||
|
|
||||||
public final Set<String> originDomains = new HashSet<>();
|
|
||||||
public final Set<Integer> originDomainIds = new HashSet<>();
|
|
||||||
|
|
||||||
private int maxKnownUrls = Integer.MAX_VALUE;
|
|
||||||
|
|
||||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
|
||||||
|
|
||||||
private final RankingDomainFetcher domains;
|
|
||||||
|
|
||||||
public RankingAlgorithm(RankingDomainFetcher domains, String... origins) {
|
|
||||||
this.domains = domains;
|
|
||||||
|
|
||||||
originDomains.addAll(Arrays.asList(origins));
|
|
||||||
|
|
||||||
domains.getDomains(domainData -> {
|
|
||||||
int id = domainData.id;
|
|
||||||
|
|
||||||
domainsById.put(id, domainData);
|
|
||||||
|
|
||||||
domainIndexToId.put(domainIndexToId.size(), id);
|
|
||||||
domainIdToIndex.put(id, domainIdToIndex.size());
|
|
||||||
});
|
|
||||||
|
|
||||||
linkDataSrc2Dest = new TIntArrayList[domainIndexToId.size()];
|
|
||||||
linkDataDest2Src = new TIntArrayList[domainIndexToId.size()];
|
|
||||||
|
|
||||||
domains.eachDomainLink((src, dst) -> {
|
|
||||||
if (src == dst) return;
|
|
||||||
|
|
||||||
if (domainsById.contains(src) && domainsById.contains(dst)) {
|
|
||||||
|
|
||||||
int srcIdx = domainIdToIndex.get(src);
|
|
||||||
int dstIdx = domainIdToIndex.get(domainsById.get(dst).resolveAlias());
|
|
||||||
|
|
||||||
if (linkDataSrc2Dest[srcIdx] == null) {
|
|
||||||
linkDataSrc2Dest[srcIdx] = new TIntArrayList();
|
|
||||||
}
|
|
||||||
linkDataSrc2Dest[srcIdx].add(dstIdx);
|
|
||||||
|
|
||||||
if (linkDataDest2Src[dstIdx] == null) {
|
|
||||||
linkDataDest2Src[dstIdx] = new TIntArrayList();
|
|
||||||
}
|
|
||||||
linkDataDest2Src[dstIdx].add(srcIdx);
|
|
||||||
}
|
|
||||||
});
|
|
||||||
|
|
||||||
for (var namePattern : this.originDomains) {
|
|
||||||
domains.domainsByPattern(namePattern, i -> {
|
|
||||||
int ival = domainIdToIndex.get(i);
|
|
||||||
if (ival != domainIdToIndex.getNoEntryValue() || domainIndexToId.get(0) == i) {
|
|
||||||
originDomainIds.add(ival);
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
logger.debug("No value for {}", i);
|
|
||||||
}
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
logger.info("Origin Domains: {}", originDomainIds.size());
|
|
||||||
}
|
|
||||||
|
|
||||||
public RankingDomainData getDomainData(int id) {
|
|
||||||
return domainsById.get(id);
|
|
||||||
}
|
|
||||||
|
|
||||||
public void addPeripheralNodes() {
|
|
||||||
|
|
||||||
int newNodesIdxCutoff = domainIdToIndex.size();
|
|
||||||
|
|
||||||
logger.info("Inserting peripheral nodes");
|
|
||||||
|
|
||||||
domains.getPeripheralDomains(domainData -> {
|
|
||||||
int id = domainData.id;
|
|
||||||
|
|
||||||
if (domainsById.put(id, domainData) == null) { // true if id was not already present
|
|
||||||
domainIndexToId.put(domainIndexToId.size(), id);
|
|
||||||
domainIdToIndex.put(id, domainIdToIndex.size());
|
|
||||||
}
|
|
||||||
});
|
|
||||||
|
|
||||||
linkDataSrc2Dest = Arrays.copyOf(linkDataSrc2Dest, domainIndexToId.size());
|
|
||||||
linkDataDest2Src = Arrays.copyOf(linkDataDest2Src, domainIndexToId.size());
|
|
||||||
|
|
||||||
domains.eachDomainLink((src, dst) -> {
|
|
||||||
if (src == dst) return;
|
|
||||||
|
|
||||||
if (domainsById.contains(src) && domainsById.contains(dst)) {
|
|
||||||
int srcIdx = domainIdToIndex.get(src);
|
|
||||||
int dstIdx = domainIdToIndex.get(domainsById.get(dst).resolveAlias());
|
|
||||||
|
|
||||||
// This looks like a bug, but it improves the results
|
|
||||||
if (srcIdx < newNodesIdxCutoff || dstIdx < newNodesIdxCutoff)
|
|
||||||
return;
|
|
||||||
|
|
||||||
if (linkDataSrc2Dest[srcIdx] == null) {
|
|
||||||
linkDataSrc2Dest[srcIdx] = new TIntArrayList();
|
|
||||||
}
|
|
||||||
linkDataSrc2Dest[srcIdx].add(dstIdx);
|
|
||||||
|
|
||||||
if (linkDataDest2Src[dstIdx] == null) {
|
|
||||||
linkDataDest2Src[dstIdx] = new TIntArrayList();
|
|
||||||
}
|
|
||||||
linkDataDest2Src[dstIdx].add(srcIdx);
|
|
||||||
}
|
|
||||||
});
|
|
||||||
|
|
||||||
logger.info("Peripheral nodes inserted {} -> {}", newNodesIdxCutoff, domainIdToIndex.size());
|
|
||||||
}
|
|
||||||
|
|
||||||
public int size() {
|
|
||||||
return domainsById.size();
|
|
||||||
}
|
|
||||||
|
|
||||||
public <T> T pageRank(int resultCount, Supplier<RankingResultAccumulator<T>> accumulatorP) {
|
|
||||||
RankVector rank = new RankVector(1.d / domainsById.size());
|
|
||||||
|
|
||||||
int iter_max = 100;
|
|
||||||
for (int i = 0; i < iter_max; i++) {
|
|
||||||
RankVector newRank = createNewRankVector(rank);
|
|
||||||
|
|
||||||
double oldNorm = rank.norm();
|
|
||||||
double newNorm = newRank.norm();
|
|
||||||
double dNorm = oldNorm - newNorm;
|
|
||||||
|
|
||||||
if (i < iter_max-1) {
|
|
||||||
adjustRankVector(newRank, dNorm, oldNorm);
|
|
||||||
}
|
|
||||||
|
|
||||||
rank = newRank;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
return rank.getRanking(resultCount, accumulatorP).get();
|
|
||||||
}
|
|
||||||
|
|
||||||
public <T> T pageRankWithPeripheralNodes(int resultCount, Supplier<RankingResultAccumulator<T>> accumulatorP) {
|
|
||||||
RankVector rank = new RankVector(1.d / domainsById.size());
|
|
||||||
|
|
||||||
int iter_max = 100;
|
|
||||||
|
|
||||||
for (int i = 0; i < iter_max; i++) {
|
|
||||||
if (i == iter_max-1) {
|
|
||||||
addPeripheralNodes();
|
|
||||||
}
|
|
||||||
RankVector newRank = createNewRankVector(rank);
|
|
||||||
|
|
||||||
double oldNorm = rank.norm();
|
|
||||||
double newNorm = newRank.norm();
|
|
||||||
double dNorm = oldNorm - newNorm;
|
|
||||||
|
|
||||||
if (i < iter_max-1) {
|
|
||||||
adjustRankVector(newRank, dNorm, oldNorm);
|
|
||||||
}
|
|
||||||
|
|
||||||
rank = newRank;
|
|
||||||
}
|
|
||||||
|
|
||||||
logger.info("PRWPN iteration done");
|
|
||||||
|
|
||||||
return rank.getRanking(resultCount, accumulatorP).get();
|
|
||||||
}
|
|
||||||
|
|
||||||
abstract void adjustRankVector(RankVector vector, double dNorm, double oldNorm);
|
|
||||||
|
|
||||||
abstract RankVector createNewRankVector(RankVector rank);
|
|
||||||
|
|
||||||
public boolean includeInRanking(RankingDomainData data) {
|
|
||||||
if (data.isAlias())
|
|
||||||
return false;
|
|
||||||
if (data.isSpecial())
|
|
||||||
return false;
|
|
||||||
if (data.isSocialMedia())
|
|
||||||
return false;
|
|
||||||
if (data.knownUrls > maxKnownUrls)
|
|
||||||
return false;
|
|
||||||
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void setMaxKnownUrls(int maxKnownUrls) {
|
|
||||||
this.maxKnownUrls = maxKnownUrls;
|
|
||||||
}
|
|
||||||
public class RankVector {
|
|
||||||
private final double[] rank;
|
|
||||||
|
|
||||||
public RankVector(double defaultValue) {
|
|
||||||
rank = new double[domainIndexToId.size()];
|
|
||||||
if (defaultValue != 0.) {
|
|
||||||
Arrays.fill(rank, defaultValue);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
public void set(int id, double value) {
|
|
||||||
rank[id] = value;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void increment(int id, double value) {
|
|
||||||
rank[id] += value;
|
|
||||||
}
|
|
||||||
|
|
||||||
public double get(int id) {
|
|
||||||
if (id >= rank.length) return 0.;
|
|
||||||
|
|
||||||
return rank[id];
|
|
||||||
}
|
|
||||||
|
|
||||||
public double norm() {
|
|
||||||
double v = 0.;
|
|
||||||
for (double value : rank) {
|
|
||||||
v += Math.abs(value);
|
|
||||||
}
|
|
||||||
return v;
|
|
||||||
}
|
|
||||||
|
|
||||||
public double norm(RankVector other) {
|
|
||||||
double v = 0.;
|
|
||||||
for (int i = 0; i < rank.length; i++) {
|
|
||||||
v += Math.abs(rank[i] - other.get(i));
|
|
||||||
}
|
|
||||||
return v;
|
|
||||||
}
|
|
||||||
|
|
||||||
public <T> RankingResultAccumulator<T> getRanking(int numResults, Supplier<RankingResultAccumulator<T>> accumulatorP) {
|
|
||||||
|
|
||||||
if (numResults <= 0) {
|
|
||||||
numResults = domainIdToIndex.size();
|
|
||||||
}
|
|
||||||
numResults = min(numResults, min(domainIdToIndex.size(), rank.length));
|
|
||||||
|
|
||||||
int[] nodes = sortOrder(rank);
|
|
||||||
var accumulator = accumulatorP.get();
|
|
||||||
|
|
||||||
for (int i = 0; i < numResults; i++) {
|
|
||||||
int id = domainIndexToId.get(nodes[i]);
|
|
||||||
|
|
||||||
if (includeInRanking(domainsById.get(id)))
|
|
||||||
accumulator.add(id, i);
|
|
||||||
}
|
|
||||||
|
|
||||||
return accumulator;
|
|
||||||
}
|
|
||||||
|
|
||||||
private static int[] sortOrder(double[] values) {
|
|
||||||
|
|
||||||
int[] ret = new int[values.length];
|
|
||||||
Arrays.setAll(ret, i->i);
|
|
||||||
IntArrays.quickSort(ret, (i,j) -> (int) Math.signum(values[j] - values[i]));
|
|
||||||
|
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -1,42 +0,0 @@
|
|||||||
package nu.marginalia.ranking;
|
|
||||||
|
|
||||||
|
|
||||||
import nu.marginalia.ranking.data.RankingDomainFetcher;
|
|
||||||
|
|
||||||
public class ReversePageRank extends RankingAlgorithm {
|
|
||||||
|
|
||||||
|
|
||||||
public ReversePageRank(RankingDomainFetcher domains, String... origins) {
|
|
||||||
super(domains, origins);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
RankVector createNewRankVector(RankVector rank) {
|
|
||||||
|
|
||||||
double rankNorm = rank.norm();
|
|
||||||
RankVector newRank = new RankVector(0);
|
|
||||||
|
|
||||||
for (int domainId = 0; domainId < domainIndexToId.size(); domainId++) {
|
|
||||||
|
|
||||||
var links = linkDataSrc2Dest[domainId];
|
|
||||||
double newRankValue = 0;
|
|
||||||
|
|
||||||
if (links != null && links.size() > 0) {
|
|
||||||
for (int j = 0; j < links.size(); j++) {
|
|
||||||
var revLinks = linkDataDest2Src[links.getQuick(j)];
|
|
||||||
newRankValue += rank.get(links.getQuick(j)) / revLinks.size();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
newRank.set(domainId, 0.85*newRankValue/rankNorm);
|
|
||||||
}
|
|
||||||
|
|
||||||
return newRank;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
void adjustRankVector(RankVector vector, double dNorm, double oldNorm) {
|
|
||||||
originDomainIds.forEach(id -> vector.increment(id, 1.0 / originDomainIds.size()));
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
@ -1,50 +0,0 @@
|
|||||||
package nu.marginalia.ranking;
|
|
||||||
|
|
||||||
|
|
||||||
import nu.marginalia.ranking.data.RankingDomainFetcher;
|
|
||||||
|
|
||||||
public class StandardPageRank extends RankingAlgorithm {
|
|
||||||
|
|
||||||
public StandardPageRank(RankingDomainFetcher domains, String... origins) {
|
|
||||||
super(domains, origins);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
RankVector createNewRankVector(RankVector rank) {
|
|
||||||
RankVector newRank = new RankVector(0);
|
|
||||||
|
|
||||||
for (int domainId = 0; domainId < domainIndexToId.size(); domainId++) {
|
|
||||||
|
|
||||||
var links = linkDataDest2Src[domainId];
|
|
||||||
double newRankValue = 0;
|
|
||||||
|
|
||||||
if (links != null && links.size() > 0) {
|
|
||||||
for (int j = 0; j < links.size(); j++) {
|
|
||||||
int linkedDomain = links.getQuick(j);
|
|
||||||
|
|
||||||
final int linkSize;
|
|
||||||
var backLinks = linkDataSrc2Dest[linkedDomain];
|
|
||||||
|
|
||||||
if (backLinks == null) {
|
|
||||||
linkSize = 1;
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
linkSize = backLinks.size();
|
|
||||||
}
|
|
||||||
|
|
||||||
newRankValue += rank.get(linkedDomain) / linkSize;
|
|
||||||
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
newRank.set(domainId, 0.85 * newRankValue);
|
|
||||||
}
|
|
||||||
return newRank;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
void adjustRankVector(RankVector vector, double dNorm, double oldNorm) {
|
|
||||||
originDomainIds.forEach(id -> vector.increment(id, 0.15 / originDomainIds.size() ));
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
@ -0,0 +1,58 @@
|
|||||||
|
package nu.marginalia.ranking.data;
|
||||||
|
|
||||||
|
import com.zaxxer.hikari.HikariDataSource;
|
||||||
|
import org.jgrapht.Graph;
|
||||||
|
|
||||||
|
import java.sql.SQLException;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
public abstract class AbstractGraphSource implements GraphSource {
|
||||||
|
protected final HikariDataSource dataSource;
|
||||||
|
|
||||||
|
protected AbstractGraphSource(HikariDataSource dataSource) {
|
||||||
|
this.dataSource = dataSource;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public abstract Graph<Integer, ?> getGraph();
|
||||||
|
|
||||||
|
/** Adds all indexed domain ids as vertices to the graph. */
|
||||||
|
protected void addVertices(Graph<Integer, ?> graph) throws SQLException {
|
||||||
|
try (var conn = dataSource.getConnection();
|
||||||
|
var stmt = conn.prepareStatement("""
|
||||||
|
SELECT ID
|
||||||
|
FROM EC_DOMAIN
|
||||||
|
WHERE NODE_AFFINITY > 0
|
||||||
|
""");
|
||||||
|
var rs = stmt.executeQuery())
|
||||||
|
{
|
||||||
|
while (rs.next()) {
|
||||||
|
graph.addVertex(rs.getInt(1));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public List<Integer> domainIds(List<String> domainNameList) {
|
||||||
|
try (var conn = dataSource.getConnection();
|
||||||
|
var stmt = conn.prepareStatement("""
|
||||||
|
SELECT ID
|
||||||
|
FROM EC_DOMAIN
|
||||||
|
WHERE DOMAIN_NAME IN (?)
|
||||||
|
"""))
|
||||||
|
{
|
||||||
|
stmt.setArray(1, conn.createArrayOf("VARCHAR", domainNameList.toArray()));
|
||||||
|
try (var rs = stmt.executeQuery()) {
|
||||||
|
var result = new ArrayList<Integer>();
|
||||||
|
while (rs.next()) {
|
||||||
|
result.add(rs.getInt(1));
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
catch (SQLException ex) {
|
||||||
|
throw new RuntimeException(ex);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,15 @@
|
|||||||
|
package nu.marginalia.ranking.data;
|
||||||
|
|
||||||
|
import org.jgrapht.Graph;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
/** A source for the link graph (or pseudo-link graph)
|
||||||
|
* to use when ranking domain. */
|
||||||
|
public interface GraphSource {
|
||||||
|
|
||||||
|
/** Construct the graph */
|
||||||
|
Graph<Integer, ?> getGraph();
|
||||||
|
|
||||||
|
List<Integer> domainIds(List<String> domainNameList);
|
||||||
|
}
|
@ -0,0 +1,49 @@
|
|||||||
|
package nu.marginalia.ranking.data;
|
||||||
|
|
||||||
|
import com.google.inject.Inject;
|
||||||
|
import com.zaxxer.hikari.HikariDataSource;
|
||||||
|
import lombok.SneakyThrows;
|
||||||
|
import nu.marginalia.query.client.QueryClient;
|
||||||
|
import org.jgrapht.Graph;
|
||||||
|
import org.jgrapht.graph.DefaultDirectedGraph;
|
||||||
|
import org.jgrapht.graph.DefaultEdge;
|
||||||
|
|
||||||
|
import java.sql.SQLException;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
/** A source for the inverted link graph,
|
||||||
|
* which is the same as the regular graph except
|
||||||
|
* the direction of the links have been inverted */
|
||||||
|
public class InvertedLinkGraphSource extends AbstractGraphSource {
|
||||||
|
private final QueryClient queryClient;
|
||||||
|
|
||||||
|
@Inject
|
||||||
|
public InvertedLinkGraphSource(HikariDataSource dataSource, QueryClient queryClient) {
|
||||||
|
super(dataSource);
|
||||||
|
this.queryClient = queryClient;
|
||||||
|
}
|
||||||
|
@SneakyThrows
|
||||||
|
@Override
|
||||||
|
public Graph<Integer, ?> getGraph() {
|
||||||
|
Graph<Integer, ?> graph = new DefaultDirectedGraph<>(DefaultEdge.class);
|
||||||
|
|
||||||
|
addVertices(graph);
|
||||||
|
|
||||||
|
var allLinks = queryClient.getAllDomainLinks();
|
||||||
|
var iter = allLinks.iterator();
|
||||||
|
while (iter.advance()) {
|
||||||
|
if (!graph.containsVertex(iter.dest())) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if (!graph.containsVertex(iter.source())) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Invert the edge
|
||||||
|
graph.addEdge(iter.dest(), iter.source());
|
||||||
|
}
|
||||||
|
|
||||||
|
return graph;
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,43 @@
|
|||||||
|
package nu.marginalia.ranking.data;
|
||||||
|
|
||||||
|
import com.google.inject.Inject;
|
||||||
|
import com.zaxxer.hikari.HikariDataSource;
|
||||||
|
import lombok.SneakyThrows;
|
||||||
|
import nu.marginalia.query.client.QueryClient;
|
||||||
|
import org.jgrapht.Graph;
|
||||||
|
import org.jgrapht.graph.DefaultDirectedGraph;
|
||||||
|
import org.jgrapht.graph.DefaultEdge;
|
||||||
|
|
||||||
|
/** A source for the regular link graph. */
|
||||||
|
public class LinkGraphSource extends AbstractGraphSource {
|
||||||
|
private final QueryClient queryClient;
|
||||||
|
|
||||||
|
@Inject
|
||||||
|
public LinkGraphSource(HikariDataSource dataSource, QueryClient queryClient) {
|
||||||
|
super(dataSource);
|
||||||
|
this.queryClient = queryClient;
|
||||||
|
}
|
||||||
|
|
||||||
|
@SneakyThrows
|
||||||
|
@Override
|
||||||
|
public Graph<Integer, ?> getGraph() {
|
||||||
|
Graph<Integer, ?> graph = new DefaultDirectedGraph<>(DefaultEdge.class);
|
||||||
|
|
||||||
|
addVertices(graph);
|
||||||
|
|
||||||
|
var allLinks = queryClient.getAllDomainLinks();
|
||||||
|
var iter = allLinks.iterator();
|
||||||
|
while (iter.advance()) {
|
||||||
|
if (!graph.containsVertex(iter.dest())) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if (!graph.containsVertex(iter.source())) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
graph.addEdge(iter.source(), iter.dest());
|
||||||
|
}
|
||||||
|
|
||||||
|
return graph;
|
||||||
|
}
|
||||||
|
}
|
@ -1,32 +0,0 @@
|
|||||||
package nu.marginalia.ranking.data;
|
|
||||||
|
|
||||||
import lombok.AllArgsConstructor;
|
|
||||||
import lombok.Data;
|
|
||||||
import nu.marginalia.model.crawl.DomainIndexingState;
|
|
||||||
|
|
||||||
@Data
|
|
||||||
@AllArgsConstructor
|
|
||||||
public class RankingDomainData {
|
|
||||||
public final int id;
|
|
||||||
public final String name;
|
|
||||||
private int alias;
|
|
||||||
public DomainIndexingState state;
|
|
||||||
public final int knownUrls;
|
|
||||||
|
|
||||||
public int resolveAlias() {
|
|
||||||
if (alias == 0) return id;
|
|
||||||
return alias;
|
|
||||||
}
|
|
||||||
|
|
||||||
public boolean isAlias() {
|
|
||||||
return alias != 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
public boolean isSpecial() {
|
|
||||||
return DomainIndexingState.SPECIAL == state;
|
|
||||||
}
|
|
||||||
|
|
||||||
public boolean isSocialMedia() {
|
|
||||||
return DomainIndexingState.SOCIAL_MEDIA == state;
|
|
||||||
}
|
|
||||||
}
|
|
@ -1,138 +0,0 @@
|
|||||||
package nu.marginalia.ranking.data;
|
|
||||||
|
|
||||||
import com.google.inject.Inject;
|
|
||||||
import com.google.inject.Singleton;
|
|
||||||
import com.zaxxer.hikari.HikariDataSource;
|
|
||||||
import nu.marginalia.db.DomainBlacklistImpl;
|
|
||||||
import nu.marginalia.model.crawl.DomainIndexingState;
|
|
||||||
import nu.marginalia.query.client.QueryClient;
|
|
||||||
import org.slf4j.Logger;
|
|
||||||
import org.slf4j.LoggerFactory;
|
|
||||||
|
|
||||||
import java.sql.SQLException;
|
|
||||||
import java.util.function.Consumer;
|
|
||||||
import java.util.function.IntConsumer;
|
|
||||||
|
|
||||||
@Singleton
|
|
||||||
public class RankingDomainFetcher {
|
|
||||||
protected final HikariDataSource dataSource;
|
|
||||||
private final QueryClient queryClient;
|
|
||||||
protected final DomainBlacklistImpl blacklist;
|
|
||||||
protected final Logger logger = LoggerFactory.getLogger(getClass());
|
|
||||||
|
|
||||||
protected boolean getNames = false;
|
|
||||||
|
|
||||||
@Inject
|
|
||||||
public RankingDomainFetcher(HikariDataSource dataSource,
|
|
||||||
QueryClient queryClient,
|
|
||||||
DomainBlacklistImpl blacklist) {
|
|
||||||
this.dataSource = dataSource;
|
|
||||||
this.queryClient = queryClient;
|
|
||||||
this.blacklist = blacklist;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void retainNames() {
|
|
||||||
this.getNames = true;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void getDomains(Consumer<RankingDomainData> consumer) {
|
|
||||||
String query;
|
|
||||||
if (getNames) {
|
|
||||||
query = """
|
|
||||||
SELECT EC_DOMAIN.ID,DOMAIN_NAME,DOMAIN_ALIAS,STATE,KNOWN_URLS
|
|
||||||
FROM EC_DOMAIN
|
|
||||||
INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
|
|
||||||
WHERE NODE_AFFINITY>0
|
|
||||||
GROUP BY EC_DOMAIN.ID
|
|
||||||
""";
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
query = """
|
|
||||||
SELECT EC_DOMAIN.ID,\"\",DOMAIN_ALIAS,STATE,KNOWN_URLS
|
|
||||||
FROM EC_DOMAIN
|
|
||||||
INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
|
|
||||||
WHERE NODE_AFFINITY>0
|
|
||||||
GROUP BY EC_DOMAIN.ID
|
|
||||||
""";
|
|
||||||
}
|
|
||||||
|
|
||||||
getDomains(query, consumer);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public void getPeripheralDomains(Consumer<RankingDomainData> consumer) {
|
|
||||||
String query;
|
|
||||||
if (getNames) {
|
|
||||||
query = """
|
|
||||||
SELECT EC_DOMAIN.ID,DOMAIN_NAME,DOMAIN_ALIAS,STATE,KNOWN_URLS
|
|
||||||
FROM EC_DOMAIN
|
|
||||||
INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
|
|
||||||
WHERE ((INDEXED>1 AND IS_ALIVE)
|
|
||||||
OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0))
|
|
||||||
GROUP BY EC_DOMAIN.ID
|
|
||||||
""";
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
query = """
|
|
||||||
SELECT EC_DOMAIN.ID,\"\",DOMAIN_ALIAS,STATE,KNOWN_URLS
|
|
||||||
FROM EC_DOMAIN
|
|
||||||
INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
|
|
||||||
WHERE ((INDEXED>1 AND IS_ALIVE)
|
|
||||||
OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0))
|
|
||||||
GROUP BY EC_DOMAIN.ID
|
|
||||||
""";
|
|
||||||
}
|
|
||||||
|
|
||||||
getDomains(query, consumer);
|
|
||||||
}
|
|
||||||
|
|
||||||
protected void getDomains(String query, Consumer<RankingDomainData> consumer) {
|
|
||||||
try (var conn = dataSource.getConnection(); var stmt = conn.prepareStatement(query)) {
|
|
||||||
stmt.setFetchSize(10000);
|
|
||||||
var rsp = stmt.executeQuery();
|
|
||||||
while (rsp.next()) {
|
|
||||||
int id = rsp.getInt(1);
|
|
||||||
if (!blacklist.isBlacklisted(id)) {
|
|
||||||
consumer.accept(
|
|
||||||
new RankingDomainData(id,
|
|
||||||
rsp.getString(2),
|
|
||||||
rsp.getInt(3),
|
|
||||||
DomainIndexingState.valueOf(rsp.getString(4)),
|
|
||||||
rsp.getInt(5)));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
catch (SQLException ex) {
|
|
||||||
logger.error("Failed to fetch domains", ex);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
public void eachDomainLink(DomainLinkConsumer consumer) {
|
|
||||||
|
|
||||||
var allLinks = queryClient.getAllDomainLinks();
|
|
||||||
var iter = allLinks.iterator();
|
|
||||||
|
|
||||||
while (iter.advance()) {
|
|
||||||
consumer.accept(iter.source(), iter.dest());
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
public void domainsByPattern(String pattern, IntConsumer idConsumer) {
|
|
||||||
try (var conn = dataSource.getConnection();
|
|
||||||
var stmt = conn.createStatement()) {
|
|
||||||
// This is sourced from a config file --v
|
|
||||||
var rsp = stmt.executeQuery("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME LIKE '" + pattern + "'");
|
|
||||||
while (rsp.next()) {
|
|
||||||
idConsumer.accept(rsp.getInt(1));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
catch (SQLException ex) {
|
|
||||||
logger.error("Failed to fetch domains by pattern", ex);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
public interface DomainLinkConsumer {
|
|
||||||
void accept(int from, int to);
|
|
||||||
}
|
|
||||||
}
|
|
@ -1,93 +0,0 @@
|
|||||||
package nu.marginalia.ranking.data;
|
|
||||||
|
|
||||||
import com.google.inject.Inject;
|
|
||||||
import com.google.inject.Singleton;
|
|
||||||
import com.zaxxer.hikari.HikariDataSource;
|
|
||||||
import nu.marginalia.db.DomainBlacklistImpl;
|
|
||||||
import nu.marginalia.query.client.QueryClient;
|
|
||||||
import org.slf4j.LoggerFactory;
|
|
||||||
|
|
||||||
import java.sql.SQLException;
|
|
||||||
import java.util.function.Consumer;
|
|
||||||
|
|
||||||
/** Ranking-domain fetcher backed by the EC_DOMAIN_NEIGHBORS_2 similarity table
 *  instead of the raw link graph. Links produced by this fetcher are undirected:
 *  each neighbor row is reported in both directions. */
@Singleton
public class RankingDomainFetcherForSimilarityData extends RankingDomainFetcher {
    // True when EC_DOMAIN_NEIGHBORS_2 contained at least one row at construction time.
    final boolean hasData;

    @Inject
    public RankingDomainFetcherForSimilarityData(HikariDataSource dataSource, QueryClient queryClient, DomainBlacklistImpl blacklist) {
        super(dataSource, queryClient, blacklist);

        hasData = isDomainNeighborTablePopulated(dataSource);
    }

    /** Probe whether the similarity table holds any data at all.
     *  Returns false (after logging) if the query fails, e.g. when the table is absent. */
    private static boolean isDomainNeighborTablePopulated(HikariDataSource dataSource) {
        try (var conn = dataSource.getConnection();
             var stmt = conn.createStatement();
             var rs = stmt.executeQuery("SELECT DOMAIN_ID FROM EC_DOMAIN_NEIGHBORS_2 LIMIT 1")) {

            return rs.next();
        }
        catch (SQLException ex) {
            LoggerFactory
                    .getLogger(RankingDomainFetcherForSimilarityData.class)
                    .error("Failed to get count from EC_DOMAIN_NEIGHBORS_2", ex);
            return false;
        }
    }

    /** Whether the backing similarity table was populated when this fetcher was created. */
    public boolean hasData() {
        return hasData;
    }

    /** Stream every similarity edge to the consumer. Each stored row is emitted
     *  in both directions, since similarity is symmetric. */
    public void eachDomainLink(DomainLinkConsumer consumer) {
        try (var conn = dataSource.getConnection();
             var stmt = conn.prepareStatement("SELECT DOMAIN_ID, NEIGHBOR_ID, RELATEDNESS FROM EC_DOMAIN_NEIGHBORS_2"))
        {
            stmt.setFetchSize(10000);

            var rsp = stmt.executeQuery();

            while (rsp.next()) {
                int src = rsp.getInt(1);
                int dst = rsp.getInt(2);

                // these "links" are bidi
                consumer.accept(src, dst);
                consumer.accept(dst, src);
            }
        }
        catch (SQLException ex) {
            logger.error("Failed to fetch domain links", ex);
        }
    }

    /** Fetch all ranked domains, delegating to the superclass query runner.
     *  When getNames (inherited flag) is false, the name column is replaced with
     *  an empty string to avoid shipping names that won't be used. */
    public void getDomains(Consumer<RankingDomainData> consumer) {
        String query;
        if (getNames) {
            query =
                    """
                    SELECT EC_DOMAIN.ID,DOMAIN_NAME,DOMAIN_ALIAS,STATE,COALESCE(KNOWN_URLS, 0)
                    FROM EC_DOMAIN
                    INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
                    GROUP BY EC_DOMAIN.ID
                    """;
        }
        else {
            query =
                    """
                    SELECT EC_DOMAIN.ID,"",DOMAIN_ALIAS,STATE,COALESCE(KNOWN_URLS, 0)
                    FROM EC_DOMAIN
                    INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
                    GROUP BY EC_DOMAIN.ID
                    """;
        }

        getDomains(query, consumer);
    }


    /** Intentionally a no-op for this fetcher. */
    public void getPeripheralDomains(Consumer<RankingDomainData> consumer) {
        // This is not relevant for this variant of pagerank since it is bidirectional
    }

}
|
|
@ -0,0 +1,65 @@
|
|||||||
|
package nu.marginalia.ranking.data;
|
||||||
|
|
||||||
|
import com.google.inject.Inject;
|
||||||
|
import com.zaxxer.hikari.HikariDataSource;
|
||||||
|
import lombok.SneakyThrows;
|
||||||
|
import org.jgrapht.Graph;
|
||||||
|
import org.jgrapht.graph.DefaultUndirectedWeightedGraph;
|
||||||
|
import org.jgrapht.graph.DefaultWeightedEdge;
|
||||||
|
|
||||||
|
import java.sql.SQLException;
|
||||||
|
|
||||||
|
/** A source for the similarity graph, stored in EC_DOMAIN_NEIGHBORS_2,
|
||||||
|
* which contains the cosine similarity of the incident link vectors in the link graph.
|
||||||
|
* */
|
||||||
|
public class SimilarityGraphSource extends AbstractGraphSource {
|
||||||
|
@Inject
|
||||||
|
public SimilarityGraphSource(HikariDataSource dataSource) {
|
||||||
|
super(dataSource);
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Check if the data source is available. */
|
||||||
|
public boolean isAvailable() {
|
||||||
|
try (var conn = dataSource.getConnection();
|
||||||
|
var stmt = conn.prepareStatement("""
|
||||||
|
SELECT *
|
||||||
|
FROM EC_DOMAIN_NEIGHBORS_2
|
||||||
|
LIMIT 1
|
||||||
|
""");
|
||||||
|
var rs = stmt.executeQuery())
|
||||||
|
{
|
||||||
|
return rs.next();
|
||||||
|
}
|
||||||
|
catch (SQLException ex) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@SneakyThrows
|
||||||
|
@Override
|
||||||
|
public Graph<Integer, ?> getGraph() {
|
||||||
|
Graph<Integer, ?> graph = new DefaultUndirectedWeightedGraph<>(DefaultWeightedEdge.class);
|
||||||
|
|
||||||
|
addVertices(graph);
|
||||||
|
|
||||||
|
try (var conn = dataSource.getConnection()) {
|
||||||
|
try (var stmt = conn.prepareStatement("""
|
||||||
|
SELECT DOMAIN_ID, NEIGHBOR_ID, RELATEDNESS
|
||||||
|
FROM EC_DOMAIN_NEIGHBORS_2
|
||||||
|
"""))
|
||||||
|
{
|
||||||
|
var rs = stmt.executeQuery();
|
||||||
|
while (rs.next()) {
|
||||||
|
int src = rs.getInt(1);
|
||||||
|
int dest = rs.getInt(2);
|
||||||
|
double weight = rs.getDouble(3);
|
||||||
|
|
||||||
|
graph.addEdge(src, dest);
|
||||||
|
graph.setEdgeWeight(src, dest, weight);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return graph;
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,375 @@
|
|||||||
|
package nu.marginalia.ranking.jgrapht;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* (C) Copyright 2016-2023, by Dimitrios Michail and Contributors.
|
||||||
|
*
|
||||||
|
*
|
||||||
|
* JGraphT : a free Java graph-theory library
|
||||||
|
*
|
||||||
|
* See the CONTRIBUTORS.md file distributed with this work for additional
|
||||||
|
* information regarding copyright ownership.
|
||||||
|
*
|
||||||
|
* This program and the accompanying materials are made available under the
|
||||||
|
* terms of the Eclipse Public License 2.0 which is available at
|
||||||
|
* http://www.eclipse.org/legal/epl-2.0, or the
|
||||||
|
* GNU Lesser General Public License v2.1 or later
|
||||||
|
* which is available at
|
||||||
|
* http://www.gnu.org/licenses/old-licenses/lgpl-2.1-standalone.html.
|
||||||
|
*
|
||||||
|
* SPDX-License-Identifier: EPL-2.0 OR LGPL-2.1-or-later
|
||||||
|
*/
|
||||||
|
|
||||||
|
/* (modified by @vlofgren to add personalization) */
|
||||||
|
|
||||||
|
import org.jgrapht.*;
|
||||||
|
import org.jgrapht.alg.interfaces.*;
|
||||||
|
|
||||||
|
import java.util.*;
|
||||||
|
|
||||||
|
/** A Personalized PageRank implementation, adapted from JGraphT's PageRank.
 *
 *  Unlike standard PageRank, the teleportation probability is distributed only
 *  over the vertices in the supplied influence set, biasing the resulting scores
 *  toward the neighborhood of those vertices. See {@link #teleProp()} for the
 *  modification relative to the upstream implementation.
 */
public class PersonalizedPageRank<V, E>
    implements VertexScoringAlgorithm<V, Double>
{
    /**
     * Default number of maximum iterations.
     */
    public static final int MAX_ITERATIONS_DEFAULT = 100;

    /**
     * Default value for the tolerance. The calculation will stop if the difference of PageRank
     * values between iterations change less than this value.
     */
    public static final double TOLERANCE_DEFAULT = 0.0001;

    /**
     * Damping factor default value.
     */
    public static final double DAMPING_FACTOR_DEFAULT = 0.85d;

    /**
     * The input graph
     */
    private final Graph<V, E> graph;
    // Vertices that receive the teleportation probability mass (the personalization set).
    private final Collection<V> influenceSet;

    /**
     * The damping factor
     */
    private final double dampingFactor;

    /**
     * Maximum iterations to run
     */
    private final int maxIterations;

    /**
     * The calculation will stop if the difference of PageRank values between iterations change less
     * than this value
     */
    private final double tolerance;

    /**
     * The result; lazily computed and cached by getScores().
     */
    private Map<V, Double> scores;

    /**
     * Create and execute an instance of Personalized PageRank.
     *
     * @param graph the input graph
     * @param influenceSet the set of vertices to personalize the Personalized PageRank calculation
     */
    public PersonalizedPageRank(Graph<V, E> graph, Collection<V> influenceSet)
    {
        this(graph, influenceSet, DAMPING_FACTOR_DEFAULT, MAX_ITERATIONS_DEFAULT, TOLERANCE_DEFAULT);
    }

    /**
     * Create and execute an instance of Personalized PageRank.
     *
     * @param graph the input graph
     * @param influenceSet the set of vertices to personalize the Personalized PageRank calculation
     * @param dampingFactor the damping factor
     */
    public PersonalizedPageRank(Graph<V, E> graph, Collection<V> influenceSet, double dampingFactor)
    {
        this(graph, influenceSet, dampingFactor, MAX_ITERATIONS_DEFAULT, TOLERANCE_DEFAULT);
    }

    /**
     * Create and execute an instance of Personalized PageRank.
     *
     * @param graph the input graph
     * @param influenceSet the set of vertices to personalize the Personalized PageRank calculation
     * @param dampingFactor the damping factor
     * @param maxIterations the maximum number of iterations to perform
     */
    public PersonalizedPageRank(Graph<V, E> graph, Collection<V> influenceSet, double dampingFactor, int maxIterations)
    {
        this(graph, influenceSet, dampingFactor, maxIterations, TOLERANCE_DEFAULT);
    }

    /**
     * Create and execute an instance of Personalized PageRank.
     *
     * @param graph the input graph
     * @param influenceSet the set of vertices to personalize the Personalized PageRank calculation
     * @param dampingFactor the damping factor
     * @param maxIterations the maximum number of iterations to perform
     * @param tolerance the calculation will stop if the difference of Personalized PageRank values between
     *        iterations change less than this value
     * @throws IllegalArgumentException if maxIterations is not positive, dampingFactor is outside
     *         [0,1], or tolerance is not positive
     */
    public PersonalizedPageRank(Graph<V, E> graph, Collection<V> influenceSet, double dampingFactor, int maxIterations, double tolerance)
    {
        this.graph = graph;
        this.influenceSet = influenceSet;

        if (maxIterations <= 0) {
            throw new IllegalArgumentException("Maximum iterations must be positive");
        }
        this.maxIterations = maxIterations;

        if (dampingFactor < 0.0 || dampingFactor > 1.0) {
            throw new IllegalArgumentException("Damping factor not valid");
        }
        this.dampingFactor = dampingFactor;

        if (tolerance <= 0.0) {
            throw new IllegalArgumentException("Tolerance not valid, must be positive");
        }
        this.tolerance = tolerance;
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public Map<V, Double> getScores()
    {
        // Lazily run the algorithm on first access; the Algorithm instance (and its
        // auxiliary arrays) becomes garbage once the scores map is extracted.
        if (scores == null) {
            scores = Collections.unmodifiableMap(new Algorithm().getScores());
        }
        return scores;
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public Double getVertexScore(V v)
    {
        if (!graph.containsVertex(v)) {
            throw new IllegalArgumentException("Cannot return score of unknown vertex");
        }
        return getScores().get(v);
    }

    /**
     * The actual implementation.
     *
     * <p>
     * We use this pattern with the inner class in order to be able to cache the result but also
     * allow the garbage collector to acquire all auxiliary memory used during the execution of the
     * algorithm.
     *
     * @author Dimitrios Michail
     *
     * (uses the enclosing class's V and E type parameters)
     */
    private class Algorithm
    {
        private int totalVertices;
        private boolean isWeighted;

        // Bidirectional mapping between vertices and dense [0, n) indices.
        private Map<V, Integer> vertexIndexMap;
        private V[] vertexMap;

        // Per-vertex sum of incoming edge weights (weighted mode only).
        private double[] weightSum;
        private double[] curScore;
        private double[] nextScore;
        private int[] outDegree;
        // adjList.get(i) holds the indices of i's in-neighbors.
        private ArrayList<int[]> adjList;
        // weightsList.get(i) holds the weights parallel to adjList.get(i) (weighted mode only).
        private ArrayList<double[]> weightsList;
        // Bit i set iff vertexMap[i] is in the influence set.
        private BitSet influenceIndexSet;
        @SuppressWarnings("unchecked")
        public Algorithm()
        {
            this.totalVertices = graph.vertexSet().size();
            this.isWeighted = graph.getType().isWeighted();

            /*
             * Initialize score, map vertices to [0,n) and pre-compute degrees and adjacency lists
             */
            this.curScore = new double[totalVertices];
            this.nextScore = new double[totalVertices];
            this.vertexIndexMap = new HashMap<>();
            this.vertexMap = (V[]) new Object[totalVertices];
            this.outDegree = new int[totalVertices];
            this.adjList = new ArrayList<>(totalVertices);
            this.influenceIndexSet = new BitSet(totalVertices);

            // All vertices start with an equal share of the total score mass.
            double initScore = 1.0d / totalVertices;
            int i = 0;
            for (V v : graph.vertexSet()) {
                vertexIndexMap.put(v, i);
                vertexMap[i] = v;
                outDegree[i] = graph.outDegreeOf(v);
                curScore[i] = initScore;

                if (influenceSet.contains(v)) {
                    influenceIndexSet.set(i);
                }

                i++;
            }

            if (isWeighted) {
                this.weightSum = new double[totalVertices];
                this.weightsList = new ArrayList<>(totalVertices);

                for (i = 0; i < totalVertices; i++) {
                    V v = vertexMap[i];
                    int[] inNeighbors = new int[graph.inDegreeOf(v)];
                    double[] edgeWeights = new double[graph.inDegreeOf(v)];

                    int j = 0;
                    for (E e : graph.incomingEdgesOf(v)) {
                        V w = Graphs.getOppositeVertex(graph, e, v);
                        Integer mappedVertexId = vertexIndexMap.get(w);
                        inNeighbors[j] = mappedVertexId;
                        double edgeWeight = graph.getEdgeWeight(e);
                        edgeWeights[j] += edgeWeight;
                        weightSum[mappedVertexId] += edgeWeight;
                        j++;
                    }
                    weightsList.add(edgeWeights);
                    adjList.add(inNeighbors);
                }
            } else {
                for (i = 0; i < totalVertices; i++) {
                    V v = vertexMap[i];
                    int[] inNeighbors = new int[graph.inDegreeOf(v)];
                    int j = 0;
                    for (E e : graph.incomingEdgesOf(v)) {
                        V w = Graphs.getOppositeVertex(graph, e, v);
                        inNeighbors[j++] = vertexIndexMap.get(w);
                    }
                    adjList.add(inNeighbors);
                }
            }
        }

        public Map<V, Double> getScores()
        {
            // compute
            if (isWeighted) {
                runWeighted();
            } else {
                run();
            }

            // make results user friendly
            Map<V, Double> scores = new HashMap<>();
            for (int i = 0; i < totalVertices; i++) {
                V v = vertexMap[i];
                scores.put(v, curScore[i]);
            }
            return scores;
        }

        // Power iteration for unweighted graphs; stops when either maxIterations is
        // reached or the largest per-vertex score change drops below tolerance.
        private void run()
        {
            double maxChange = tolerance;
            int iterations = maxIterations;

            while (iterations > 0 && maxChange >= tolerance) {
                double r = teleProp();

                maxChange = 0d;
                for (int i = 0; i < totalVertices; i++) {
                    double contribution = 0d;
                    for (int w : adjList.get(i)) {
                        contribution += dampingFactor * curScore[w] / outDegree[w];
                    }

                    double vOldValue = curScore[i];
                    // Only influence-set vertices receive the teleportation mass r.
                    double vNewValue = (influenceIndexSet.get(i) ? r : 0) + contribution;
                    maxChange = Math.max(maxChange, Math.abs(vNewValue - vOldValue));
                    nextScore[i] = vNewValue;
                }

                // progress
                swapScores();
                iterations--;
            }

            // remove influence factor from the scores
            double r = teleProp();
            for (int i = 0; i < totalVertices; i++) {
                curScore[i] -= (influenceIndexSet.get(i) ? r : 0);
            }
        }

        // Power iteration for weighted graphs; as run(), but each in-neighbor's
        // contribution is scaled by its edge weight over the neighbor's total out-weight.
        private void runWeighted()
        {
            double maxChange = tolerance;
            int iterations = maxIterations;

            while (iterations > 0 && maxChange >= tolerance) {
                double r = teleProp();

                maxChange = 0d;
                for (int i = 0; i < totalVertices; i++) {
                    double contribution = 0d;

                    int[] neighbors = adjList.get(i);
                    double[] weights = weightsList.get(i);
                    for (int j = 0, getLength = neighbors.length; j < getLength; j++) {
                        int w = neighbors[j];
                        contribution += dampingFactor * curScore[w] * weights[j] / weightSum[w];
                    }

                    double vOldValue = curScore[i];
                    double vNewValue = (influenceIndexSet.get(i) ? r : 0) + contribution;
                    maxChange = Math.max(maxChange, Math.abs(vNewValue - vOldValue));
                    nextScore[i] = vNewValue;
                }

                // progress
                swapScores();
                iterations--;
            }

            // remove influence factor from the scores
            double r = teleProp();
            for (int i = 0; i < totalVertices; i++) {
                curScore[i] -= (influenceIndexSet.get(i) ? r : 0);
            }
        }

        // This is the teleportation part of the algorithm, and also what is modified to personalize the PageRank:
        // instead of spreading (1 - dampingFactor) over all vertices, the mass is computed over
        // (and later redistributed to) the influence set only. Dangling influence vertices
        // (out-degree 0) contribute their entire current score instead.
        private double teleProp()
        {
            double r = 0d;
            for (int v = influenceIndexSet.nextSetBit(0);
                 v >= 0;
                 v = influenceIndexSet.nextSetBit(v + 1))
            {
                if (outDegree[v] > 0)
                    r += (1d - dampingFactor);
                else
                    r += curScore[v];
            }
            return r / influenceSet.size();
        }

        // Double-buffering: make the freshly computed scores current without allocating.
        private void swapScores()
        {
            double[] tmp = curScore;
            curScore = nextScore;
            nextScore = tmp;
        }

    }

}
|
@ -0,0 +1,75 @@
|
|||||||
|
package nu.marginalia.ranking;
|
||||||
|
|
||||||
|
import nu.marginalia.ranking.accumulator.RankingResultListAccumulator;
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
// Test the ranking algorithm with prod data.
|
||||||
|
class RankingAlgorithmWithRealDataTest {
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testRegularPR() {
|
||||||
|
if (!TestGraphSourceForLinkData.isAvailable()) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
var graphSource = new TestGraphSourceForLinkData();
|
||||||
|
var results = new PageRankDomainRanker(graphSource, List.of())
|
||||||
|
.calculate(10, RankingResultListAccumulator::new);
|
||||||
|
|
||||||
|
for (int i = 0; i < results.size(); i++) {
|
||||||
|
System.out.println(i + " " + graphSource.getName(results.get(i)));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testInvertedLinkGraph() {
|
||||||
|
if (!TestGraphSourceForInvertedLinkData.isAvailable()) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
var graphSource = new TestGraphSourceForInvertedLinkData();
|
||||||
|
var results = new PageRankDomainRanker(graphSource, List.of())
|
||||||
|
.calculate(10, RankingResultListAccumulator::new);
|
||||||
|
|
||||||
|
for (int i = 0; i < results.size(); i++) {
|
||||||
|
System.out.println(i + " " + graphSource.getName(results.get(i)));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testSimilarityPR() {
|
||||||
|
if (!TestGraphSourceForSimilarityData.isAvailable()) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
var graphSource = new TestGraphSourceForSimilarityData();
|
||||||
|
var results = new PageRankDomainRanker(graphSource, List.of())
|
||||||
|
.calculate(10, RankingResultListAccumulator::new);
|
||||||
|
|
||||||
|
for (int i = 0; i < results.size(); i++) {
|
||||||
|
System.out.println(i + " " + graphSource.getName(results.get(i)));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testSimilarityPPR() {
|
||||||
|
if (!TestGraphSourceForSimilarityData.isAvailable()) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
var graphSource = new TestGraphSourceForSimilarityData();
|
||||||
|
var results = new PageRankDomainRanker(graphSource,
|
||||||
|
List.of(1476552) // wiby.me
|
||||||
|
)
|
||||||
|
.calculate(10, RankingResultListAccumulator::new);
|
||||||
|
|
||||||
|
for (int i = 0; i < results.size(); i++) {
|
||||||
|
System.out.println(i + " " + graphSource.getName(results.get(i)));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
}
|
@ -0,0 +1,86 @@
|
|||||||
|
package nu.marginalia.ranking;
|
||||||
|
|
||||||
|
import lombok.SneakyThrows;
|
||||||
|
import nu.marginalia.array.LongArrayFactory;
|
||||||
|
import nu.marginalia.ranking.data.GraphSource;
|
||||||
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
import org.jgrapht.Graph;
|
||||||
|
import org.jgrapht.graph.DefaultDirectedGraph;
|
||||||
|
import org.jgrapht.graph.DefaultEdge;
|
||||||
|
|
||||||
|
import java.nio.file.Files;
|
||||||
|
import java.nio.file.Path;
|
||||||
|
import java.nio.file.Paths;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
public class TestGraphSourceForInvertedLinkData implements GraphSource {
|
||||||
|
private static Path domainDataPath = Paths.get("/home/vlofgren/Exports/Links/domains.export.tsv");
|
||||||
|
private static Path[] linksDataPaths = new Path[] {
|
||||||
|
Paths.get("/home/vlofgren/Exports/Links/domain-links-1.dat"),
|
||||||
|
Paths.get("/home/vlofgren/Exports/Links/domain-links-2.dat"),
|
||||||
|
Paths.get("/home/vlofgren/Exports/Links/domain-links-3.dat"),
|
||||||
|
Paths.get("/home/vlofgren/Exports/Links/domain-links-4.dat"),
|
||||||
|
Paths.get("/home/vlofgren/Exports/Links/domain-links-5.dat"),
|
||||||
|
Paths.get("/home/vlofgren/Exports/Links/domain-links-6.dat"),
|
||||||
|
Paths.get("/home/vlofgren/Exports/Links/domain-links-7.dat"),
|
||||||
|
Paths.get("/home/vlofgren/Exports/Links/domain-links-8.dat"),
|
||||||
|
Paths.get("/home/vlofgren/Exports/Links/domain-links-9.dat")
|
||||||
|
};
|
||||||
|
|
||||||
|
public List<Integer> domainIds(List<String> domainNameList) { return List.of(); }
|
||||||
|
|
||||||
|
static boolean isAvailable() {
|
||||||
|
return Files.exists(domainDataPath) && Files.exists(linksDataPaths[0]);
|
||||||
|
}
|
||||||
|
|
||||||
|
private Map<Integer, String> idToName = new HashMap<>();
|
||||||
|
|
||||||
|
public String getName(int id) {
|
||||||
|
return idToName.get(id);
|
||||||
|
}
|
||||||
|
|
||||||
|
@SneakyThrows
|
||||||
|
@Override
|
||||||
|
public Graph<Integer, ?> getGraph() {
|
||||||
|
Graph<Integer, ?> graph = new DefaultDirectedGraph<>(DefaultEdge.class);
|
||||||
|
idToName = new HashMap<>();
|
||||||
|
|
||||||
|
try (var stream = Files
|
||||||
|
.lines(domainDataPath)) {
|
||||||
|
|
||||||
|
stream.skip(1)
|
||||||
|
.mapMultiToInt((line, c) -> {
|
||||||
|
String[] parts = StringUtils.split(line, '\t');
|
||||||
|
int id = Integer.parseInt(parts[0]);
|
||||||
|
String name = parts[1];
|
||||||
|
int node_affinity = Integer.parseInt(parts[3]);
|
||||||
|
if (node_affinity > 0) {
|
||||||
|
c.accept(id);
|
||||||
|
idToName.put(id, parts[1]);
|
||||||
|
}
|
||||||
|
})
|
||||||
|
.forEach(graph::addVertex);
|
||||||
|
}
|
||||||
|
|
||||||
|
for (var path : linksDataPaths) {
|
||||||
|
try (var data = LongArrayFactory.mmapForReadingConfined(path)) {
|
||||||
|
data.forEach(0, data.size(), (pos, val) -> {
|
||||||
|
|
||||||
|
val = Long.reverseBytes(val); // data is in "java endian", LongArray is in "C endian"
|
||||||
|
|
||||||
|
int src = (int) (val >>> 32);
|
||||||
|
int dest = (int) (val & 0xFFFF_FFFFL);
|
||||||
|
|
||||||
|
if (graph.containsVertex(src) && graph.containsVertex(dest)) {
|
||||||
|
graph.addEdge(dest, src);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return graph;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
@ -0,0 +1,86 @@
|
|||||||
|
package nu.marginalia.ranking;
|
||||||
|
|
||||||
|
import lombok.SneakyThrows;
|
||||||
|
import nu.marginalia.array.LongArrayFactory;
|
||||||
|
import nu.marginalia.ranking.data.GraphSource;
|
||||||
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
import org.jgrapht.Graph;
|
||||||
|
import org.jgrapht.graph.DefaultDirectedGraph;
|
||||||
|
import org.jgrapht.graph.DefaultEdge;
|
||||||
|
|
||||||
|
import java.nio.file.Files;
|
||||||
|
import java.nio.file.Path;
|
||||||
|
import java.nio.file.Paths;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
public class TestGraphSourceForLinkData implements GraphSource {
|
||||||
|
private static Path domainDataPath = Paths.get("/home/vlofgren/Exports/Links/domains.export.tsv");
|
||||||
|
private static Path[] linksDataPaths = new Path[] {
|
||||||
|
Paths.get("/home/vlofgren/Exports/Links/domain-links-1.dat"),
|
||||||
|
Paths.get("/home/vlofgren/Exports/Links/domain-links-2.dat"),
|
||||||
|
Paths.get("/home/vlofgren/Exports/Links/domain-links-3.dat"),
|
||||||
|
Paths.get("/home/vlofgren/Exports/Links/domain-links-4.dat"),
|
||||||
|
Paths.get("/home/vlofgren/Exports/Links/domain-links-5.dat"),
|
||||||
|
Paths.get("/home/vlofgren/Exports/Links/domain-links-6.dat"),
|
||||||
|
Paths.get("/home/vlofgren/Exports/Links/domain-links-7.dat"),
|
||||||
|
Paths.get("/home/vlofgren/Exports/Links/domain-links-8.dat"),
|
||||||
|
Paths.get("/home/vlofgren/Exports/Links/domain-links-9.dat")
|
||||||
|
};
|
||||||
|
|
||||||
|
public List<Integer> domainIds(List<String> domainNameList) { return List.of(); }
|
||||||
|
|
||||||
|
static boolean isAvailable() {
|
||||||
|
return Files.exists(domainDataPath) && Files.exists(linksDataPaths[0]);
|
||||||
|
}
|
||||||
|
|
||||||
|
private Map<Integer, String> idToName = new HashMap<>();
|
||||||
|
|
||||||
|
public String getName(int id) {
|
||||||
|
return idToName.get(id);
|
||||||
|
}
|
||||||
|
|
||||||
|
@SneakyThrows
|
||||||
|
@Override
|
||||||
|
public Graph<Integer, ?> getGraph() {
|
||||||
|
Graph<Integer, ?> graph = new DefaultDirectedGraph<>(DefaultEdge.class);
|
||||||
|
idToName = new HashMap<>();
|
||||||
|
|
||||||
|
try (var stream = Files
|
||||||
|
.lines(domainDataPath)) {
|
||||||
|
|
||||||
|
stream.skip(1)
|
||||||
|
.mapMultiToInt((line, c) -> {
|
||||||
|
String[] parts = StringUtils.split(line, '\t');
|
||||||
|
int id = Integer.parseInt(parts[0]);
|
||||||
|
String name = parts[1];
|
||||||
|
int node_affinity = Integer.parseInt(parts[3]);
|
||||||
|
if (node_affinity > 0) {
|
||||||
|
c.accept(id);
|
||||||
|
idToName.put(id, parts[1]);
|
||||||
|
}
|
||||||
|
})
|
||||||
|
.forEach(graph::addVertex);
|
||||||
|
}
|
||||||
|
|
||||||
|
for (var path : linksDataPaths) {
|
||||||
|
try (var data = LongArrayFactory.mmapForReadingConfined(path)) {
|
||||||
|
data.forEach(0, data.size(), (pos, val) -> {
|
||||||
|
|
||||||
|
val = Long.reverseBytes(val); // data is in "java endian", LongArray is in "C endian"
|
||||||
|
|
||||||
|
int src = (int) (val >>> 32);
|
||||||
|
int dest = (int) (val & 0xFFFF_FFFFL);
|
||||||
|
|
||||||
|
if (graph.containsVertex(src) && graph.containsVertex(dest)) {
|
||||||
|
graph.addEdge(src, dest);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return graph;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
@ -0,0 +1,78 @@
|
|||||||
|
package nu.marginalia.ranking;
|
||||||
|
|
||||||
|
import lombok.SneakyThrows;
|
||||||
|
import nu.marginalia.array.LongArrayFactory;
|
||||||
|
import nu.marginalia.ranking.data.GraphSource;
|
||||||
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
import org.jgrapht.Graph;
|
||||||
|
import org.jgrapht.graph.DefaultDirectedGraph;
|
||||||
|
import org.jgrapht.graph.DefaultEdge;
|
||||||
|
import org.jgrapht.graph.DefaultUndirectedWeightedGraph;
|
||||||
|
import org.jgrapht.graph.DefaultWeightedEdge;
|
||||||
|
|
||||||
|
import java.nio.file.Files;
|
||||||
|
import java.nio.file.Path;
|
||||||
|
import java.nio.file.Paths;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
public class TestGraphSourceForSimilarityData implements GraphSource {
|
||||||
|
private static Path domainDataPath = Paths.get("/home/vlofgren/Exports/Links/domains.export.tsv");
|
||||||
|
private static Path similarityDataPath = Paths.get("/home/vlofgren/Exports/Links/neighbors.tsv");
|
||||||
|
|
||||||
|
public List<Integer> domainIds(List<String> domainNameList) { return List.of(); }
|
||||||
|
|
||||||
|
static boolean isAvailable() {
|
||||||
|
return Files.exists(domainDataPath) && Files.exists(similarityDataPath);
|
||||||
|
}
|
||||||
|
|
||||||
|
private Map<Integer, String> idToName = new HashMap<>();
|
||||||
|
|
||||||
|
public String getName(int id) {
|
||||||
|
return idToName.get(id);
|
||||||
|
}
|
||||||
|
|
||||||
|
@SneakyThrows
|
||||||
|
@Override
|
||||||
|
public Graph<Integer, ?> getGraph() {
|
||||||
|
Graph<Integer, ?> graph = new DefaultUndirectedWeightedGraph<>(DefaultWeightedEdge.class);
|
||||||
|
idToName = new HashMap<>();
|
||||||
|
|
||||||
|
try (var stream = Files
|
||||||
|
.lines(domainDataPath)) {
|
||||||
|
|
||||||
|
stream.skip(1)
|
||||||
|
.mapMultiToInt((line, c) -> {
|
||||||
|
String[] parts = StringUtils.split(line, '\t');
|
||||||
|
int id = Integer.parseInt(parts[0]);
|
||||||
|
String name = parts[1];
|
||||||
|
int node_affinity = Integer.parseInt(parts[3]);
|
||||||
|
if (node_affinity > 0) {
|
||||||
|
c.accept(id);
|
||||||
|
idToName.put(id, name);
|
||||||
|
}
|
||||||
|
})
|
||||||
|
.forEach(graph::addVertex);
|
||||||
|
}
|
||||||
|
|
||||||
|
try (var stream = Files
|
||||||
|
.lines(similarityDataPath)) {
|
||||||
|
|
||||||
|
stream.skip(1)
|
||||||
|
.forEach(line -> {
|
||||||
|
String[] parts = StringUtils.split(line, '\t');
|
||||||
|
int src = Integer.parseInt(parts[0]);
|
||||||
|
int dest = Integer.parseInt(parts[1]);
|
||||||
|
double weight = Double.parseDouble(parts[2]);
|
||||||
|
if (graph.containsVertex(src) && graph.containsVertex(dest)) {
|
||||||
|
graph.addEdge(src, dest);
|
||||||
|
graph.setEdgeWeight(src, dest, weight);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
return graph;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
@ -8,17 +8,15 @@ import nu.marginalia.db.DomainRankingSetsService;
|
|||||||
import nu.marginalia.db.DomainTypes;
|
import nu.marginalia.db.DomainTypes;
|
||||||
import nu.marginalia.index.IndexServicesFactory;
|
import nu.marginalia.index.IndexServicesFactory;
|
||||||
import nu.marginalia.index.searchset.SearchSet;
|
import nu.marginalia.index.searchset.SearchSet;
|
||||||
import nu.marginalia.ranking.RankingAlgorithm;
|
import nu.marginalia.ranking.*;
|
||||||
import nu.marginalia.ranking.ReversePageRank;
|
|
||||||
import nu.marginalia.ranking.StandardPageRank;
|
|
||||||
import nu.marginalia.ranking.accumulator.RankingResultHashMapAccumulator;
|
import nu.marginalia.ranking.accumulator.RankingResultHashMapAccumulator;
|
||||||
import nu.marginalia.ranking.accumulator.RankingResultHashSetAccumulator;
|
import nu.marginalia.ranking.accumulator.RankingResultHashSetAccumulator;
|
||||||
import nu.marginalia.ranking.data.RankingDomainFetcher;
|
|
||||||
import nu.marginalia.ranking.data.RankingDomainFetcherForSimilarityData;
|
|
||||||
import nu.marginalia.index.svc.searchset.RankingSearchSet;
|
import nu.marginalia.index.svc.searchset.RankingSearchSet;
|
||||||
import nu.marginalia.index.svc.searchset.SearchSetAny;
|
import nu.marginalia.index.svc.searchset.SearchSetAny;
|
||||||
import nu.marginalia.ranking.DomainRankings;
|
|
||||||
import nu.marginalia.index.db.DbUpdateRanks;
|
import nu.marginalia.index.db.DbUpdateRanks;
|
||||||
|
import nu.marginalia.ranking.data.GraphSource;
|
||||||
|
import nu.marginalia.ranking.data.LinkGraphSource;
|
||||||
|
import nu.marginalia.ranking.data.SimilarityGraphSource;
|
||||||
import nu.marginalia.service.control.ServiceEventLog;
|
import nu.marginalia.service.control.ServiceEventLog;
|
||||||
import nu.marginalia.service.control.ServiceHeartbeat;
|
import nu.marginalia.service.control.ServiceHeartbeat;
|
||||||
import nu.marginalia.service.module.ServiceConfiguration;
|
import nu.marginalia.service.module.ServiceConfiguration;
|
||||||
@ -27,6 +25,7 @@ import org.slf4j.LoggerFactory;
|
|||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.sql.SQLException;
|
import java.sql.SQLException;
|
||||||
|
import java.util.List;
|
||||||
import java.util.Objects;
|
import java.util.Objects;
|
||||||
import java.util.concurrent.ConcurrentHashMap;
|
import java.util.concurrent.ConcurrentHashMap;
|
||||||
|
|
||||||
@ -34,13 +33,12 @@ import java.util.concurrent.ConcurrentHashMap;
|
|||||||
public class IndexSearchSetsService {
|
public class IndexSearchSetsService {
|
||||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||||
private final DomainTypes domainTypes;
|
private final DomainTypes domainTypes;
|
||||||
private final ServiceHeartbeat heartbeat;
|
|
||||||
private final IndexServicesFactory indexServicesFactory;
|
private final IndexServicesFactory indexServicesFactory;
|
||||||
private final ServiceEventLog eventLog;
|
private final ServiceEventLog eventLog;
|
||||||
private final DomainRankingSetsService domainRankingSetsService;
|
private final DomainRankingSetsService domainRankingSetsService;
|
||||||
private final DbUpdateRanks dbUpdateRanks;
|
private final DbUpdateRanks dbUpdateRanks;
|
||||||
private final RankingDomainFetcher similarityDomains;
|
private final GraphSource similarityDomains;
|
||||||
private final RankingDomainFetcher linksDomains;
|
private final GraphSource linksDomains;
|
||||||
|
|
||||||
private final ConcurrentHashMap<String, SearchSet> rankingSets = new ConcurrentHashMap<>();
|
private final ConcurrentHashMap<String, SearchSet> rankingSets = new ConcurrentHashMap<>();
|
||||||
// Below are binary indices that are used to constrain a search
|
// Below are binary indices that are used to constrain a search
|
||||||
@ -55,23 +53,21 @@ public class IndexSearchSetsService {
|
|||||||
@Inject
|
@Inject
|
||||||
public IndexSearchSetsService(DomainTypes domainTypes,
|
public IndexSearchSetsService(DomainTypes domainTypes,
|
||||||
ServiceConfiguration serviceConfiguration,
|
ServiceConfiguration serviceConfiguration,
|
||||||
ServiceHeartbeat heartbeat,
|
LinkGraphSource rankingDomains,
|
||||||
RankingDomainFetcher rankingDomains,
|
SimilarityGraphSource similarityDomains,
|
||||||
RankingDomainFetcherForSimilarityData similarityDomains,
|
|
||||||
IndexServicesFactory indexServicesFactory,
|
IndexServicesFactory indexServicesFactory,
|
||||||
ServiceEventLog eventLog,
|
ServiceEventLog eventLog,
|
||||||
DomainRankingSetsService domainRankingSetsService,
|
DomainRankingSetsService domainRankingSetsService,
|
||||||
DbUpdateRanks dbUpdateRanks) throws IOException {
|
DbUpdateRanks dbUpdateRanks) throws IOException {
|
||||||
this.nodeId = serviceConfiguration.node();
|
this.nodeId = serviceConfiguration.node();
|
||||||
this.domainTypes = domainTypes;
|
this.domainTypes = domainTypes;
|
||||||
this.heartbeat = heartbeat;
|
|
||||||
this.indexServicesFactory = indexServicesFactory;
|
this.indexServicesFactory = indexServicesFactory;
|
||||||
this.eventLog = eventLog;
|
this.eventLog = eventLog;
|
||||||
this.domainRankingSetsService = domainRankingSetsService;
|
this.domainRankingSetsService = domainRankingSetsService;
|
||||||
|
|
||||||
this.dbUpdateRanks = dbUpdateRanks;
|
this.dbUpdateRanks = dbUpdateRanks;
|
||||||
|
|
||||||
if (similarityDomains.hasData()) {
|
if (similarityDomains.isAvailable()) {
|
||||||
this.similarityDomains = similarityDomains;
|
this.similarityDomains = similarityDomains;
|
||||||
this.linksDomains = rankingDomains;
|
this.linksDomains = rankingDomains;
|
||||||
}
|
}
|
||||||
@ -145,15 +141,14 @@ public class IndexSearchSetsService {
|
|||||||
private void recalculateNornal(DomainRankingSetsService.DomainRankingSet rankingSet) {
|
private void recalculateNornal(DomainRankingSetsService.DomainRankingSet rankingSet) {
|
||||||
String[] domains = rankingSet.domains();
|
String[] domains = rankingSet.domains();
|
||||||
|
|
||||||
RankingAlgorithm rankingAlgorithm = switch (rankingSet.algorithm()) {
|
GraphSource graphSource = switch (rankingSet.algorithm()) {
|
||||||
case LINKS_PAGERANK -> new StandardPageRank(linksDomains, domains);
|
case LINKS_PAGERANK, LINKS_CHEIRANK -> linksDomains;
|
||||||
case LINKS_CHEIRANK -> new ReversePageRank(linksDomains, domains);
|
case ADJACENCY_PAGERANK, ADJACENCY_CHEIRANK -> similarityDomains;
|
||||||
case ADJACENCY_PAGERANK -> new StandardPageRank(similarityDomains, domains);
|
|
||||||
case ADJACENCY_CHEIRANK -> new ReversePageRank(similarityDomains, domains);
|
|
||||||
default -> throw new IllegalStateException("Unexpected value: " + rankingSet.algorithm());
|
default -> throw new IllegalStateException("Unexpected value: " + rankingSet.algorithm());
|
||||||
};
|
};
|
||||||
|
|
||||||
var data = rankingAlgorithm.pageRankWithPeripheralNodes(rankingSet.depth(), RankingResultHashSetAccumulator::new);
|
var data = new PageRankDomainRanker(linksDomains, linksDomains.domainIds(List.of(domains)))
|
||||||
|
.calculate(rankingSet.depth(), RankingResultHashSetAccumulator::new);
|
||||||
|
|
||||||
var set = new RankingSearchSet(rankingSet.name(), rankingSet.fileName(indexServicesFactory.getSearchSetsBase()), data);
|
var set = new RankingSearchSet(rankingSet.name(), rankingSet.fileName(indexServicesFactory.getSearchSetsBase()), data);
|
||||||
rankingSets.put(rankingSet.name(), set);
|
rankingSets.put(rankingSet.name(), set);
|
||||||
@ -186,8 +181,8 @@ public class IndexSearchSetsService {
|
|||||||
|
|
||||||
private void updateDomainRankings(DomainRankingSetsService.DomainRankingSet rankingSet) {
|
private void updateDomainRankings(DomainRankingSetsService.DomainRankingSet rankingSet) {
|
||||||
|
|
||||||
var spr = new StandardPageRank(similarityDomains, rankingSet.domains());
|
var ranks = new PageRankDomainRanker(similarityDomains, similarityDomains.domainIds(List.of(rankingSet.domains())))
|
||||||
var ranks = spr.pageRankWithPeripheralNodes(rankingSet.depth(), () -> new RankingResultHashMapAccumulator(rankingSet.depth()));
|
.calculate(rankingSet.depth(), () -> new RankingResultHashMapAccumulator(rankingSet.depth()));
|
||||||
|
|
||||||
synchronized (this) {
|
synchronized (this) {
|
||||||
domainRankings = new DomainRankings(ranks);
|
domainRankings = new DomainRankings(ranks);
|
||||||
|
Loading…
Reference in New Issue
Block a user