Merge pull request #80 from MarginaliaSearch/ranking-algorithms
Clean up domain ranking code
This commit is contained in:
commit
d05c916491
@ -25,7 +25,7 @@ public class DomainRankingSetsService {
|
|||||||
public Optional<DomainRankingSet> get(String name) throws SQLException {
|
public Optional<DomainRankingSet> get(String name) throws SQLException {
|
||||||
try (var conn = dataSource.getConnection();
|
try (var conn = dataSource.getConnection();
|
||||||
var stmt = conn.prepareStatement("""
|
var stmt = conn.prepareStatement("""
|
||||||
SELECT NAME, DESCRIPTION, ALGORITHM, DEPTH, DEFINITION
|
SELECT NAME, DESCRIPTION, DEPTH, DEFINITION
|
||||||
FROM CONF_DOMAIN_RANKING_SET
|
FROM CONF_DOMAIN_RANKING_SET
|
||||||
WHERE NAME = ?
|
WHERE NAME = ?
|
||||||
""")) {
|
""")) {
|
||||||
@ -39,7 +39,6 @@ public class DomainRankingSetsService {
|
|||||||
return Optional.of(new DomainRankingSet(
|
return Optional.of(new DomainRankingSet(
|
||||||
rs.getString("NAME"),
|
rs.getString("NAME"),
|
||||||
rs.getString("DESCRIPTION"),
|
rs.getString("DESCRIPTION"),
|
||||||
DomainSetAlgorithm.valueOf(rs.getString("ALGORITHM")),
|
|
||||||
rs.getInt("DEPTH"),
|
rs.getInt("DEPTH"),
|
||||||
rs.getString("DEFINITION")
|
rs.getString("DEFINITION")
|
||||||
));
|
));
|
||||||
@ -53,15 +52,14 @@ public class DomainRankingSetsService {
|
|||||||
public void upsert(DomainRankingSet domainRankingSet) {
|
public void upsert(DomainRankingSet domainRankingSet) {
|
||||||
try (var conn = dataSource.getConnection();
|
try (var conn = dataSource.getConnection();
|
||||||
var stmt = conn.prepareStatement("""
|
var stmt = conn.prepareStatement("""
|
||||||
REPLACE INTO CONF_DOMAIN_RANKING_SET(NAME, DESCRIPTION, ALGORITHM, DEPTH, DEFINITION)
|
REPLACE INTO CONF_DOMAIN_RANKING_SET(NAME, DESCRIPTION, DEPTH, DEFINITION)
|
||||||
VALUES (?, ?, ?, ?, ?)
|
VALUES (?, ?, ?, ?)
|
||||||
"""))
|
"""))
|
||||||
{
|
{
|
||||||
stmt.setString(1, domainRankingSet.name());
|
stmt.setString(1, domainRankingSet.name());
|
||||||
stmt.setString(2, domainRankingSet.description());
|
stmt.setString(2, domainRankingSet.description());
|
||||||
stmt.setString(3, domainRankingSet.algorithm().name());
|
stmt.setInt(3, domainRankingSet.depth());
|
||||||
stmt.setInt(4, domainRankingSet.depth());
|
stmt.setString(4, domainRankingSet.definition());
|
||||||
stmt.setString(5, domainRankingSet.definition());
|
|
||||||
stmt.executeUpdate();
|
stmt.executeUpdate();
|
||||||
|
|
||||||
if (!conn.getAutoCommit())
|
if (!conn.getAutoCommit())
|
||||||
@ -94,7 +92,7 @@ public class DomainRankingSetsService {
|
|||||||
|
|
||||||
try (var conn = dataSource.getConnection();
|
try (var conn = dataSource.getConnection();
|
||||||
var stmt = conn.prepareStatement("""
|
var stmt = conn.prepareStatement("""
|
||||||
SELECT NAME, DESCRIPTION, ALGORITHM, DEPTH, DEFINITION
|
SELECT NAME, DESCRIPTION, DEPTH, DEFINITION
|
||||||
FROM CONF_DOMAIN_RANKING_SET
|
FROM CONF_DOMAIN_RANKING_SET
|
||||||
""")) {
|
""")) {
|
||||||
var rs = stmt.executeQuery();
|
var rs = stmt.executeQuery();
|
||||||
@ -105,7 +103,6 @@ public class DomainRankingSetsService {
|
|||||||
new DomainRankingSet(
|
new DomainRankingSet(
|
||||||
rs.getString("NAME"),
|
rs.getString("NAME"),
|
||||||
rs.getString("DESCRIPTION"),
|
rs.getString("DESCRIPTION"),
|
||||||
DomainSetAlgorithm.valueOf(rs.getString("ALGORITHM")),
|
|
||||||
rs.getInt("DEPTH"),
|
rs.getInt("DEPTH"),
|
||||||
rs.getString("DEFINITION"))
|
rs.getString("DEFINITION"))
|
||||||
);
|
);
|
||||||
@ -118,31 +115,17 @@ public class DomainRankingSetsService {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public enum DomainSetAlgorithm {
|
|
||||||
/** Use link graph, do a pagerank */
|
|
||||||
LINKS_PAGERANK,
|
|
||||||
/** Use link graph, do a cheirank */
|
|
||||||
LINKS_CHEIRANK,
|
|
||||||
/** Use adjacency graph, do a pagerank */
|
|
||||||
ADJACENCY_PAGERANK,
|
|
||||||
/** Use adjacency graph, do a cheirank */
|
|
||||||
ADJACENCY_CHEIRANK,
|
|
||||||
/** For reserved names. Use special algorithm, function of name */
|
|
||||||
SPECIAL
|
|
||||||
};
|
|
||||||
|
|
||||||
/** Defines a domain ranking set, parameters for the ranking algorithms.
|
/** Defines a domain ranking set, parameters for the ranking algorithms.
|
||||||
*
|
*
|
||||||
* @param name Key and name of the set
|
* @param name Key and name of the set
|
||||||
* @param description Human-readable description
|
* @param description Human-readable description
|
||||||
* @param algorithm Algorithm to use
|
|
||||||
* @param depth Depth of the algorithm
|
* @param depth Depth of the algorithm
|
||||||
* @param definition Definition of the set, typically a list of domains or globs for domain-names
|
* @param definition Definition of the set, typically a list of domains or globs for domain-names
|
||||||
* */
|
* */
|
||||||
@With
|
@With
|
||||||
public record DomainRankingSet(String name,
|
public record DomainRankingSet(String name,
|
||||||
String description,
|
String description,
|
||||||
DomainSetAlgorithm algorithm,
|
|
||||||
int depth,
|
int depth,
|
||||||
String definition)
|
String definition)
|
||||||
{
|
{
|
||||||
@ -159,7 +142,7 @@ public class DomainRankingSetsService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
public boolean isSpecial() {
|
public boolean isSpecial() {
|
||||||
return algorithm() == DomainSetAlgorithm.SPECIAL;
|
return name().equals("BLOGS") || name().equals("NONE") || name().equals("RANK");
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -0,0 +1 @@
|
|||||||
|
ALTER TABLE CONF_DOMAIN_RANKING_SET DROP COLUMN ALGORITHM;
|
@ -56,14 +56,12 @@ class DomainRankingSetsServiceTest {
|
|||||||
var newValue = new DomainRankingSetsService.DomainRankingSet(
|
var newValue = new DomainRankingSetsService.DomainRankingSet(
|
||||||
"test",
|
"test",
|
||||||
"Test domain set",
|
"Test domain set",
|
||||||
DomainRankingSetsService.DomainSetAlgorithm.ADJACENCY_CHEIRANK,
|
|
||||||
10,
|
10,
|
||||||
"test\\.nu"
|
"test\\.nu"
|
||||||
);
|
);
|
||||||
var newValue2 = new DomainRankingSetsService.DomainRankingSet(
|
var newValue2 = new DomainRankingSetsService.DomainRankingSet(
|
||||||
"test2",
|
"test2",
|
||||||
"Test domain set 2",
|
"Test domain set 2",
|
||||||
DomainRankingSetsService.DomainSetAlgorithm.LINKS_PAGERANK,
|
|
||||||
20,
|
20,
|
||||||
"test\\.nu 2"
|
"test\\.nu 2"
|
||||||
);
|
);
|
||||||
|
@ -20,6 +20,8 @@ dependencies {
|
|||||||
implementation project(':code:common:service-client')
|
implementation project(':code:common:service-client')
|
||||||
implementation project(':code:api:query-api')
|
implementation project(':code:api:query-api')
|
||||||
|
|
||||||
|
implementation 'org.jgrapht:jgrapht-core:1.5.2'
|
||||||
|
|
||||||
implementation libs.bundles.slf4j
|
implementation libs.bundles.slf4j
|
||||||
implementation libs.bundles.mariadb
|
implementation libs.bundles.mariadb
|
||||||
implementation libs.guice
|
implementation libs.guice
|
||||||
@ -27,8 +29,17 @@ dependencies {
|
|||||||
implementation libs.roaringbitmap
|
implementation libs.roaringbitmap
|
||||||
implementation libs.trove
|
implementation libs.trove
|
||||||
implementation libs.fastutil
|
implementation libs.fastutil
|
||||||
|
implementation libs.hll
|
||||||
|
|
||||||
|
testImplementation project(':code:libraries:array')
|
||||||
|
|
||||||
|
testImplementation libs.commons.lang3
|
||||||
testImplementation libs.bundles.slf4j.test
|
testImplementation libs.bundles.slf4j.test
|
||||||
testImplementation libs.bundles.junit
|
testImplementation libs.bundles.junit
|
||||||
testImplementation libs.mockito
|
testImplementation libs.mockito
|
||||||
}
|
|
||||||
|
testImplementation platform('org.testcontainers:testcontainers-bom:1.17.4')
|
||||||
|
testImplementation 'org.testcontainers:mariadb:1.17.4'
|
||||||
|
testImplementation 'org.testcontainers:junit-jupiter:1.17.4'
|
||||||
|
testImplementation project(':code:libraries:test-helpers')
|
||||||
|
}
|
@ -1,19 +1,34 @@
|
|||||||
# Domain Ranking
|
# Domain Ranking
|
||||||
|
|
||||||
Contains domain ranking algorithms.
|
Contains domain ranking algorithms. The domain ranking algorithms are based on
|
||||||
|
the JGraphT library.
|
||||||
|
|
||||||
|
Two principal algorithms are available, the standard PageRank algorithm,
|
||||||
|
and Personalized PageRank; each is available for two graphs, the link graph
|
||||||
|
and a similarity graph where each edge corresponds to the similarity between
|
||||||
|
the sets of incident links to two domains, their cosine similarity acting as
|
||||||
|
the weight of the links.
|
||||||
|
|
||||||
|
With the standard PageRank algorithm, the similarity graph does not produce
|
||||||
|
anything useful, but something magical happens when you apply Personalized PageRank
|
||||||
|
to this graph. It turns into a very good "vibe"-sensitive ranking algorithm.
|
||||||
|
|
||||||
|
It's unclear whether this is a well-known result, but it's a very interesting one
|
||||||
|
for creating a ranking algorithm that is focused on a particular segment of the web.
|
||||||
|
|
||||||
## Central Classes
|
## Central Classes
|
||||||
|
|
||||||
### Algorithms
|
* [PageRankDomainRanker](src/main/java/nu/marginalia/ranking/PageRankDomainRanker.java) - Ranks domains using the
|
||||||
* [RankingAlgorithm](src/main/java/nu/marginalia/ranking/RankingAlgorithm.java)
|
PageRank or Personalized PageRank algorithm depending on whether a list of influence domains is provided.
|
||||||
* [StandardPageRank](src/main/java/nu/marginalia/ranking/StandardPageRank.java)
|
|
||||||
* [ReversePageRank](src/main/java/nu/marginalia/ranking/ReversePageRank.java) "CheiRank"
|
|
||||||
|
|
||||||
### Data sources
|
### Data sources
|
||||||
|
|
||||||
* [RankingDomainFetcher](src/main/java/nu/marginalia/ranking/data/RankingDomainFetcher.java) fetches link data.
|
* [LinkGraphSource](src/main/java/nu/marginalia/ranking/data/LinkGraphSource.java) - fetches the link graph
|
||||||
* [RankingDomainFetcherForSimilarityData](src/main/java/nu/marginalia/ranking/data/RankingDomainFetcherForSimilarityData.java) fetches website similarity data.
|
* [InvertedLinkGraphSource](src/main/java/nu/marginalia/ranking/data/InvertedLinkGraphSource.java) - fetches the inverted link graph
|
||||||
|
* [SimilarityGraphSource](src/main/java/nu/marginalia/ranking/data/SimilarityGraphSource.java) - fetches the similarity graph from the database
|
||||||
|
|
||||||
|
Note that the similarity graph needs to be precomputed and stored in the database for
|
||||||
|
the similarity graph source to be available.
|
||||||
|
|
||||||
## See Also
|
## See Also
|
||||||
|
|
||||||
|
@ -0,0 +1,60 @@
|
|||||||
|
package nu.marginalia.ranking;
|
||||||
|
|
||||||
|
import gnu.trove.list.TIntList;
|
||||||
|
import gnu.trove.list.array.TIntArrayList;
|
||||||
|
import nu.marginalia.ranking.accumulator.RankingResultAccumulator;
|
||||||
|
import nu.marginalia.ranking.data.GraphSource;
|
||||||
|
import nu.marginalia.ranking.jgrapht.PersonalizedPageRank;
|
||||||
|
import org.jgrapht.Graph;
|
||||||
|
import org.jgrapht.alg.interfaces.VertexScoringAlgorithm;
|
||||||
|
import org.jgrapht.alg.scoring.PageRank;
|
||||||
|
|
||||||
|
import java.util.Comparator;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.function.Supplier;
|
||||||
|
|
||||||
|
public class PageRankDomainRanker implements RankingAlgorithm {
|
||||||
|
private final List<Integer> influenceSet;
|
||||||
|
private final Graph<Integer, ?> graph;
|
||||||
|
|
||||||
|
public PageRankDomainRanker(GraphSource source,
|
||||||
|
List<Integer> influenceSet)
|
||||||
|
{
|
||||||
|
this.influenceSet = influenceSet;
|
||||||
|
this.graph = source.getGraph();
|
||||||
|
}
|
||||||
|
|
||||||
|
public static PageRankDomainRanker forDomainNames(GraphSource source,
|
||||||
|
List<String> influenceSet)
|
||||||
|
{
|
||||||
|
return new PageRankDomainRanker(source, source.domainIds(influenceSet));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public <T> T calculate(int resultCount, Supplier<RankingResultAccumulator<T>> accumulatorP) {
|
||||||
|
VertexScoringAlgorithm<Integer, Double> pageRank;
|
||||||
|
|
||||||
|
if (influenceSet != null && !influenceSet.isEmpty()) {
|
||||||
|
pageRank = new PersonalizedPageRank<>(graph, influenceSet);
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
pageRank = new PageRank<>(graph);
|
||||||
|
}
|
||||||
|
|
||||||
|
TIntList results = new TIntArrayList(resultCount);
|
||||||
|
pageRank.getScores().entrySet()
|
||||||
|
.stream()
|
||||||
|
.sorted(Comparator.comparing((Map.Entry<Integer, Double> e) -> -e.getValue()))
|
||||||
|
.limit(resultCount)
|
||||||
|
.map(Map.Entry::getKey)
|
||||||
|
.forEach(results::add);
|
||||||
|
|
||||||
|
var accumulator = accumulatorP.get();
|
||||||
|
for (int i = 0; i < results.size(); i++) {
|
||||||
|
accumulator.add(results.get(i), i);
|
||||||
|
}
|
||||||
|
return accumulator.get();
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
@ -1,281 +1,15 @@
|
|||||||
package nu.marginalia.ranking;
|
package nu.marginalia.ranking;
|
||||||
|
|
||||||
import gnu.trove.list.array.TIntArrayList;
|
|
||||||
import gnu.trove.map.hash.TIntIntHashMap;
|
|
||||||
import gnu.trove.map.hash.TIntObjectHashMap;
|
|
||||||
import it.unimi.dsi.fastutil.ints.IntArrays;
|
|
||||||
import nu.marginalia.ranking.accumulator.RankingResultAccumulator;
|
import nu.marginalia.ranking.accumulator.RankingResultAccumulator;
|
||||||
import nu.marginalia.ranking.data.RankingDomainFetcher;
|
|
||||||
import nu.marginalia.ranking.data.RankingDomainData;
|
|
||||||
import org.slf4j.Logger;
|
|
||||||
import org.slf4j.LoggerFactory;
|
|
||||||
|
|
||||||
import java.util.Arrays;
|
|
||||||
import java.util.HashSet;
|
|
||||||
import java.util.Set;
|
|
||||||
import java.util.function.Supplier;
|
import java.util.function.Supplier;
|
||||||
|
|
||||||
import static java.lang.Math.min;
|
public interface RankingAlgorithm {
|
||||||
|
|
||||||
public abstract class RankingAlgorithm {
|
/** Calculate domain rankings.
|
||||||
protected final TIntObjectHashMap<RankingDomainData> domainsById = new TIntObjectHashMap<>();
|
*
|
||||||
protected final TIntIntHashMap domainIndexToId = new TIntIntHashMap();
|
* @param resultCount the number of best-ranked results to pass to the accumulator
|
||||||
protected final TIntIntHashMap domainIdToIndex = new TIntIntHashMap();
|
* @param accumulatorP the accumulator to use to store the results
|
||||||
|
*/
|
||||||
protected TIntArrayList[] linkDataSrc2Dest;
|
<T> T calculate(int resultCount, Supplier<RankingResultAccumulator<T>> accumulatorP);
|
||||||
protected TIntArrayList[] linkDataDest2Src;
|
|
||||||
|
|
||||||
public final Set<String> originDomains = new HashSet<>();
|
|
||||||
public final Set<Integer> originDomainIds = new HashSet<>();
|
|
||||||
|
|
||||||
private int maxKnownUrls = Integer.MAX_VALUE;
|
|
||||||
|
|
||||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
|
||||||
|
|
||||||
private final RankingDomainFetcher domains;
|
|
||||||
|
|
||||||
public RankingAlgorithm(RankingDomainFetcher domains, String... origins) {
|
|
||||||
this.domains = domains;
|
|
||||||
|
|
||||||
originDomains.addAll(Arrays.asList(origins));
|
|
||||||
|
|
||||||
domains.getDomains(domainData -> {
|
|
||||||
int id = domainData.id;
|
|
||||||
|
|
||||||
domainsById.put(id, domainData);
|
|
||||||
|
|
||||||
domainIndexToId.put(domainIndexToId.size(), id);
|
|
||||||
domainIdToIndex.put(id, domainIdToIndex.size());
|
|
||||||
});
|
|
||||||
|
|
||||||
linkDataSrc2Dest = new TIntArrayList[domainIndexToId.size()];
|
|
||||||
linkDataDest2Src = new TIntArrayList[domainIndexToId.size()];
|
|
||||||
|
|
||||||
domains.eachDomainLink((src, dst) -> {
|
|
||||||
if (src == dst) return;
|
|
||||||
|
|
||||||
if (domainsById.contains(src) && domainsById.contains(dst)) {
|
|
||||||
|
|
||||||
int srcIdx = domainIdToIndex.get(src);
|
|
||||||
int dstIdx = domainIdToIndex.get(domainsById.get(dst).resolveAlias());
|
|
||||||
|
|
||||||
if (linkDataSrc2Dest[srcIdx] == null) {
|
|
||||||
linkDataSrc2Dest[srcIdx] = new TIntArrayList();
|
|
||||||
}
|
|
||||||
linkDataSrc2Dest[srcIdx].add(dstIdx);
|
|
||||||
|
|
||||||
if (linkDataDest2Src[dstIdx] == null) {
|
|
||||||
linkDataDest2Src[dstIdx] = new TIntArrayList();
|
|
||||||
}
|
|
||||||
linkDataDest2Src[dstIdx].add(srcIdx);
|
|
||||||
}
|
|
||||||
});
|
|
||||||
|
|
||||||
for (var namePattern : this.originDomains) {
|
|
||||||
domains.domainsByPattern(namePattern, i -> {
|
|
||||||
int ival = domainIdToIndex.get(i);
|
|
||||||
if (ival != domainIdToIndex.getNoEntryValue() || domainIndexToId.get(0) == i) {
|
|
||||||
originDomainIds.add(ival);
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
logger.debug("No value for {}", i);
|
|
||||||
}
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
logger.info("Origin Domains: {}", originDomainIds.size());
|
|
||||||
}
|
|
||||||
|
|
||||||
public RankingDomainData getDomainData(int id) {
|
|
||||||
return domainsById.get(id);
|
|
||||||
}
|
|
||||||
|
|
||||||
public void addPeripheralNodes() {
|
|
||||||
|
|
||||||
int newNodesIdxCutoff = domainIdToIndex.size();
|
|
||||||
|
|
||||||
logger.info("Inserting peripheral nodes");
|
|
||||||
|
|
||||||
domains.getPeripheralDomains(domainData -> {
|
|
||||||
int id = domainData.id;
|
|
||||||
|
|
||||||
if (domainsById.put(id, domainData) == null) { // true if id was not already present
|
|
||||||
domainIndexToId.put(domainIndexToId.size(), id);
|
|
||||||
domainIdToIndex.put(id, domainIdToIndex.size());
|
|
||||||
}
|
|
||||||
});
|
|
||||||
|
|
||||||
linkDataSrc2Dest = Arrays.copyOf(linkDataSrc2Dest, domainIndexToId.size());
|
|
||||||
linkDataDest2Src = Arrays.copyOf(linkDataDest2Src, domainIndexToId.size());
|
|
||||||
|
|
||||||
domains.eachDomainLink((src, dst) -> {
|
|
||||||
if (src == dst) return;
|
|
||||||
|
|
||||||
if (domainsById.contains(src) && domainsById.contains(dst)) {
|
|
||||||
int srcIdx = domainIdToIndex.get(src);
|
|
||||||
int dstIdx = domainIdToIndex.get(domainsById.get(dst).resolveAlias());
|
|
||||||
|
|
||||||
// This looks like a bug, but it improves the results
|
|
||||||
if (srcIdx < newNodesIdxCutoff || dstIdx < newNodesIdxCutoff)
|
|
||||||
return;
|
|
||||||
|
|
||||||
if (linkDataSrc2Dest[srcIdx] == null) {
|
|
||||||
linkDataSrc2Dest[srcIdx] = new TIntArrayList();
|
|
||||||
}
|
|
||||||
linkDataSrc2Dest[srcIdx].add(dstIdx);
|
|
||||||
|
|
||||||
if (linkDataDest2Src[dstIdx] == null) {
|
|
||||||
linkDataDest2Src[dstIdx] = new TIntArrayList();
|
|
||||||
}
|
|
||||||
linkDataDest2Src[dstIdx].add(srcIdx);
|
|
||||||
}
|
|
||||||
});
|
|
||||||
|
|
||||||
logger.info("Peripheral nodes inserted {} -> {}", newNodesIdxCutoff, domainIdToIndex.size());
|
|
||||||
}
|
|
||||||
|
|
||||||
public int size() {
|
|
||||||
return domainsById.size();
|
|
||||||
}
|
|
||||||
|
|
||||||
public <T> T pageRank(int resultCount, Supplier<RankingResultAccumulator<T>> accumulatorP) {
|
|
||||||
RankVector rank = new RankVector(1.d / domainsById.size());
|
|
||||||
|
|
||||||
int iter_max = 100;
|
|
||||||
for (int i = 0; i < iter_max; i++) {
|
|
||||||
RankVector newRank = createNewRankVector(rank);
|
|
||||||
|
|
||||||
double oldNorm = rank.norm();
|
|
||||||
double newNorm = newRank.norm();
|
|
||||||
double dNorm = oldNorm - newNorm;
|
|
||||||
|
|
||||||
if (i < iter_max-1) {
|
|
||||||
adjustRankVector(newRank, dNorm, oldNorm);
|
|
||||||
}
|
|
||||||
|
|
||||||
rank = newRank;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
return rank.getRanking(resultCount, accumulatorP).get();
|
|
||||||
}
|
|
||||||
|
|
||||||
public <T> T pageRankWithPeripheralNodes(int resultCount, Supplier<RankingResultAccumulator<T>> accumulatorP) {
|
|
||||||
RankVector rank = new RankVector(1.d / domainsById.size());
|
|
||||||
|
|
||||||
int iter_max = 100;
|
|
||||||
|
|
||||||
for (int i = 0; i < iter_max; i++) {
|
|
||||||
if (i == iter_max-1) {
|
|
||||||
addPeripheralNodes();
|
|
||||||
}
|
|
||||||
RankVector newRank = createNewRankVector(rank);
|
|
||||||
|
|
||||||
double oldNorm = rank.norm();
|
|
||||||
double newNorm = newRank.norm();
|
|
||||||
double dNorm = oldNorm - newNorm;
|
|
||||||
|
|
||||||
if (i < iter_max-1) {
|
|
||||||
adjustRankVector(newRank, dNorm, oldNorm);
|
|
||||||
}
|
|
||||||
|
|
||||||
rank = newRank;
|
|
||||||
}
|
|
||||||
|
|
||||||
logger.info("PRWPN iteration done");
|
|
||||||
|
|
||||||
return rank.getRanking(resultCount, accumulatorP).get();
|
|
||||||
}
|
|
||||||
|
|
||||||
abstract void adjustRankVector(RankVector vector, double dNorm, double oldNorm);
|
|
||||||
|
|
||||||
abstract RankVector createNewRankVector(RankVector rank);
|
|
||||||
|
|
||||||
public boolean includeInRanking(RankingDomainData data) {
|
|
||||||
if (data.isAlias())
|
|
||||||
return false;
|
|
||||||
if (data.isSpecial())
|
|
||||||
return false;
|
|
||||||
if (data.isSocialMedia())
|
|
||||||
return false;
|
|
||||||
if (data.knownUrls > maxKnownUrls)
|
|
||||||
return false;
|
|
||||||
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void setMaxKnownUrls(int maxKnownUrls) {
|
|
||||||
this.maxKnownUrls = maxKnownUrls;
|
|
||||||
}
|
|
||||||
public class RankVector {
|
|
||||||
private final double[] rank;
|
|
||||||
|
|
||||||
public RankVector(double defaultValue) {
|
|
||||||
rank = new double[domainIndexToId.size()];
|
|
||||||
if (defaultValue != 0.) {
|
|
||||||
Arrays.fill(rank, defaultValue);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
public void set(int id, double value) {
|
|
||||||
rank[id] = value;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void increment(int id, double value) {
|
|
||||||
rank[id] += value;
|
|
||||||
}
|
|
||||||
|
|
||||||
public double get(int id) {
|
|
||||||
if (id >= rank.length) return 0.;
|
|
||||||
|
|
||||||
return rank[id];
|
|
||||||
}
|
|
||||||
|
|
||||||
public double norm() {
|
|
||||||
double v = 0.;
|
|
||||||
for (double value : rank) {
|
|
||||||
v += Math.abs(value);
|
|
||||||
}
|
|
||||||
return v;
|
|
||||||
}
|
|
||||||
|
|
||||||
public double norm(RankVector other) {
|
|
||||||
double v = 0.;
|
|
||||||
for (int i = 0; i < rank.length; i++) {
|
|
||||||
v += Math.abs(rank[i] - other.get(i));
|
|
||||||
}
|
|
||||||
return v;
|
|
||||||
}
|
|
||||||
|
|
||||||
public <T> RankingResultAccumulator<T> getRanking(int numResults, Supplier<RankingResultAccumulator<T>> accumulatorP) {
|
|
||||||
|
|
||||||
if (numResults <= 0) {
|
|
||||||
numResults = domainIdToIndex.size();
|
|
||||||
}
|
|
||||||
numResults = min(numResults, min(domainIdToIndex.size(), rank.length));
|
|
||||||
|
|
||||||
int[] nodes = sortOrder(rank);
|
|
||||||
var accumulator = accumulatorP.get();
|
|
||||||
|
|
||||||
for (int i = 0; i < numResults; i++) {
|
|
||||||
int id = domainIndexToId.get(nodes[i]);
|
|
||||||
|
|
||||||
if (includeInRanking(domainsById.get(id)))
|
|
||||||
accumulator.add(id, i);
|
|
||||||
}
|
|
||||||
|
|
||||||
return accumulator;
|
|
||||||
}
|
|
||||||
|
|
||||||
private static int[] sortOrder(double[] values) {
|
|
||||||
|
|
||||||
int[] ret = new int[values.length];
|
|
||||||
Arrays.setAll(ret, i->i);
|
|
||||||
IntArrays.quickSort(ret, (i,j) -> (int) Math.signum(values[j] - values[i]));
|
|
||||||
|
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -1,42 +0,0 @@
|
|||||||
package nu.marginalia.ranking;
|
|
||||||
|
|
||||||
|
|
||||||
import nu.marginalia.ranking.data.RankingDomainFetcher;
|
|
||||||
|
|
||||||
public class ReversePageRank extends RankingAlgorithm {
|
|
||||||
|
|
||||||
|
|
||||||
public ReversePageRank(RankingDomainFetcher domains, String... origins) {
|
|
||||||
super(domains, origins);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
RankVector createNewRankVector(RankVector rank) {
|
|
||||||
|
|
||||||
double rankNorm = rank.norm();
|
|
||||||
RankVector newRank = new RankVector(0);
|
|
||||||
|
|
||||||
for (int domainId = 0; domainId < domainIndexToId.size(); domainId++) {
|
|
||||||
|
|
||||||
var links = linkDataSrc2Dest[domainId];
|
|
||||||
double newRankValue = 0;
|
|
||||||
|
|
||||||
if (links != null && links.size() > 0) {
|
|
||||||
for (int j = 0; j < links.size(); j++) {
|
|
||||||
var revLinks = linkDataDest2Src[links.getQuick(j)];
|
|
||||||
newRankValue += rank.get(links.getQuick(j)) / revLinks.size();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
newRank.set(domainId, 0.85*newRankValue/rankNorm);
|
|
||||||
}
|
|
||||||
|
|
||||||
return newRank;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
void adjustRankVector(RankVector vector, double dNorm, double oldNorm) {
|
|
||||||
originDomainIds.forEach(id -> vector.increment(id, 1.0 / originDomainIds.size()));
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
@ -1,50 +0,0 @@
|
|||||||
package nu.marginalia.ranking;
|
|
||||||
|
|
||||||
|
|
||||||
import nu.marginalia.ranking.data.RankingDomainFetcher;
|
|
||||||
|
|
||||||
public class StandardPageRank extends RankingAlgorithm {
|
|
||||||
|
|
||||||
public StandardPageRank(RankingDomainFetcher domains, String... origins) {
|
|
||||||
super(domains, origins);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
RankVector createNewRankVector(RankVector rank) {
|
|
||||||
RankVector newRank = new RankVector(0);
|
|
||||||
|
|
||||||
for (int domainId = 0; domainId < domainIndexToId.size(); domainId++) {
|
|
||||||
|
|
||||||
var links = linkDataDest2Src[domainId];
|
|
||||||
double newRankValue = 0;
|
|
||||||
|
|
||||||
if (links != null && links.size() > 0) {
|
|
||||||
for (int j = 0; j < links.size(); j++) {
|
|
||||||
int linkedDomain = links.getQuick(j);
|
|
||||||
|
|
||||||
final int linkSize;
|
|
||||||
var backLinks = linkDataSrc2Dest[linkedDomain];
|
|
||||||
|
|
||||||
if (backLinks == null) {
|
|
||||||
linkSize = 1;
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
linkSize = backLinks.size();
|
|
||||||
}
|
|
||||||
|
|
||||||
newRankValue += rank.get(linkedDomain) / linkSize;
|
|
||||||
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
newRank.set(domainId, 0.85 * newRankValue);
|
|
||||||
}
|
|
||||||
return newRank;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
void adjustRankVector(RankVector vector, double dNorm, double oldNorm) {
|
|
||||||
originDomainIds.forEach(id -> vector.increment(id, 0.15 / originDomainIds.size() ));
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
@ -0,0 +1,63 @@
|
|||||||
|
package nu.marginalia.ranking.data;
|
||||||
|
|
||||||
|
import com.zaxxer.hikari.HikariDataSource;
|
||||||
|
import org.jgrapht.Graph;
|
||||||
|
|
||||||
|
import java.sql.SQLException;
|
||||||
|
import java.util.*;
|
||||||
|
|
||||||
|
public abstract class AbstractGraphSource implements GraphSource {
|
||||||
|
protected final HikariDataSource dataSource;
|
||||||
|
|
||||||
|
protected AbstractGraphSource(HikariDataSource dataSource) {
|
||||||
|
this.dataSource = dataSource;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public abstract Graph<Integer, ?> getGraph();
|
||||||
|
|
||||||
|
/** Adds all indexed domain ids as vertices to the graph. */
|
||||||
|
protected void addVertices(Graph<Integer, ?> graph) throws SQLException {
|
||||||
|
try (var conn = dataSource.getConnection();
|
||||||
|
var stmt = conn.prepareStatement("""
|
||||||
|
SELECT ID
|
||||||
|
FROM EC_DOMAIN
|
||||||
|
WHERE NODE_AFFINITY > 0
|
||||||
|
""");
|
||||||
|
var rs = stmt.executeQuery())
|
||||||
|
{
|
||||||
|
while (rs.next()) {
|
||||||
|
graph.addVertex(rs.getInt(1));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public List<Integer> domainIds(List<String> domainNameList) {
|
||||||
|
try (var conn = dataSource.getConnection();
|
||||||
|
var stmt = conn.prepareStatement("""
|
||||||
|
SELECT ID
|
||||||
|
FROM EC_DOMAIN
|
||||||
|
WHERE DOMAIN_NAME LIKE ?
|
||||||
|
"""))
|
||||||
|
{
|
||||||
|
Set<Integer> retSet = new HashSet<>();
|
||||||
|
|
||||||
|
for (String domainName : domainNameList) {
|
||||||
|
stmt.setString(1, domainName);
|
||||||
|
try (var rs = stmt.executeQuery()) {
|
||||||
|
while (rs.next()) {
|
||||||
|
retSet.add(rs.getInt(1));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
var ret = new ArrayList<>(retSet);
|
||||||
|
ret.sort(Comparator.naturalOrder());
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
catch (SQLException ex) {
|
||||||
|
throw new RuntimeException(ex);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,23 @@
|
|||||||
|
package nu.marginalia.ranking.data;
|
||||||
|
|
||||||
|
import org.jgrapht.Graph;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
/** A source for the link graph (or pseudo-link graph)
|
||||||
|
* to use when ranking domain. */
|
||||||
|
public interface GraphSource {
|
||||||
|
|
||||||
|
/** Construct the graph */
|
||||||
|
Graph<Integer, ?> getGraph();
|
||||||
|
|
||||||
|
/** Return a list of domain ids for the given domain names.
|
||||||
|
* The function will also accept SQL-style wildcards,
|
||||||
|
* e.g. "%marginalia.nu" will match "marginalia.nu" and "memex.marginalia.nu".
|
||||||
|
* <p></p>
|
||||||
|
* If multiple wildcards are provided, and overlapping domains are matched,
|
||||||
|
* they will be included only once. The returned list will be sorted in
|
||||||
|
* numerical order of the domain IDs.
|
||||||
|
*/
|
||||||
|
List<Integer> domainIds(List<String> domainNameList);
|
||||||
|
}
|
@ -0,0 +1,49 @@
|
|||||||
|
package nu.marginalia.ranking.data;
|
||||||
|
|
||||||
|
import com.google.inject.Inject;
|
||||||
|
import com.zaxxer.hikari.HikariDataSource;
|
||||||
|
import lombok.SneakyThrows;
|
||||||
|
import nu.marginalia.query.client.QueryClient;
|
||||||
|
import org.jgrapht.Graph;
|
||||||
|
import org.jgrapht.graph.DefaultDirectedGraph;
|
||||||
|
import org.jgrapht.graph.DefaultEdge;
|
||||||
|
|
||||||
|
import java.sql.SQLException;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
/** A source for the inverted link graph,
|
||||||
|
* which is the same as the regular graph except
|
||||||
|
* the direction of the links have been inverted */
|
||||||
|
public class InvertedLinkGraphSource extends AbstractGraphSource {
|
||||||
|
private final QueryClient queryClient;
|
||||||
|
|
||||||
|
@Inject
|
||||||
|
public InvertedLinkGraphSource(HikariDataSource dataSource, QueryClient queryClient) {
|
||||||
|
super(dataSource);
|
||||||
|
this.queryClient = queryClient;
|
||||||
|
}
|
||||||
|
@SneakyThrows
|
||||||
|
@Override
|
||||||
|
public Graph<Integer, ?> getGraph() {
|
||||||
|
Graph<Integer, ?> graph = new DefaultDirectedGraph<>(DefaultEdge.class);
|
||||||
|
|
||||||
|
addVertices(graph);
|
||||||
|
|
||||||
|
var allLinks = queryClient.getAllDomainLinks();
|
||||||
|
var iter = allLinks.iterator();
|
||||||
|
while (iter.advance()) {
|
||||||
|
if (!graph.containsVertex(iter.dest())) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if (!graph.containsVertex(iter.source())) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Invert the edge
|
||||||
|
graph.addEdge(iter.dest(), iter.source());
|
||||||
|
}
|
||||||
|
|
||||||
|
return graph;
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,43 @@
|
|||||||
|
package nu.marginalia.ranking.data;
|
||||||
|
|
||||||
|
import com.google.inject.Inject;
|
||||||
|
import com.zaxxer.hikari.HikariDataSource;
|
||||||
|
import lombok.SneakyThrows;
|
||||||
|
import nu.marginalia.query.client.QueryClient;
|
||||||
|
import org.jgrapht.Graph;
|
||||||
|
import org.jgrapht.graph.DefaultDirectedGraph;
|
||||||
|
import org.jgrapht.graph.DefaultEdge;
|
||||||
|
|
||||||
|
/** A source for the regular link graph. */
|
||||||
|
public class LinkGraphSource extends AbstractGraphSource {
|
||||||
|
private final QueryClient queryClient;
|
||||||
|
|
||||||
|
@Inject
|
||||||
|
public LinkGraphSource(HikariDataSource dataSource, QueryClient queryClient) {
|
||||||
|
super(dataSource);
|
||||||
|
this.queryClient = queryClient;
|
||||||
|
}
|
||||||
|
|
||||||
|
@SneakyThrows
|
||||||
|
@Override
|
||||||
|
public Graph<Integer, ?> getGraph() {
|
||||||
|
Graph<Integer, ?> graph = new DefaultDirectedGraph<>(DefaultEdge.class);
|
||||||
|
|
||||||
|
addVertices(graph);
|
||||||
|
|
||||||
|
var allLinks = queryClient.getAllDomainLinks();
|
||||||
|
var iter = allLinks.iterator();
|
||||||
|
while (iter.advance()) {
|
||||||
|
if (!graph.containsVertex(iter.dest())) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if (!graph.containsVertex(iter.source())) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
graph.addEdge(iter.source(), iter.dest());
|
||||||
|
}
|
||||||
|
|
||||||
|
return graph;
|
||||||
|
}
|
||||||
|
}
|
@ -1,32 +0,0 @@
|
|||||||
package nu.marginalia.ranking.data;
|
|
||||||
|
|
||||||
import lombok.AllArgsConstructor;
|
|
||||||
import lombok.Data;
|
|
||||||
import nu.marginalia.model.crawl.DomainIndexingState;
|
|
||||||
|
|
||||||
@Data
|
|
||||||
@AllArgsConstructor
|
|
||||||
public class RankingDomainData {
|
|
||||||
public final int id;
|
|
||||||
public final String name;
|
|
||||||
private int alias;
|
|
||||||
public DomainIndexingState state;
|
|
||||||
public final int knownUrls;
|
|
||||||
|
|
||||||
public int resolveAlias() {
|
|
||||||
if (alias == 0) return id;
|
|
||||||
return alias;
|
|
||||||
}
|
|
||||||
|
|
||||||
public boolean isAlias() {
|
|
||||||
return alias != 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
public boolean isSpecial() {
|
|
||||||
return DomainIndexingState.SPECIAL == state;
|
|
||||||
}
|
|
||||||
|
|
||||||
public boolean isSocialMedia() {
|
|
||||||
return DomainIndexingState.SOCIAL_MEDIA == state;
|
|
||||||
}
|
|
||||||
}
|
|
@ -1,138 +0,0 @@
|
|||||||
package nu.marginalia.ranking.data;
|
|
||||||
|
|
||||||
import com.google.inject.Inject;
|
|
||||||
import com.google.inject.Singleton;
|
|
||||||
import com.zaxxer.hikari.HikariDataSource;
|
|
||||||
import nu.marginalia.db.DomainBlacklistImpl;
|
|
||||||
import nu.marginalia.model.crawl.DomainIndexingState;
|
|
||||||
import nu.marginalia.query.client.QueryClient;
|
|
||||||
import org.slf4j.Logger;
|
|
||||||
import org.slf4j.LoggerFactory;
|
|
||||||
|
|
||||||
import java.sql.SQLException;
|
|
||||||
import java.util.function.Consumer;
|
|
||||||
import java.util.function.IntConsumer;
|
|
||||||
|
|
||||||
@Singleton
|
|
||||||
public class RankingDomainFetcher {
|
|
||||||
protected final HikariDataSource dataSource;
|
|
||||||
private final QueryClient queryClient;
|
|
||||||
protected final DomainBlacklistImpl blacklist;
|
|
||||||
protected final Logger logger = LoggerFactory.getLogger(getClass());
|
|
||||||
|
|
||||||
protected boolean getNames = false;
|
|
||||||
|
|
||||||
@Inject
|
|
||||||
public RankingDomainFetcher(HikariDataSource dataSource,
|
|
||||||
QueryClient queryClient,
|
|
||||||
DomainBlacklistImpl blacklist) {
|
|
||||||
this.dataSource = dataSource;
|
|
||||||
this.queryClient = queryClient;
|
|
||||||
this.blacklist = blacklist;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void retainNames() {
|
|
||||||
this.getNames = true;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void getDomains(Consumer<RankingDomainData> consumer) {
|
|
||||||
String query;
|
|
||||||
if (getNames) {
|
|
||||||
query = """
|
|
||||||
SELECT EC_DOMAIN.ID,DOMAIN_NAME,DOMAIN_ALIAS,STATE,KNOWN_URLS
|
|
||||||
FROM EC_DOMAIN
|
|
||||||
INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
|
|
||||||
WHERE NODE_AFFINITY>0
|
|
||||||
GROUP BY EC_DOMAIN.ID
|
|
||||||
""";
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
query = """
|
|
||||||
SELECT EC_DOMAIN.ID,\"\",DOMAIN_ALIAS,STATE,KNOWN_URLS
|
|
||||||
FROM EC_DOMAIN
|
|
||||||
INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
|
|
||||||
WHERE NODE_AFFINITY>0
|
|
||||||
GROUP BY EC_DOMAIN.ID
|
|
||||||
""";
|
|
||||||
}
|
|
||||||
|
|
||||||
getDomains(query, consumer);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public void getPeripheralDomains(Consumer<RankingDomainData> consumer) {
|
|
||||||
String query;
|
|
||||||
if (getNames) {
|
|
||||||
query = """
|
|
||||||
SELECT EC_DOMAIN.ID,DOMAIN_NAME,DOMAIN_ALIAS,STATE,KNOWN_URLS
|
|
||||||
FROM EC_DOMAIN
|
|
||||||
INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
|
|
||||||
WHERE ((INDEXED>1 AND IS_ALIVE)
|
|
||||||
OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0))
|
|
||||||
GROUP BY EC_DOMAIN.ID
|
|
||||||
""";
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
query = """
|
|
||||||
SELECT EC_DOMAIN.ID,\"\",DOMAIN_ALIAS,STATE,KNOWN_URLS
|
|
||||||
FROM EC_DOMAIN
|
|
||||||
INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
|
|
||||||
WHERE ((INDEXED>1 AND IS_ALIVE)
|
|
||||||
OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0))
|
|
||||||
GROUP BY EC_DOMAIN.ID
|
|
||||||
""";
|
|
||||||
}
|
|
||||||
|
|
||||||
getDomains(query, consumer);
|
|
||||||
}
|
|
||||||
|
|
||||||
protected void getDomains(String query, Consumer<RankingDomainData> consumer) {
|
|
||||||
try (var conn = dataSource.getConnection(); var stmt = conn.prepareStatement(query)) {
|
|
||||||
stmt.setFetchSize(10000);
|
|
||||||
var rsp = stmt.executeQuery();
|
|
||||||
while (rsp.next()) {
|
|
||||||
int id = rsp.getInt(1);
|
|
||||||
if (!blacklist.isBlacklisted(id)) {
|
|
||||||
consumer.accept(
|
|
||||||
new RankingDomainData(id,
|
|
||||||
rsp.getString(2),
|
|
||||||
rsp.getInt(3),
|
|
||||||
DomainIndexingState.valueOf(rsp.getString(4)),
|
|
||||||
rsp.getInt(5)));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
catch (SQLException ex) {
|
|
||||||
logger.error("Failed to fetch domains", ex);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
public void eachDomainLink(DomainLinkConsumer consumer) {
|
|
||||||
|
|
||||||
var allLinks = queryClient.getAllDomainLinks();
|
|
||||||
var iter = allLinks.iterator();
|
|
||||||
|
|
||||||
while (iter.advance()) {
|
|
||||||
consumer.accept(iter.source(), iter.dest());
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
public void domainsByPattern(String pattern, IntConsumer idConsumer) {
|
|
||||||
try (var conn = dataSource.getConnection();
|
|
||||||
var stmt = conn.createStatement()) {
|
|
||||||
// This is sourced from a config file --v
|
|
||||||
var rsp = stmt.executeQuery("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME LIKE '" + pattern + "'");
|
|
||||||
while (rsp.next()) {
|
|
||||||
idConsumer.accept(rsp.getInt(1));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
catch (SQLException ex) {
|
|
||||||
logger.error("Failed to fetch domains by pattern", ex);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
public interface DomainLinkConsumer {
|
|
||||||
void accept(int from, int to);
|
|
||||||
}
|
|
||||||
}
|
|
@ -1,93 +0,0 @@
|
|||||||
package nu.marginalia.ranking.data;
|
|
||||||
|
|
||||||
import com.google.inject.Inject;
|
|
||||||
import com.google.inject.Singleton;
|
|
||||||
import com.zaxxer.hikari.HikariDataSource;
|
|
||||||
import nu.marginalia.db.DomainBlacklistImpl;
|
|
||||||
import nu.marginalia.query.client.QueryClient;
|
|
||||||
import org.slf4j.LoggerFactory;
|
|
||||||
|
|
||||||
import java.sql.SQLException;
|
|
||||||
import java.util.function.Consumer;
|
|
||||||
|
|
||||||
@Singleton
|
|
||||||
public class RankingDomainFetcherForSimilarityData extends RankingDomainFetcher {
|
|
||||||
final boolean hasData;
|
|
||||||
|
|
||||||
@Inject
|
|
||||||
public RankingDomainFetcherForSimilarityData(HikariDataSource dataSource, QueryClient queryClient, DomainBlacklistImpl blacklist) {
|
|
||||||
super(dataSource, queryClient, blacklist);
|
|
||||||
|
|
||||||
hasData = isDomainNeighborTablePopulated(dataSource);
|
|
||||||
}
|
|
||||||
|
|
||||||
private static boolean isDomainNeighborTablePopulated(HikariDataSource dataSource) {
|
|
||||||
try (var conn = dataSource.getConnection();
|
|
||||||
var stmt = conn.createStatement();
|
|
||||||
var rs = stmt.executeQuery("SELECT DOMAIN_ID FROM EC_DOMAIN_NEIGHBORS_2 LIMIT 1")) {
|
|
||||||
|
|
||||||
return rs.next();
|
|
||||||
}
|
|
||||||
catch (SQLException ex) {
|
|
||||||
LoggerFactory
|
|
||||||
.getLogger(RankingDomainFetcherForSimilarityData.class)
|
|
||||||
.error("Failed to get count from EC_DOMAIN_NEIGHBORS_2", ex);
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
public boolean hasData() {
|
|
||||||
return hasData;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void eachDomainLink(DomainLinkConsumer consumer) {
|
|
||||||
try (var conn = dataSource.getConnection();
|
|
||||||
var stmt = conn.prepareStatement("SELECT DOMAIN_ID, NEIGHBOR_ID, RELATEDNESS FROM EC_DOMAIN_NEIGHBORS_2"))
|
|
||||||
{
|
|
||||||
stmt.setFetchSize(10000);
|
|
||||||
|
|
||||||
var rsp = stmt.executeQuery();
|
|
||||||
|
|
||||||
while (rsp.next()) {
|
|
||||||
int src = rsp.getInt(1);
|
|
||||||
int dst = rsp.getInt(2);
|
|
||||||
|
|
||||||
// these "links" are bidi
|
|
||||||
consumer.accept(src, dst);
|
|
||||||
consumer.accept(dst, src);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
catch (SQLException ex) {
|
|
||||||
logger.error("Failed to fetch domain links", ex);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
public void getDomains(Consumer<RankingDomainData> consumer) {
|
|
||||||
String query;
|
|
||||||
if (getNames) {
|
|
||||||
query =
|
|
||||||
"""
|
|
||||||
SELECT EC_DOMAIN.ID,DOMAIN_NAME,DOMAIN_ALIAS,STATE,COALESCE(KNOWN_URLS, 0)
|
|
||||||
FROM EC_DOMAIN
|
|
||||||
INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
|
|
||||||
GROUP BY EC_DOMAIN.ID
|
|
||||||
""";
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
query =
|
|
||||||
"""
|
|
||||||
SELECT EC_DOMAIN.ID,"",DOMAIN_ALIAS,STATE,COALESCE(KNOWN_URLS, 0)
|
|
||||||
FROM EC_DOMAIN
|
|
||||||
INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
|
|
||||||
GROUP BY EC_DOMAIN.ID
|
|
||||||
""";
|
|
||||||
}
|
|
||||||
|
|
||||||
getDomains(query, consumer);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public void getPeripheralDomains(Consumer<RankingDomainData> consumer) {
|
|
||||||
// This is not relevant for this variant of pagerank since it is bidirectional
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
@ -0,0 +1,65 @@
|
|||||||
|
package nu.marginalia.ranking.data;
|
||||||
|
|
||||||
|
import com.google.inject.Inject;
|
||||||
|
import com.zaxxer.hikari.HikariDataSource;
|
||||||
|
import lombok.SneakyThrows;
|
||||||
|
import org.jgrapht.Graph;
|
||||||
|
import org.jgrapht.graph.DefaultUndirectedWeightedGraph;
|
||||||
|
import org.jgrapht.graph.DefaultWeightedEdge;
|
||||||
|
|
||||||
|
import java.sql.SQLException;
|
||||||
|
|
||||||
|
/** A source for the similarity graph, stored in EC_DOMAIN_NEIGHBORS_2,
|
||||||
|
* which contains the cosine similarity of the incident link vectors in the link graph.
|
||||||
|
* */
|
||||||
|
public class SimilarityGraphSource extends AbstractGraphSource {
|
||||||
|
@Inject
|
||||||
|
public SimilarityGraphSource(HikariDataSource dataSource) {
|
||||||
|
super(dataSource);
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Check if the data source is available. */
|
||||||
|
public boolean isAvailable() {
|
||||||
|
try (var conn = dataSource.getConnection();
|
||||||
|
var stmt = conn.prepareStatement("""
|
||||||
|
SELECT *
|
||||||
|
FROM EC_DOMAIN_NEIGHBORS_2
|
||||||
|
LIMIT 1
|
||||||
|
""");
|
||||||
|
var rs = stmt.executeQuery())
|
||||||
|
{
|
||||||
|
return rs.next();
|
||||||
|
}
|
||||||
|
catch (SQLException ex) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@SneakyThrows
|
||||||
|
@Override
|
||||||
|
public Graph<Integer, ?> getGraph() {
|
||||||
|
Graph<Integer, ?> graph = new DefaultUndirectedWeightedGraph<>(DefaultWeightedEdge.class);
|
||||||
|
|
||||||
|
addVertices(graph);
|
||||||
|
|
||||||
|
try (var conn = dataSource.getConnection()) {
|
||||||
|
try (var stmt = conn.prepareStatement("""
|
||||||
|
SELECT DOMAIN_ID, NEIGHBOR_ID, RELATEDNESS
|
||||||
|
FROM EC_DOMAIN_NEIGHBORS_2
|
||||||
|
"""))
|
||||||
|
{
|
||||||
|
var rs = stmt.executeQuery();
|
||||||
|
while (rs.next()) {
|
||||||
|
int src = rs.getInt(1);
|
||||||
|
int dest = rs.getInt(2);
|
||||||
|
double weight = rs.getDouble(3);
|
||||||
|
|
||||||
|
graph.addEdge(src, dest);
|
||||||
|
graph.setEdgeWeight(src, dest, weight);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return graph;
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,375 @@
|
|||||||
|
package nu.marginalia.ranking.jgrapht;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* (C) Copyright 2016-2023, by Dimitrios Michail and Contributors.
|
||||||
|
*
|
||||||
|
*
|
||||||
|
* JGraphT : a free Java graph-theory library
|
||||||
|
*
|
||||||
|
* See the CONTRIBUTORS.md file distributed with this work for additional
|
||||||
|
* information regarding copyright ownership.
|
||||||
|
*
|
||||||
|
* This program and the accompanying materials are made available under the
|
||||||
|
* terms of the Eclipse Public License 2.0 which is available at
|
||||||
|
* http://www.eclipse.org/legal/epl-2.0, or the
|
||||||
|
* GNU Lesser General Public License v2.1 or later
|
||||||
|
* which is available at
|
||||||
|
* http://www.gnu.org/licenses/old-licenses/lgpl-2.1-standalone.html.
|
||||||
|
*
|
||||||
|
* SPDX-License-Identifier: EPL-2.0 OR LGPL-2.1-or-later
|
||||||
|
*/
|
||||||
|
|
||||||
|
/* (modified by @vlofgren to add personalization) */
|
||||||
|
|
||||||
|
import org.jgrapht.*;
|
||||||
|
import org.jgrapht.alg.interfaces.*;
|
||||||
|
|
||||||
|
import java.util.*;
|
||||||
|
|
||||||
|
public class PersonalizedPageRank<V, E>
|
||||||
|
implements VertexScoringAlgorithm<V, Double>
|
||||||
|
{
|
||||||
|
/**
|
||||||
|
* Default number of maximum iterations.
|
||||||
|
*/
|
||||||
|
public static final int MAX_ITERATIONS_DEFAULT = 100;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Default value for the tolerance. The calculation will stop if the difference of PageRank
|
||||||
|
* values between iterations change less than this value.
|
||||||
|
*/
|
||||||
|
public static final double TOLERANCE_DEFAULT = 0.0001;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Damping factor default value.
|
||||||
|
*/
|
||||||
|
public static final double DAMPING_FACTOR_DEFAULT = 0.85d;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The input graph
|
||||||
|
*/
|
||||||
|
private final Graph<V, E> graph;
|
||||||
|
private final Collection<V> influenceSet;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The damping factor
|
||||||
|
*/
|
||||||
|
private final double dampingFactor;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Maximum iterations to run
|
||||||
|
*/
|
||||||
|
private final int maxIterations;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The calculation will stop if the difference of PageRank values between iterations change less
|
||||||
|
* than this value
|
||||||
|
*/
|
||||||
|
private final double tolerance;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The result
|
||||||
|
*/
|
||||||
|
private Map<V, Double> scores;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create and execute an instance of Personalized PageRank.
|
||||||
|
*
|
||||||
|
* @param graph the input graph
|
||||||
|
* @param influenceSet the set of vertices to personalize the Personalized PageRank calculation
|
||||||
|
*/
|
||||||
|
public PersonalizedPageRank(Graph<V, E> graph, Collection<V> influenceSet)
|
||||||
|
{
|
||||||
|
this(graph, influenceSet, DAMPING_FACTOR_DEFAULT, MAX_ITERATIONS_DEFAULT, TOLERANCE_DEFAULT);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create and execute an instance of Personalized PageRank.
|
||||||
|
*
|
||||||
|
* @param graph the input graph
|
||||||
|
* @param influenceSet the set of vertices to personalize the Personalized PageRank calculation
|
||||||
|
* @param dampingFactor the damping factor
|
||||||
|
*/
|
||||||
|
public PersonalizedPageRank(Graph<V, E> graph, Collection<V> influenceSet, double dampingFactor)
|
||||||
|
{
|
||||||
|
this(graph, influenceSet, dampingFactor, MAX_ITERATIONS_DEFAULT, TOLERANCE_DEFAULT);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create and execute an instance of Personalized PageRank.
|
||||||
|
*
|
||||||
|
* @param graph the input graph
|
||||||
|
* @param influenceSet the set of vertices to personalize the Personalized PageRank calculation
|
||||||
|
* @param dampingFactor the damping factor
|
||||||
|
* @param maxIterations the maximum number of iterations to perform
|
||||||
|
*/
|
||||||
|
public PersonalizedPageRank(Graph<V, E> graph, Collection<V> influenceSet, double dampingFactor, int maxIterations)
|
||||||
|
{
|
||||||
|
this(graph, influenceSet, dampingFactor, maxIterations, TOLERANCE_DEFAULT);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create and execute an instance of Personalized PageRank.
|
||||||
|
*
|
||||||
|
* @param graph the input graph
|
||||||
|
* @param influenceSet the set of vertices to personalize the Personalized PageRank calculation
|
||||||
|
* @param dampingFactor the damping factor
|
||||||
|
* @param maxIterations the maximum number of iterations to perform
|
||||||
|
* @param tolerance the calculation will stop if the difference of Personalized PageRank values between
|
||||||
|
* iterations change less than this value
|
||||||
|
*/
|
||||||
|
public PersonalizedPageRank(Graph<V, E> graph, Collection<V> influenceSet, double dampingFactor, int maxIterations, double tolerance)
|
||||||
|
{
|
||||||
|
this.graph = graph;
|
||||||
|
this.influenceSet = influenceSet;
|
||||||
|
|
||||||
|
if (maxIterations <= 0) {
|
||||||
|
throw new IllegalArgumentException("Maximum iterations must be positive");
|
||||||
|
}
|
||||||
|
this.maxIterations = maxIterations;
|
||||||
|
|
||||||
|
if (dampingFactor < 0.0 || dampingFactor > 1.0) {
|
||||||
|
throw new IllegalArgumentException("Damping factor not valid");
|
||||||
|
}
|
||||||
|
this.dampingFactor = dampingFactor;
|
||||||
|
|
||||||
|
if (tolerance <= 0.0) {
|
||||||
|
throw new IllegalArgumentException("Tolerance not valid, must be positive");
|
||||||
|
}
|
||||||
|
this.tolerance = tolerance;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* {@inheritDoc}
|
||||||
|
*/
|
||||||
|
@Override
|
||||||
|
public Map<V, Double> getScores()
|
||||||
|
{
|
||||||
|
if (scores == null) {
|
||||||
|
scores = Collections.unmodifiableMap(new Algorithm().getScores());
|
||||||
|
}
|
||||||
|
return scores;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* {@inheritDoc}
|
||||||
|
*/
|
||||||
|
@Override
|
||||||
|
public Double getVertexScore(V v)
|
||||||
|
{
|
||||||
|
if (!graph.containsVertex(v)) {
|
||||||
|
throw new IllegalArgumentException("Cannot return score of unknown vertex");
|
||||||
|
}
|
||||||
|
return getScores().get(v);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The actual implementation.
|
||||||
|
*
|
||||||
|
* <p>
|
||||||
|
* We use this pattern with the inner class in order to be able to cache the result but also
|
||||||
|
* allow the garbage collector to acquire all auxiliary memory used during the execution of the
|
||||||
|
* algorithm.
|
||||||
|
*
|
||||||
|
* @author Dimitrios Michail
|
||||||
|
*
|
||||||
|
* @param <V> the graph type
|
||||||
|
* @param <E> the edge type
|
||||||
|
*/
|
||||||
|
private class Algorithm
|
||||||
|
{
|
||||||
|
private int totalVertices;
|
||||||
|
private boolean isWeighted;
|
||||||
|
|
||||||
|
private Map<V, Integer> vertexIndexMap;
|
||||||
|
private V[] vertexMap;
|
||||||
|
|
||||||
|
private double[] weightSum;
|
||||||
|
private double[] curScore;
|
||||||
|
private double[] nextScore;
|
||||||
|
private int[] outDegree;
|
||||||
|
private ArrayList<int[]> adjList;
|
||||||
|
private ArrayList<double[]> weightsList;
|
||||||
|
private BitSet influenceIndexSet;
|
||||||
|
@SuppressWarnings("unchecked")
|
||||||
|
public Algorithm()
|
||||||
|
{
|
||||||
|
this.totalVertices = graph.vertexSet().size();
|
||||||
|
this.isWeighted = graph.getType().isWeighted();
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Initialize score, map vertices to [0,n) and pre-compute degrees and adjacency lists
|
||||||
|
*/
|
||||||
|
this.curScore = new double[totalVertices];
|
||||||
|
this.nextScore = new double[totalVertices];
|
||||||
|
this.vertexIndexMap = new HashMap<>();
|
||||||
|
this.vertexMap = (V[]) new Object[totalVertices];
|
||||||
|
this.outDegree = new int[totalVertices];
|
||||||
|
this.adjList = new ArrayList<>(totalVertices);
|
||||||
|
this.influenceIndexSet = new BitSet(totalVertices);
|
||||||
|
|
||||||
|
double initScore = 1.0d / totalVertices;
|
||||||
|
int i = 0;
|
||||||
|
for (V v : graph.vertexSet()) {
|
||||||
|
vertexIndexMap.put(v, i);
|
||||||
|
vertexMap[i] = v;
|
||||||
|
outDegree[i] = graph.outDegreeOf(v);
|
||||||
|
curScore[i] = initScore;
|
||||||
|
|
||||||
|
if (influenceSet.contains(v)) {
|
||||||
|
influenceIndexSet.set(i);
|
||||||
|
}
|
||||||
|
|
||||||
|
i++;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (isWeighted) {
|
||||||
|
this.weightSum = new double[totalVertices];
|
||||||
|
this.weightsList = new ArrayList<>(totalVertices);
|
||||||
|
|
||||||
|
for (i = 0; i < totalVertices; i++) {
|
||||||
|
V v = vertexMap[i];
|
||||||
|
int[] inNeighbors = new int[graph.inDegreeOf(v)];
|
||||||
|
double[] edgeWeights = new double[graph.inDegreeOf(v)];
|
||||||
|
|
||||||
|
int j = 0;
|
||||||
|
for (E e : graph.incomingEdgesOf(v)) {
|
||||||
|
V w = Graphs.getOppositeVertex(graph, e, v);
|
||||||
|
Integer mappedVertexId = vertexIndexMap.get(w);
|
||||||
|
inNeighbors[j] = mappedVertexId;
|
||||||
|
double edgeWeight = graph.getEdgeWeight(e);
|
||||||
|
edgeWeights[j] += edgeWeight;
|
||||||
|
weightSum[mappedVertexId] += edgeWeight;
|
||||||
|
j++;
|
||||||
|
}
|
||||||
|
weightsList.add(edgeWeights);
|
||||||
|
adjList.add(inNeighbors);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
for (i = 0; i < totalVertices; i++) {
|
||||||
|
V v = vertexMap[i];
|
||||||
|
int[] inNeighbors = new int[graph.inDegreeOf(v)];
|
||||||
|
int j = 0;
|
||||||
|
for (E e : graph.incomingEdgesOf(v)) {
|
||||||
|
V w = Graphs.getOppositeVertex(graph, e, v);
|
||||||
|
inNeighbors[j++] = vertexIndexMap.get(w);
|
||||||
|
}
|
||||||
|
adjList.add(inNeighbors);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public Map<V, Double> getScores()
|
||||||
|
{
|
||||||
|
// compute
|
||||||
|
if (isWeighted) {
|
||||||
|
runWeighted();
|
||||||
|
} else {
|
||||||
|
run();
|
||||||
|
}
|
||||||
|
|
||||||
|
// make results user friendly
|
||||||
|
Map<V, Double> scores = new HashMap<>();
|
||||||
|
for (int i = 0; i < totalVertices; i++) {
|
||||||
|
V v = vertexMap[i];
|
||||||
|
scores.put(v, curScore[i]);
|
||||||
|
}
|
||||||
|
return scores;
|
||||||
|
}
|
||||||
|
|
||||||
|
private void run()
|
||||||
|
{
|
||||||
|
double maxChange = tolerance;
|
||||||
|
int iterations = maxIterations;
|
||||||
|
|
||||||
|
while (iterations > 0 && maxChange >= tolerance) {
|
||||||
|
double r = teleProp();
|
||||||
|
|
||||||
|
maxChange = 0d;
|
||||||
|
for (int i = 0; i < totalVertices; i++) {
|
||||||
|
double contribution = 0d;
|
||||||
|
for (int w : adjList.get(i)) {
|
||||||
|
contribution += dampingFactor * curScore[w] / outDegree[w];
|
||||||
|
}
|
||||||
|
|
||||||
|
double vOldValue = curScore[i];
|
||||||
|
double vNewValue = (influenceIndexSet.get(i) ? r : 0) + contribution;
|
||||||
|
maxChange = Math.max(maxChange, Math.abs(vNewValue - vOldValue));
|
||||||
|
nextScore[i] = vNewValue;
|
||||||
|
}
|
||||||
|
|
||||||
|
// progress
|
||||||
|
swapScores();
|
||||||
|
iterations--;
|
||||||
|
}
|
||||||
|
|
||||||
|
// remove influence factor from the scores
|
||||||
|
double r = teleProp();
|
||||||
|
for (int i = 0; i < totalVertices; i++) {
|
||||||
|
curScore[i] -= (influenceIndexSet.get(i) ? r : 0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void runWeighted()
|
||||||
|
{
|
||||||
|
double maxChange = tolerance;
|
||||||
|
int iterations = maxIterations;
|
||||||
|
|
||||||
|
while (iterations > 0 && maxChange >= tolerance) {
|
||||||
|
double r = teleProp();
|
||||||
|
|
||||||
|
maxChange = 0d;
|
||||||
|
for (int i = 0; i < totalVertices; i++) {
|
||||||
|
double contribution = 0d;
|
||||||
|
|
||||||
|
int[] neighbors = adjList.get(i);
|
||||||
|
double[] weights = weightsList.get(i);
|
||||||
|
for (int j = 0, getLength = neighbors.length; j < getLength; j++) {
|
||||||
|
int w = neighbors[j];
|
||||||
|
contribution += dampingFactor * curScore[w] * weights[j] / weightSum[w];
|
||||||
|
}
|
||||||
|
|
||||||
|
double vOldValue = curScore[i];
|
||||||
|
double vNewValue = (influenceIndexSet.get(i) ? r : 0) + contribution;
|
||||||
|
maxChange = Math.max(maxChange, Math.abs(vNewValue - vOldValue));
|
||||||
|
nextScore[i] = vNewValue;
|
||||||
|
}
|
||||||
|
|
||||||
|
// progress
|
||||||
|
swapScores();
|
||||||
|
iterations--;
|
||||||
|
}
|
||||||
|
|
||||||
|
// remove influence factor from the scores
|
||||||
|
double r = teleProp();
|
||||||
|
for (int i = 0; i < totalVertices; i++) {
|
||||||
|
curScore[i] -= (influenceIndexSet.get(i) ? r : 0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// This is the teleportation part of the algorithm, and also what is modified to personalize the PageRank
|
||||||
|
private double teleProp()
|
||||||
|
{
|
||||||
|
double r = 0d;
|
||||||
|
for (int v = influenceIndexSet.nextSetBit(0);
|
||||||
|
v >= 0;
|
||||||
|
v = influenceIndexSet.nextSetBit(v + 1))
|
||||||
|
{
|
||||||
|
if (outDegree[v] > 0)
|
||||||
|
r += (1d - dampingFactor);
|
||||||
|
else
|
||||||
|
r += curScore[v];
|
||||||
|
}
|
||||||
|
return r / influenceSet.size();
|
||||||
|
}
|
||||||
|
|
||||||
|
private void swapScores()
|
||||||
|
{
|
||||||
|
double[] tmp = curScore;
|
||||||
|
curScore = nextScore;
|
||||||
|
nextScore = tmp;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
@ -0,0 +1,78 @@
|
|||||||
|
package nu.marginalia.ranking;
|
||||||
|
|
||||||
|
import nu.marginalia.ranking.accumulator.RankingResultListAccumulator;
|
||||||
|
import org.junit.jupiter.api.Disabled;
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
// Test the ranking algorithm with prod data. Will not run if the data is not available.
|
||||||
|
// It's not feasible to include the data in the git repo, as it's ~6 GB of data.
|
||||||
|
@Disabled
|
||||||
|
class RankingAlgorithmWithRealDataTest {
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testRegularPR() {
|
||||||
|
if (!TestGraphSourceForLinkData.isAvailable()) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
var graphSource = new TestGraphSourceForLinkData();
|
||||||
|
var results = new PageRankDomainRanker(graphSource, List.of())
|
||||||
|
.calculate(10, RankingResultListAccumulator::new);
|
||||||
|
|
||||||
|
for (int i = 0; i < results.size(); i++) {
|
||||||
|
System.out.println(i + " " + graphSource.getName(results.get(i)));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testInvertedLinkGraph() {
|
||||||
|
if (!TestGraphSourceForInvertedLinkData.isAvailable()) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
var graphSource = new TestGraphSourceForInvertedLinkData();
|
||||||
|
var results = new PageRankDomainRanker(graphSource, List.of())
|
||||||
|
.calculate(10, RankingResultListAccumulator::new);
|
||||||
|
|
||||||
|
for (int i = 0; i < results.size(); i++) {
|
||||||
|
System.out.println(i + " " + graphSource.getName(results.get(i)));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testSimilarityPR() {
|
||||||
|
if (!TestGraphSourceForSimilarityData.isAvailable()) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
var graphSource = new TestGraphSourceForSimilarityData();
|
||||||
|
var results = new PageRankDomainRanker(graphSource, List.of())
|
||||||
|
.calculate(10, RankingResultListAccumulator::new);
|
||||||
|
|
||||||
|
for (int i = 0; i < results.size(); i++) {
|
||||||
|
System.out.println(i + " " + graphSource.getName(results.get(i)));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testSimilarityPPR() {
|
||||||
|
if (!TestGraphSourceForSimilarityData.isAvailable()) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
var graphSource = new TestGraphSourceForSimilarityData();
|
||||||
|
var results = new PageRankDomainRanker(graphSource,
|
||||||
|
List.of(1476552) // wiby.me
|
||||||
|
)
|
||||||
|
.calculate(10, RankingResultListAccumulator::new);
|
||||||
|
|
||||||
|
for (int i = 0; i < results.size(); i++) {
|
||||||
|
System.out.println(i + " " + graphSource.getName(results.get(i)));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
}
|
@ -0,0 +1,161 @@
|
|||||||
|
package nu.marginalia.ranking;
|
||||||
|
|
||||||
|
|
||||||
|
import com.zaxxer.hikari.HikariConfig;
|
||||||
|
import com.zaxxer.hikari.HikariDataSource;
|
||||||
|
import nu.marginalia.query.client.QueryClient;
|
||||||
|
import nu.marginalia.ranking.data.InvertedLinkGraphSource;
|
||||||
|
import nu.marginalia.ranking.data.LinkGraphSource;
|
||||||
|
import nu.marginalia.ranking.data.SimilarityGraphSource;
|
||||||
|
import nu.marginalia.test.TestMigrationLoader;
|
||||||
|
import org.jgrapht.Graph;
|
||||||
|
import org.jgrapht.graph.DefaultWeightedEdge;
|
||||||
|
import org.junit.jupiter.api.*;
|
||||||
|
import org.junit.jupiter.api.parallel.Execution;
|
||||||
|
import org.mockito.Mockito;
|
||||||
|
import org.testcontainers.containers.MariaDBContainer;
|
||||||
|
import org.testcontainers.junit.jupiter.Container;
|
||||||
|
import org.testcontainers.junit.jupiter.Testcontainers;
|
||||||
|
|
||||||
|
import java.sql.SQLException;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import static org.junit.Assert.assertEquals;
|
||||||
|
import static org.junit.jupiter.api.parallel.ExecutionMode.SAME_THREAD;
|
||||||
|
import static org.mockito.Mockito.when;
|
||||||
|
|
||||||
|
@Tag("slow")
|
||||||
|
@Testcontainers
|
||||||
|
@Execution(SAME_THREAD)
|
||||||
|
public class RankingAlgorithmsContainerTest {
|
||||||
|
@Container
|
||||||
|
static MariaDBContainer<?> mariaDBContainer = new MariaDBContainer<>("mariadb")
|
||||||
|
.withDatabaseName("WMSA_prod")
|
||||||
|
.withUsername("wmsa")
|
||||||
|
.withPassword("wmsa")
|
||||||
|
.withNetworkAliases("mariadb");
|
||||||
|
|
||||||
|
static HikariDataSource dataSource;
|
||||||
|
|
||||||
|
QueryClient queryClient;
|
||||||
|
QueryClient.AllLinks allLinks;
|
||||||
|
@BeforeAll
|
||||||
|
public static void setup() {
|
||||||
|
HikariConfig config = new HikariConfig();
|
||||||
|
config.setJdbcUrl(mariaDBContainer.getJdbcUrl());
|
||||||
|
config.setUsername("wmsa");
|
||||||
|
config.setPassword("wmsa");
|
||||||
|
|
||||||
|
dataSource = new HikariDataSource(config);
|
||||||
|
TestMigrationLoader.flywayMigration(dataSource);
|
||||||
|
|
||||||
|
try (var conn = dataSource.getConnection();
|
||||||
|
var stmt = conn.createStatement()) {
|
||||||
|
stmt.executeUpdate("""
|
||||||
|
INSERT INTO EC_DOMAIN(DOMAIN_NAME, DOMAIN_TOP, NODE_AFFINITY)
|
||||||
|
VALUES ('memex.marginalia.nu', 'marginalia.nu', 1),
|
||||||
|
('search.marginalia.nu', 'marginalia.nu', 1),
|
||||||
|
('encyclopedia.marginalia.nu', 'marginalia.nu', 1),
|
||||||
|
('marginalia.nu', 'marginalia.nu', 1);
|
||||||
|
""");
|
||||||
|
} catch (SQLException e) {
|
||||||
|
throw new RuntimeException(e);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
@BeforeEach
|
||||||
|
public void setupQueryClient() {
|
||||||
|
queryClient = Mockito.mock(QueryClient.class);
|
||||||
|
allLinks = new QueryClient.AllLinks();
|
||||||
|
when(queryClient.getAllDomainLinks()).thenReturn(allLinks);
|
||||||
|
|
||||||
|
try (var conn = dataSource.getConnection();
|
||||||
|
var stmt = conn.createStatement()) {
|
||||||
|
stmt.executeUpdate("TRUNCATE TABLE EC_DOMAIN_NEIGHBORS_2");
|
||||||
|
} catch (SQLException e) {
|
||||||
|
throw new RuntimeException(e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void addSimilarity(int source, int dest, double similarity) {
|
||||||
|
try (var conn = dataSource.getConnection();
|
||||||
|
var stmt = conn.prepareStatement("""
|
||||||
|
INSERT INTO EC_DOMAIN_NEIGHBORS_2(DOMAIN_ID, NEIGHBOR_ID, RELATEDNESS)
|
||||||
|
VALUES (?, ?, ?)
|
||||||
|
""")) {
|
||||||
|
stmt.setInt(1, source);
|
||||||
|
stmt.setInt(2, dest);
|
||||||
|
stmt.setDouble(3, similarity);
|
||||||
|
stmt.executeUpdate();
|
||||||
|
|
||||||
|
} catch (SQLException e) {
|
||||||
|
throw new RuntimeException(e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testGetDomains() {
|
||||||
|
// should all be the same, doesn't matter which one we use
|
||||||
|
var source = new LinkGraphSource(dataSource, queryClient);
|
||||||
|
|
||||||
|
Assertions.assertEquals(List.of(1),
|
||||||
|
source.domainIds(List.of("memex.marginalia.nu")));
|
||||||
|
|
||||||
|
// Verify globbing
|
||||||
|
Assertions.assertEquals(List.of(1,2,3),
|
||||||
|
source.domainIds(List.of("%.marginalia.nu")));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testLinkGraphSource() {
|
||||||
|
allLinks.add(1, 3);
|
||||||
|
|
||||||
|
var graph = new LinkGraphSource(dataSource, queryClient).getGraph();
|
||||||
|
|
||||||
|
Assertions.assertTrue(graph.containsVertex(1));
|
||||||
|
Assertions.assertTrue(graph.containsVertex(2));
|
||||||
|
Assertions.assertTrue(graph.containsVertex(3));
|
||||||
|
|
||||||
|
Assertions.assertTrue(graph.containsEdge(1, 3));
|
||||||
|
|
||||||
|
Assertions.assertFalse(graph.containsEdge(3, 1));
|
||||||
|
Assertions.assertFalse(graph.containsEdge(2, 3));
|
||||||
|
Assertions.assertFalse(graph.containsEdge(3, 2));
|
||||||
|
}
|
||||||
|
@Test
|
||||||
|
public void testInvertedLinkGraphSource() {
|
||||||
|
allLinks.add(1, 3);
|
||||||
|
|
||||||
|
var graph = new InvertedLinkGraphSource(dataSource, queryClient).getGraph();
|
||||||
|
|
||||||
|
Assertions.assertTrue(graph.containsVertex(1));
|
||||||
|
Assertions.assertTrue(graph.containsVertex(2));
|
||||||
|
Assertions.assertTrue(graph.containsVertex(3));
|
||||||
|
|
||||||
|
Assertions.assertTrue(graph.containsEdge(3, 1));
|
||||||
|
|
||||||
|
Assertions.assertFalse(graph.containsEdge(1, 3));
|
||||||
|
Assertions.assertFalse(graph.containsEdge(2, 3));
|
||||||
|
Assertions.assertFalse(graph.containsEdge(3, 2));
|
||||||
|
}
|
||||||
|
@Test
|
||||||
|
@SuppressWarnings("unchecked")
|
||||||
|
public void testSimilarityGraphSource() {
|
||||||
|
|
||||||
|
addSimilarity(1, 3, 0.5);
|
||||||
|
|
||||||
|
var graph = (Graph<Integer, DefaultWeightedEdge>) new SimilarityGraphSource(dataSource).getGraph();
|
||||||
|
|
||||||
|
Assertions.assertTrue(graph.containsVertex(1));
|
||||||
|
Assertions.assertTrue(graph.containsVertex(2));
|
||||||
|
Assertions.assertTrue(graph.containsVertex(3));
|
||||||
|
|
||||||
|
Assertions.assertTrue(graph.containsEdge(3, 1));
|
||||||
|
Assertions.assertTrue(graph.containsEdge(1, 3));
|
||||||
|
Assertions.assertEquals(graph.getEdgeWeight(graph.getEdge(1, 3)), 0.5, 0.0001);
|
||||||
|
|
||||||
|
Assertions.assertFalse(graph.containsEdge(1, 2));
|
||||||
|
Assertions.assertFalse(graph.containsEdge(2, 3));
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,86 @@
|
|||||||
|
package nu.marginalia.ranking;
|
||||||
|
|
||||||
|
import lombok.SneakyThrows;
|
||||||
|
import nu.marginalia.array.LongArrayFactory;
|
||||||
|
import nu.marginalia.ranking.data.GraphSource;
|
||||||
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
import org.jgrapht.Graph;
|
||||||
|
import org.jgrapht.graph.DefaultDirectedGraph;
|
||||||
|
import org.jgrapht.graph.DefaultEdge;
|
||||||
|
|
||||||
|
import java.nio.file.Files;
|
||||||
|
import java.nio.file.Path;
|
||||||
|
import java.nio.file.Paths;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
public class TestGraphSourceForInvertedLinkData implements GraphSource {
|
||||||
|
private static Path domainDataPath = Paths.get("/home/vlofgren/Exports/Links/domains.export.tsv");
|
||||||
|
private static Path[] linksDataPaths = new Path[] {
|
||||||
|
Paths.get("/home/vlofgren/Exports/Links/domain-links-1.dat"),
|
||||||
|
Paths.get("/home/vlofgren/Exports/Links/domain-links-2.dat"),
|
||||||
|
Paths.get("/home/vlofgren/Exports/Links/domain-links-3.dat"),
|
||||||
|
Paths.get("/home/vlofgren/Exports/Links/domain-links-4.dat"),
|
||||||
|
Paths.get("/home/vlofgren/Exports/Links/domain-links-5.dat"),
|
||||||
|
Paths.get("/home/vlofgren/Exports/Links/domain-links-6.dat"),
|
||||||
|
Paths.get("/home/vlofgren/Exports/Links/domain-links-7.dat"),
|
||||||
|
Paths.get("/home/vlofgren/Exports/Links/domain-links-8.dat"),
|
||||||
|
Paths.get("/home/vlofgren/Exports/Links/domain-links-9.dat")
|
||||||
|
};
|
||||||
|
|
||||||
|
public List<Integer> domainIds(List<String> domainNameList) { return List.of(); }
|
||||||
|
|
||||||
|
static boolean isAvailable() {
|
||||||
|
return Files.exists(domainDataPath) && Files.exists(linksDataPaths[0]);
|
||||||
|
}
|
||||||
|
|
||||||
|
private Map<Integer, String> idToName = new HashMap<>();
|
||||||
|
|
||||||
|
public String getName(int id) {
|
||||||
|
return idToName.get(id);
|
||||||
|
}
|
||||||
|
|
||||||
|
@SneakyThrows
|
||||||
|
@Override
|
||||||
|
public Graph<Integer, ?> getGraph() {
|
||||||
|
Graph<Integer, ?> graph = new DefaultDirectedGraph<>(DefaultEdge.class);
|
||||||
|
idToName = new HashMap<>();
|
||||||
|
|
||||||
|
try (var stream = Files
|
||||||
|
.lines(domainDataPath)) {
|
||||||
|
|
||||||
|
stream.skip(1)
|
||||||
|
.mapMultiToInt((line, c) -> {
|
||||||
|
String[] parts = StringUtils.split(line, '\t');
|
||||||
|
int id = Integer.parseInt(parts[0]);
|
||||||
|
String name = parts[1];
|
||||||
|
int node_affinity = Integer.parseInt(parts[3]);
|
||||||
|
if (node_affinity > 0) {
|
||||||
|
c.accept(id);
|
||||||
|
idToName.put(id, parts[1]);
|
||||||
|
}
|
||||||
|
})
|
||||||
|
.forEach(graph::addVertex);
|
||||||
|
}
|
||||||
|
|
||||||
|
for (var path : linksDataPaths) {
|
||||||
|
try (var data = LongArrayFactory.mmapForReadingConfined(path)) {
|
||||||
|
data.forEach(0, data.size(), (pos, val) -> {
|
||||||
|
|
||||||
|
val = Long.reverseBytes(val); // data is in "java endian", LongArray is in "C endian"
|
||||||
|
|
||||||
|
int src = (int) (val >>> 32);
|
||||||
|
int dest = (int) (val & 0xFFFF_FFFFL);
|
||||||
|
|
||||||
|
if (graph.containsVertex(src) && graph.containsVertex(dest)) {
|
||||||
|
graph.addEdge(dest, src);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return graph;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
@ -0,0 +1,86 @@
|
|||||||
|
package nu.marginalia.ranking;
|
||||||
|
|
||||||
|
import lombok.SneakyThrows;
|
||||||
|
import nu.marginalia.array.LongArrayFactory;
|
||||||
|
import nu.marginalia.ranking.data.GraphSource;
|
||||||
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
import org.jgrapht.Graph;
|
||||||
|
import org.jgrapht.graph.DefaultDirectedGraph;
|
||||||
|
import org.jgrapht.graph.DefaultEdge;
|
||||||
|
|
||||||
|
import java.nio.file.Files;
|
||||||
|
import java.nio.file.Path;
|
||||||
|
import java.nio.file.Paths;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
public class TestGraphSourceForLinkData implements GraphSource {
|
||||||
|
private static Path domainDataPath = Paths.get("/home/vlofgren/Exports/Links/domains.export.tsv");
|
||||||
|
private static Path[] linksDataPaths = new Path[] {
|
||||||
|
Paths.get("/home/vlofgren/Exports/Links/domain-links-1.dat"),
|
||||||
|
Paths.get("/home/vlofgren/Exports/Links/domain-links-2.dat"),
|
||||||
|
Paths.get("/home/vlofgren/Exports/Links/domain-links-3.dat"),
|
||||||
|
Paths.get("/home/vlofgren/Exports/Links/domain-links-4.dat"),
|
||||||
|
Paths.get("/home/vlofgren/Exports/Links/domain-links-5.dat"),
|
||||||
|
Paths.get("/home/vlofgren/Exports/Links/domain-links-6.dat"),
|
||||||
|
Paths.get("/home/vlofgren/Exports/Links/domain-links-7.dat"),
|
||||||
|
Paths.get("/home/vlofgren/Exports/Links/domain-links-8.dat"),
|
||||||
|
Paths.get("/home/vlofgren/Exports/Links/domain-links-9.dat")
|
||||||
|
};
|
||||||
|
|
||||||
|
public List<Integer> domainIds(List<String> domainNameList) { return List.of(); }
|
||||||
|
|
||||||
|
static boolean isAvailable() {
|
||||||
|
return Files.exists(domainDataPath) && Files.exists(linksDataPaths[0]);
|
||||||
|
}
|
||||||
|
|
||||||
|
private Map<Integer, String> idToName = new HashMap<>();
|
||||||
|
|
||||||
|
public String getName(int id) {
|
||||||
|
return idToName.get(id);
|
||||||
|
}
|
||||||
|
|
||||||
|
@SneakyThrows
|
||||||
|
@Override
|
||||||
|
public Graph<Integer, ?> getGraph() {
|
||||||
|
Graph<Integer, ?> graph = new DefaultDirectedGraph<>(DefaultEdge.class);
|
||||||
|
idToName = new HashMap<>();
|
||||||
|
|
||||||
|
try (var stream = Files
|
||||||
|
.lines(domainDataPath)) {
|
||||||
|
|
||||||
|
stream.skip(1)
|
||||||
|
.mapMultiToInt((line, c) -> {
|
||||||
|
String[] parts = StringUtils.split(line, '\t');
|
||||||
|
int id = Integer.parseInt(parts[0]);
|
||||||
|
String name = parts[1];
|
||||||
|
int node_affinity = Integer.parseInt(parts[3]);
|
||||||
|
if (node_affinity > 0) {
|
||||||
|
c.accept(id);
|
||||||
|
idToName.put(id, parts[1]);
|
||||||
|
}
|
||||||
|
})
|
||||||
|
.forEach(graph::addVertex);
|
||||||
|
}
|
||||||
|
|
||||||
|
for (var path : linksDataPaths) {
|
||||||
|
try (var data = LongArrayFactory.mmapForReadingConfined(path)) {
|
||||||
|
data.forEach(0, data.size(), (pos, val) -> {
|
||||||
|
|
||||||
|
val = Long.reverseBytes(val); // data is in "java endian", LongArray is in "C endian"
|
||||||
|
|
||||||
|
int src = (int) (val >>> 32);
|
||||||
|
int dest = (int) (val & 0xFFFF_FFFFL);
|
||||||
|
|
||||||
|
if (graph.containsVertex(src) && graph.containsVertex(dest)) {
|
||||||
|
graph.addEdge(src, dest);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return graph;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
@ -0,0 +1,78 @@
|
|||||||
|
package nu.marginalia.ranking;
|
||||||
|
|
||||||
|
import lombok.SneakyThrows;
|
||||||
|
import nu.marginalia.array.LongArrayFactory;
|
||||||
|
import nu.marginalia.ranking.data.GraphSource;
|
||||||
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
import org.jgrapht.Graph;
|
||||||
|
import org.jgrapht.graph.DefaultDirectedGraph;
|
||||||
|
import org.jgrapht.graph.DefaultEdge;
|
||||||
|
import org.jgrapht.graph.DefaultUndirectedWeightedGraph;
|
||||||
|
import org.jgrapht.graph.DefaultWeightedEdge;
|
||||||
|
|
||||||
|
import java.nio.file.Files;
|
||||||
|
import java.nio.file.Path;
|
||||||
|
import java.nio.file.Paths;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
public class TestGraphSourceForSimilarityData implements GraphSource {
|
||||||
|
private static Path domainDataPath = Paths.get("/home/vlofgren/Exports/Links/domains.export.tsv");
|
||||||
|
private static Path similarityDataPath = Paths.get("/home/vlofgren/Exports/Links/neighbors.tsv");
|
||||||
|
|
||||||
|
public List<Integer> domainIds(List<String> domainNameList) { return List.of(); }
|
||||||
|
|
||||||
|
static boolean isAvailable() {
|
||||||
|
return Files.exists(domainDataPath) && Files.exists(similarityDataPath);
|
||||||
|
}
|
||||||
|
|
||||||
|
private Map<Integer, String> idToName = new HashMap<>();
|
||||||
|
|
||||||
|
public String getName(int id) {
|
||||||
|
return idToName.get(id);
|
||||||
|
}
|
||||||
|
|
||||||
|
@SneakyThrows
|
||||||
|
@Override
|
||||||
|
public Graph<Integer, ?> getGraph() {
|
||||||
|
Graph<Integer, ?> graph = new DefaultUndirectedWeightedGraph<>(DefaultWeightedEdge.class);
|
||||||
|
idToName = new HashMap<>();
|
||||||
|
|
||||||
|
try (var stream = Files
|
||||||
|
.lines(domainDataPath)) {
|
||||||
|
|
||||||
|
stream.skip(1)
|
||||||
|
.mapMultiToInt((line, c) -> {
|
||||||
|
String[] parts = StringUtils.split(line, '\t');
|
||||||
|
int id = Integer.parseInt(parts[0]);
|
||||||
|
String name = parts[1];
|
||||||
|
int node_affinity = Integer.parseInt(parts[3]);
|
||||||
|
if (node_affinity > 0) {
|
||||||
|
c.accept(id);
|
||||||
|
idToName.put(id, name);
|
||||||
|
}
|
||||||
|
})
|
||||||
|
.forEach(graph::addVertex);
|
||||||
|
}
|
||||||
|
|
||||||
|
try (var stream = Files
|
||||||
|
.lines(similarityDataPath)) {
|
||||||
|
|
||||||
|
stream.skip(1)
|
||||||
|
.forEach(line -> {
|
||||||
|
String[] parts = StringUtils.split(line, '\t');
|
||||||
|
int src = Integer.parseInt(parts[0]);
|
||||||
|
int dest = Integer.parseInt(parts[1]);
|
||||||
|
double weight = Double.parseDouble(parts[2]);
|
||||||
|
if (graph.containsVertex(src) && graph.containsVertex(dest)) {
|
||||||
|
graph.addEdge(src, dest);
|
||||||
|
graph.setEdgeWeight(src, dest, weight);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
return graph;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
@ -15,15 +15,12 @@ import java.sql.SQLException;
|
|||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
public class ControlDomainRankingSetsService {
|
public class ControlDomainRankingSetsService {
|
||||||
private final HikariDataSource dataSource;
|
|
||||||
private final ControlRendererFactory rendererFactory;
|
private final ControlRendererFactory rendererFactory;
|
||||||
private final DomainRankingSetsService domainRankingSetsService;
|
private final DomainRankingSetsService domainRankingSetsService;
|
||||||
|
|
||||||
@Inject
|
@Inject
|
||||||
public ControlDomainRankingSetsService(HikariDataSource dataSource,
|
public ControlDomainRankingSetsService(ControlRendererFactory rendererFactory,
|
||||||
ControlRendererFactory rendererFactory,
|
|
||||||
DomainRankingSetsService domainRankingSetsService) {
|
DomainRankingSetsService domainRankingSetsService) {
|
||||||
this.dataSource = dataSource;
|
|
||||||
this.rendererFactory = rendererFactory;
|
this.rendererFactory = rendererFactory;
|
||||||
this.domainRankingSetsService = domainRankingSetsService;
|
this.domainRankingSetsService = domainRankingSetsService;
|
||||||
}
|
}
|
||||||
@ -47,7 +44,6 @@ public class ControlDomainRankingSetsService {
|
|||||||
domainRankingSetsService.upsert(new DomainRankingSetsService.DomainRankingSet(
|
domainRankingSetsService.upsert(new DomainRankingSetsService.DomainRankingSet(
|
||||||
id,
|
id,
|
||||||
request.queryParams("description"),
|
request.queryParams("description"),
|
||||||
DomainRankingSetsService.DomainSetAlgorithm.valueOf(request.queryParams("algorithm")),
|
|
||||||
Integer.parseInt(request.queryParams("depth")),
|
Integer.parseInt(request.queryParams("depth")),
|
||||||
request.queryParams("definition")
|
request.queryParams("definition")
|
||||||
));
|
));
|
||||||
@ -77,7 +73,6 @@ public class ControlDomainRankingSetsService {
|
|||||||
domainRankingSetsService.upsert(new DomainRankingSetsService.DomainRankingSet(
|
domainRankingSetsService.upsert(new DomainRankingSetsService.DomainRankingSet(
|
||||||
request.queryParams("name").toUpperCase(),
|
request.queryParams("name").toUpperCase(),
|
||||||
request.queryParams("description"),
|
request.queryParams("description"),
|
||||||
DomainRankingSetsService.DomainSetAlgorithm.valueOf(request.queryParams("algorithm")),
|
|
||||||
Integer.parseInt(request.queryParams("depth")),
|
Integer.parseInt(request.queryParams("depth")),
|
||||||
request.queryParams("definition")
|
request.queryParams("definition")
|
||||||
));
|
));
|
||||||
@ -95,17 +90,6 @@ public class ControlDomainRankingSetsService {
|
|||||||
}
|
}
|
||||||
private Object rankingSetModel(Request request, Response response) throws SQLException {
|
private Object rankingSetModel(Request request, Response response) throws SQLException {
|
||||||
var model = domainRankingSetsService.get(request.params("id")).orElseThrow();
|
var model = domainRankingSetsService.get(request.params("id")).orElseThrow();
|
||||||
return Map.of("rankingSet", model,
|
return Map.of("rankingSet", model);
|
||||||
"selectedAlgo", Map.of(
|
|
||||||
"special", model.algorithm() == DomainRankingSetsService.DomainSetAlgorithm.SPECIAL,
|
|
||||||
"adjacency_cheirank", model.algorithm() == DomainRankingSetsService.DomainSetAlgorithm.ADJACENCY_CHEIRANK,
|
|
||||||
"adjacency_pagerank", model.algorithm() == DomainRankingSetsService.DomainSetAlgorithm.ADJACENCY_PAGERANK,
|
|
||||||
"links_cheirank", model.algorithm() == DomainRankingSetsService.DomainSetAlgorithm.LINKS_CHEIRANK,
|
|
||||||
"links_pagerank", model.algorithm() == DomainRankingSetsService.DomainSetAlgorithm.LINKS_PAGERANK)
|
|
||||||
);
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -16,14 +16,12 @@
|
|||||||
<tr>
|
<tr>
|
||||||
<th>Name</th>
|
<th>Name</th>
|
||||||
<th>Description</th>
|
<th>Description</th>
|
||||||
<th>Algorithm</th>
|
|
||||||
<th>Depth</th>
|
<th>Depth</th>
|
||||||
</tr>
|
</tr>
|
||||||
{{#each rankingSets}}
|
{{#each rankingSets}}
|
||||||
<tr>
|
<tr>
|
||||||
<td><a href="/domain-ranking-sets/{{name}}">{{name}}</a></td>
|
<td><a href="/domain-ranking-sets/{{name}}">{{name}}</a></td>
|
||||||
<td>{{description}}</td>
|
<td>{{description}}</td>
|
||||||
<td>{{algorithm}}</td>
|
|
||||||
<td>{{depth}}</td>
|
<td>{{depth}}</td>
|
||||||
</tr>
|
</tr>
|
||||||
{{/each}}
|
{{/each}}
|
||||||
|
@ -21,23 +21,6 @@
|
|||||||
</div>
|
</div>
|
||||||
</td>
|
</td>
|
||||||
</tr>
|
</tr>
|
||||||
<tr>
|
|
||||||
<th><label for="algorithm">Algorithm</label></th>
|
|
||||||
<td>
|
|
||||||
<select id="algorithm" name="algorithm">
|
|
||||||
<option value="LINKS_PAGERANK">LINKS_PAGERANK</option>
|
|
||||||
<option value="LINKS_CHEIRANK">LINKS_CHEIRANK</option>
|
|
||||||
<option value="ADJACENCY_PAGERANK">ADJACENCY_PAGERANK</option>
|
|
||||||
<option value="ADJACENCY_CHEIRANK">ADJACENCY_CHEIRANK</option>
|
|
||||||
</select>
|
|
||||||
<div>
|
|
||||||
<small class="text-muted">
|
|
||||||
The algorithm used to rank the domains. The LINKS algorithms use the link graph, and the ADJACENCY
|
|
||||||
algorithms use the adjacency graph. CheiRank is a variant of PageRank that uses the reversed graph.
|
|
||||||
</small>
|
|
||||||
</div>
|
|
||||||
</td>
|
|
||||||
</tr>
|
|
||||||
<tr>
|
<tr>
|
||||||
<th><label for="description">Description</label></th>
|
<th><label for="description">Description</label></th>
|
||||||
<td>
|
<td>
|
||||||
@ -61,8 +44,12 @@
|
|||||||
<textarea name="definition" id="definition" rows="10" style="width: 100%">{{definition}}</textarea>
|
<textarea name="definition" id="definition" rows="10" style="width: 100%">{{definition}}</textarea>
|
||||||
<div>
|
<div>
|
||||||
<small class="text-muted">A list of domain names, one per line, possibly globbed with SQL-style '%' wildcards.
|
<small class="text-muted">A list of domain names, one per line, possibly globbed with SQL-style '%' wildcards.
|
||||||
These are used as the origin point for the Personalized PageRank algorithm, and will be considered
|
If provided, these are used as the origin point for the Personalized PageRank algorithm, and will be considered
|
||||||
the central points of the link or adjacency graph. If no domains are specified, the entire domain space is used, as per the PageRank paper.
|
the central points of the link or adjacency graph. If no domains are specified, the entire domain space is used,
|
||||||
|
as per the PageRank paper.
|
||||||
|
<br><br>
|
||||||
|
If similarity data is available and domains are specified, the similarity data is used as basis for the ranking
|
||||||
|
calculation instead, providing a much more coherent ranking.
|
||||||
</small>
|
</small>
|
||||||
</div>
|
</div>
|
||||||
</td></tr>
|
</td></tr>
|
||||||
|
@ -22,27 +22,6 @@
|
|||||||
</div>
|
</div>
|
||||||
</td>
|
</td>
|
||||||
</tr>
|
</tr>
|
||||||
<tr>
|
|
||||||
<th><label for="algorithm">Algorithm</label></th>
|
|
||||||
<td>
|
|
||||||
{{#if special}}<input type="hidden" name="algorithm" value="{{algorithm}}" />{{/if}}
|
|
||||||
<select id="algorithm" name="algorithm" {{#if special}}disabled{{/if}}>
|
|
||||||
{{#with algorithm}}
|
|
||||||
<option value="SPECIAL" disabled {{#if selectedAlgo.special}}selected{{/if}}>SPECIAL</option>
|
|
||||||
<option value="LINKS_PAGERANK" {{#if selectedAlgo.links_pagerank}}selected{{/if}}>LINKS_PAGERANK</option>
|
|
||||||
<option value="LINKS_CHEIRANK" {{#if selectedAlgo.links_cheirank}}selected{{/if}}>LINKS_CHEIRANK</option>
|
|
||||||
<option value="ADJACENCY_PAGERANK" {{#if selectedAlgo.adjacency_pagerank}}selected{{/if}}>ADJACENCY_PAGERANK</option>
|
|
||||||
<option value="ADJACENCY_CHEIRANK" {{#if selectedAlgo.adjacency_cheirank}}selected{{/if}}>ADJACENCY_CHEIRANK</option>
|
|
||||||
{{/with}}
|
|
||||||
</select>
|
|
||||||
<div>
|
|
||||||
<small class="text-muted">
|
|
||||||
The algorithm used to rank the domains. The LINKS algorithms use the link graph, and the ADJACENCY
|
|
||||||
algorithms use the adjacency graph. CheiRank is a variant of PageRank that uses the reversed graph.
|
|
||||||
</small>
|
|
||||||
</div>
|
|
||||||
</td>
|
|
||||||
</tr>
|
|
||||||
<tr>
|
<tr>
|
||||||
<th><label for="description">Description</label></th>
|
<th><label for="description">Description</label></th>
|
||||||
<td>
|
<td>
|
||||||
@ -67,8 +46,12 @@
|
|||||||
<textarea name="definition" id="definition" rows="10" style="width: 100%">{{definition}}</textarea>
|
<textarea name="definition" id="definition" rows="10" style="width: 100%">{{definition}}</textarea>
|
||||||
<div>
|
<div>
|
||||||
<small class="text-muted">A list of domain names, one per line, possibly globbed with SQL-style '%' wildcards.
|
<small class="text-muted">A list of domain names, one per line, possibly globbed with SQL-style '%' wildcards.
|
||||||
These are used as the origin point for the Personalized PageRank algorithm, and will be considered
|
If provided, these are used as the origin point for the Personalized PageRank algorithm, and will be considered
|
||||||
the central points of the link or adjacency graph. If no domains are specified, the entire domain space is used, as per the PageRank paper.
|
the central points of the link or adjacency graph. If no domains are specified, the entire domain space is used,
|
||||||
|
as per the PageRank paper.
|
||||||
|
<br><br>
|
||||||
|
If similarity data is available and domains are specified, the similarity data is used as basis for the ranking
|
||||||
|
calculation instead, providing a much more coherent ranking.
|
||||||
</small>
|
</small>
|
||||||
</div>
|
</div>
|
||||||
</td></tr>
|
</td></tr>
|
||||||
|
@ -8,25 +8,23 @@ import nu.marginalia.db.DomainRankingSetsService;
|
|||||||
import nu.marginalia.db.DomainTypes;
|
import nu.marginalia.db.DomainTypes;
|
||||||
import nu.marginalia.index.IndexServicesFactory;
|
import nu.marginalia.index.IndexServicesFactory;
|
||||||
import nu.marginalia.index.searchset.SearchSet;
|
import nu.marginalia.index.searchset.SearchSet;
|
||||||
import nu.marginalia.ranking.RankingAlgorithm;
|
import nu.marginalia.ranking.*;
|
||||||
import nu.marginalia.ranking.ReversePageRank;
|
|
||||||
import nu.marginalia.ranking.StandardPageRank;
|
|
||||||
import nu.marginalia.ranking.accumulator.RankingResultHashMapAccumulator;
|
import nu.marginalia.ranking.accumulator.RankingResultHashMapAccumulator;
|
||||||
import nu.marginalia.ranking.accumulator.RankingResultHashSetAccumulator;
|
import nu.marginalia.ranking.accumulator.RankingResultHashSetAccumulator;
|
||||||
import nu.marginalia.ranking.data.RankingDomainFetcher;
|
|
||||||
import nu.marginalia.ranking.data.RankingDomainFetcherForSimilarityData;
|
|
||||||
import nu.marginalia.index.svc.searchset.RankingSearchSet;
|
import nu.marginalia.index.svc.searchset.RankingSearchSet;
|
||||||
import nu.marginalia.index.svc.searchset.SearchSetAny;
|
import nu.marginalia.index.svc.searchset.SearchSetAny;
|
||||||
import nu.marginalia.ranking.DomainRankings;
|
|
||||||
import nu.marginalia.index.db.DbUpdateRanks;
|
import nu.marginalia.index.db.DbUpdateRanks;
|
||||||
|
import nu.marginalia.ranking.data.GraphSource;
|
||||||
|
import nu.marginalia.ranking.data.LinkGraphSource;
|
||||||
|
import nu.marginalia.ranking.data.SimilarityGraphSource;
|
||||||
import nu.marginalia.service.control.ServiceEventLog;
|
import nu.marginalia.service.control.ServiceEventLog;
|
||||||
import nu.marginalia.service.control.ServiceHeartbeat;
|
|
||||||
import nu.marginalia.service.module.ServiceConfiguration;
|
import nu.marginalia.service.module.ServiceConfiguration;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.sql.SQLException;
|
import java.sql.SQLException;
|
||||||
|
import java.util.List;
|
||||||
import java.util.Objects;
|
import java.util.Objects;
|
||||||
import java.util.concurrent.ConcurrentHashMap;
|
import java.util.concurrent.ConcurrentHashMap;
|
||||||
|
|
||||||
@ -34,13 +32,12 @@ import java.util.concurrent.ConcurrentHashMap;
|
|||||||
public class IndexSearchSetsService {
|
public class IndexSearchSetsService {
|
||||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||||
private final DomainTypes domainTypes;
|
private final DomainTypes domainTypes;
|
||||||
private final ServiceHeartbeat heartbeat;
|
|
||||||
private final IndexServicesFactory indexServicesFactory;
|
private final IndexServicesFactory indexServicesFactory;
|
||||||
private final ServiceEventLog eventLog;
|
private final ServiceEventLog eventLog;
|
||||||
private final DomainRankingSetsService domainRankingSetsService;
|
private final DomainRankingSetsService domainRankingSetsService;
|
||||||
private final DbUpdateRanks dbUpdateRanks;
|
private final DbUpdateRanks dbUpdateRanks;
|
||||||
private final RankingDomainFetcher similarityDomains;
|
private final GraphSource similarityDomains;
|
||||||
private final RankingDomainFetcher linksDomains;
|
private final GraphSource linksDomains;
|
||||||
|
|
||||||
private final ConcurrentHashMap<String, SearchSet> rankingSets = new ConcurrentHashMap<>();
|
private final ConcurrentHashMap<String, SearchSet> rankingSets = new ConcurrentHashMap<>();
|
||||||
// Below are binary indices that are used to constrain a search
|
// Below are binary indices that are used to constrain a search
|
||||||
@ -55,23 +52,21 @@ public class IndexSearchSetsService {
|
|||||||
@Inject
|
@Inject
|
||||||
public IndexSearchSetsService(DomainTypes domainTypes,
|
public IndexSearchSetsService(DomainTypes domainTypes,
|
||||||
ServiceConfiguration serviceConfiguration,
|
ServiceConfiguration serviceConfiguration,
|
||||||
ServiceHeartbeat heartbeat,
|
LinkGraphSource rankingDomains,
|
||||||
RankingDomainFetcher rankingDomains,
|
SimilarityGraphSource similarityDomains,
|
||||||
RankingDomainFetcherForSimilarityData similarityDomains,
|
|
||||||
IndexServicesFactory indexServicesFactory,
|
IndexServicesFactory indexServicesFactory,
|
||||||
ServiceEventLog eventLog,
|
ServiceEventLog eventLog,
|
||||||
DomainRankingSetsService domainRankingSetsService,
|
DomainRankingSetsService domainRankingSetsService,
|
||||||
DbUpdateRanks dbUpdateRanks) throws IOException {
|
DbUpdateRanks dbUpdateRanks) throws IOException {
|
||||||
this.nodeId = serviceConfiguration.node();
|
this.nodeId = serviceConfiguration.node();
|
||||||
this.domainTypes = domainTypes;
|
this.domainTypes = domainTypes;
|
||||||
this.heartbeat = heartbeat;
|
|
||||||
this.indexServicesFactory = indexServicesFactory;
|
this.indexServicesFactory = indexServicesFactory;
|
||||||
this.eventLog = eventLog;
|
this.eventLog = eventLog;
|
||||||
this.domainRankingSetsService = domainRankingSetsService;
|
this.domainRankingSetsService = domainRankingSetsService;
|
||||||
|
|
||||||
this.dbUpdateRanks = dbUpdateRanks;
|
this.dbUpdateRanks = dbUpdateRanks;
|
||||||
|
|
||||||
if (similarityDomains.hasData()) {
|
if (similarityDomains.isAvailable()) {
|
||||||
this.similarityDomains = similarityDomains;
|
this.similarityDomains = similarityDomains;
|
||||||
this.linksDomains = rankingDomains;
|
this.linksDomains = rankingDomains;
|
||||||
}
|
}
|
||||||
@ -126,13 +121,13 @@ public class IndexSearchSetsService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
if (DomainRankingSetsService.DomainSetAlgorithm.SPECIAL.equals(rankingSet.algorithm())) {
|
if (rankingSet.isSpecial()) {
|
||||||
switch (rankingSet.name()) {
|
switch (rankingSet.name()) {
|
||||||
case "BLOGS" -> recalculateBlogsSet(rankingSet);
|
case "BLOGS" -> recalculateBlogsSet(rankingSet);
|
||||||
case "NONE" -> {} // No-op
|
case "NONE" -> {} // No-op
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
recalculateNornal(rankingSet);
|
recalculateNormal(rankingSet);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
catch (Exception ex) {
|
catch (Exception ex) {
|
||||||
@ -142,18 +137,18 @@ public class IndexSearchSetsService {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private void recalculateNornal(DomainRankingSetsService.DomainRankingSet rankingSet) {
|
private void recalculateNormal(DomainRankingSetsService.DomainRankingSet rankingSet) {
|
||||||
String[] domains = rankingSet.domains();
|
List<String> domains = List.of(rankingSet.domains());
|
||||||
|
|
||||||
RankingAlgorithm rankingAlgorithm = switch (rankingSet.algorithm()) {
|
GraphSource source;
|
||||||
case LINKS_PAGERANK -> new StandardPageRank(linksDomains, domains);
|
|
||||||
case LINKS_CHEIRANK -> new ReversePageRank(linksDomains, domains);
|
|
||||||
case ADJACENCY_PAGERANK -> new StandardPageRank(similarityDomains, domains);
|
|
||||||
case ADJACENCY_CHEIRANK -> new ReversePageRank(similarityDomains, domains);
|
|
||||||
default -> throw new IllegalStateException("Unexpected value: " + rankingSet.algorithm());
|
|
||||||
};
|
|
||||||
|
|
||||||
var data = rankingAlgorithm.pageRankWithPeripheralNodes(rankingSet.depth(), RankingResultHashSetAccumulator::new);
|
// Similarity ranking does not behave well with an empty set of domains
|
||||||
|
if (domains.isEmpty()) source = linksDomains;
|
||||||
|
else source = similarityDomains;
|
||||||
|
|
||||||
|
var data = PageRankDomainRanker
|
||||||
|
.forDomainNames(source, domains)
|
||||||
|
.calculate(rankingSet.depth(), RankingResultHashSetAccumulator::new);
|
||||||
|
|
||||||
var set = new RankingSearchSet(rankingSet.name(), rankingSet.fileName(indexServicesFactory.getSearchSetsBase()), data);
|
var set = new RankingSearchSet(rankingSet.name(), rankingSet.fileName(indexServicesFactory.getSearchSetsBase()), data);
|
||||||
rankingSets.put(rankingSet.name(), set);
|
rankingSets.put(rankingSet.name(), set);
|
||||||
@ -185,9 +180,21 @@ public class IndexSearchSetsService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
private void updateDomainRankings(DomainRankingSetsService.DomainRankingSet rankingSet) {
|
private void updateDomainRankings(DomainRankingSetsService.DomainRankingSet rankingSet) {
|
||||||
|
List<String> domains = List.of(rankingSet.domains());
|
||||||
|
|
||||||
var spr = new StandardPageRank(similarityDomains, rankingSet.domains());
|
final GraphSource source;
|
||||||
var ranks = spr.pageRankWithPeripheralNodes(rankingSet.depth(), () -> new RankingResultHashMapAccumulator(rankingSet.depth()));
|
|
||||||
|
if (domains.isEmpty()) {
|
||||||
|
// Similarity ranking does not behave well with an empty set of domains
|
||||||
|
source = linksDomains;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
source = similarityDomains;
|
||||||
|
}
|
||||||
|
|
||||||
|
var ranks = PageRankDomainRanker
|
||||||
|
.forDomainNames(source, domains)
|
||||||
|
.calculate(rankingSet.depth(), () -> new RankingResultHashMapAccumulator(rankingSet.depth()));
|
||||||
|
|
||||||
synchronized (this) {
|
synchronized (this) {
|
||||||
domainRankings = new DomainRankings(ranks);
|
domainRankings = new DomainRankings(ranks);
|
||||||
|
Loading…
Reference in New Issue
Block a user