Modified the ranking algorithm so that PageRank can be computed from domain similarity data instead of the link graph.

This commit is contained in:
Viktor Lofgren 2023-02-07 22:13:25 +01:00
parent 04f905f3a1
commit e963ecb4ae
11 changed files with 240 additions and 20 deletions

View File

@ -5,6 +5,8 @@ import gnu.trove.map.hash.TIntIntHashMap;
import gnu.trove.map.hash.TIntObjectHashMap;
import it.unimi.dsi.fastutil.ints.IntArrays;
import nu.marginalia.wmsa.edge.index.ranking.accumulator.RankingResultAccumulator;
import nu.marginalia.wmsa.edge.index.ranking.data.RankingDomainData;
import nu.marginalia.wmsa.edge.index.ranking.data.RankingDomainFetcher;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -83,6 +85,10 @@ public abstract class RankingAlgorithm {
logger.info("Origin Domains: {}", originDomainIds.size());
}
/**
 * Looks up the domain record stored under the given domain id.
 *
 * @param id the domain id used as key into {@code domainsById}
 * @return whatever {@code domainsById} holds for {@code id}
 *         (NOTE(review): presumably {@code null} when the id is unknown; confirm
 *         against the map type)
 */
public RankingDomainData getDomainData(int id) {
    final RankingDomainData domainData = domainsById.get(id);
    return domainData;
}
public void addPeripheralNodes() {
int newNodesIdxCutoff = domainIdToIndex.size();
@ -200,9 +206,9 @@ public abstract class RankingAlgorithm {
/**
 * Replaces the {@code maxKnownUrls} setting of this ranking algorithm.
 *
 * @param maxKnownUrls the new value to store in the field of the same name
 */
public void setMaxKnownUrls(final int maxKnownUrls) {
    this.maxKnownUrls = maxKnownUrls;
}
public class RankVector {
private final double[] rank;
public RankVector(double defaultValue) {
rank = new double[domainIndexToId.size()];
if (defaultValue != 0.) {
@ -259,7 +265,6 @@ public abstract class RankingAlgorithm {
return accumulator;
}
private static int[] sortOrder(double[] values) {
int[] ret = new int[values.length];
@ -268,6 +273,7 @@ public abstract class RankingAlgorithm {
return ret;
}
}
}

View File

@ -1,6 +1,8 @@
package nu.marginalia.wmsa.edge.index.ranking;
import nu.marginalia.wmsa.edge.index.ranking.data.RankingDomainFetcher;
public class ReversePageRank extends RankingAlgorithm {

View File

@ -1,6 +1,8 @@
package nu.marginalia.wmsa.edge.index.ranking;
import nu.marginalia.wmsa.edge.index.ranking.data.RankingDomainFetcher;
public class StandardPageRank extends RankingAlgorithm {
public StandardPageRank(RankingDomainFetcher domains, String... origins) {

View File

@ -1,4 +1,4 @@
package nu.marginalia.wmsa.edge.index.ranking;
package nu.marginalia.wmsa.edge.index.ranking.data;
import lombok.AllArgsConstructor;
import lombok.Data;
@ -10,7 +10,7 @@ public class RankingDomainData {
public final int id;
public final String name;
private int alias;
private EdgeDomainIndexingState state;
public EdgeDomainIndexingState state;
public final int knownUrls;
public int resolveAlias() {

View File

@ -1,4 +1,4 @@
package nu.marginalia.wmsa.edge.index.ranking;
package nu.marginalia.wmsa.edge.index.ranking.data;
import com.google.inject.Inject;
import com.zaxxer.hikari.HikariDataSource;
@ -12,11 +12,11 @@ import java.util.function.Consumer;
import java.util.function.IntConsumer;
public class RankingDomainFetcher {
private final HikariDataSource dataSource;
private final EdgeDomainBlacklistImpl blacklist;
private final Logger logger = LoggerFactory.getLogger(getClass());
protected final HikariDataSource dataSource;
protected final EdgeDomainBlacklistImpl blacklist;
protected final Logger logger = LoggerFactory.getLogger(getClass());
private final boolean getNames = false;
protected boolean getNames = false;
@Inject
public RankingDomainFetcher(HikariDataSource dataSource, EdgeDomainBlacklistImpl blacklist) {
@ -24,6 +24,10 @@ public class RankingDomainFetcher {
this.blacklist = blacklist;
}
/**
 * Turns on the {@code getNames} flag of this fetcher.
 * {@code getDomains(Consumer)} branches on that flag when it picks its query.
 */
public void retainNames() {
    getNames = true;
}
public void getDomains(Consumer<RankingDomainData> consumer) {
String query;
if (getNames) {
@ -49,14 +53,19 @@ public class RankingDomainFetcher {
getDomains(query, consumer);
}
private void getDomains(String query, Consumer<RankingDomainData> consumer) {
protected void getDomains(String query, Consumer<RankingDomainData> consumer) {
try (var conn = dataSource.getConnection(); var stmt = conn.prepareStatement(query)) {
stmt.setFetchSize(10000);
var rsp = stmt.executeQuery();
while (rsp.next()) {
int id = rsp.getInt(1);
if (!blacklist.isBlacklisted(id)) {
consumer.accept(new RankingDomainData(id, rsp.getString(2), rsp.getInt(3), EdgeDomainIndexingState.valueOf(rsp.getString(4)), rsp.getInt(5)));
consumer.accept(
new RankingDomainData(id,
rsp.getString(2),
rsp.getInt(3),
EdgeDomainIndexingState.valueOf(rsp.getString(4)),
rsp.getInt(5)));
}
}
}

View File

@ -0,0 +1,63 @@
package nu.marginalia.wmsa.edge.index.ranking.data;
import com.zaxxer.hikari.HikariDataSource;
import nu.marginalia.wmsa.edge.dbcommon.EdgeDomainBlacklistImpl;
import java.sql.SQLException;
import java.util.function.Consumer;
public class RankingDomainFetcherForSimilarityData extends RankingDomainFetcher {
public RankingDomainFetcherForSimilarityData(HikariDataSource dataSource, EdgeDomainBlacklistImpl blacklist) {
super(dataSource, blacklist);
}
public void eachDomainLink(DomainLinkConsumer consumer) {
try (var conn = dataSource.getConnection();
var stmt = conn.prepareStatement("SELECT DOMAIN_ID, NEIGHBOR_ID, RELATEDNESS FROM EC_DOMAIN_NEIGHBORS_2"))
{
stmt.setFetchSize(10000);
var rsp = stmt.executeQuery();
while (rsp.next()) {
int src = rsp.getInt(1);
int dst = rsp.getInt(2);
// these "links" are bidi
consumer.accept(src, dst);
consumer.accept(dst, src);
}
}
catch (SQLException ex) {
logger.error("Failed to fetch domain links", ex);
}
}
public void getDomains(Consumer<RankingDomainData> consumer) {
// String query =
// """
// SELECT EC_DOMAIN.ID,DOMAIN_NAME,DOMAIN_ALIAS,STATE,COALESCE(KNOWN_URLS, 0)
// FROM EC_DOMAIN
// LEFT JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
// INNER JOIN EC_DOMAIN_LINK ON DEST_DOMAIN_ID=EC_DOMAIN.ID
// WHERE SOURCE_DOMAIN_ID!=DEST_DOMAIN_ID
// GROUP BY EC_DOMAIN.ID
// HAVING COUNT(SOURCE_DOMAIN_ID)>5
// """;
String query =
"""
SELECT EC_DOMAIN.ID,DOMAIN_NAME,DOMAIN_ALIAS,STATE,COALESCE(KNOWN_URLS, 0)
FROM EC_DOMAIN
INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
GROUP BY EC_DOMAIN.ID
""";
getDomains(query, consumer);
}
public void getPeripheralDomains(Consumer<RankingDomainData> consumer) {
}
}

View File

@ -0,0 +1,71 @@
package nu.marginalia.wmsa.edge.index.ranking.tool;
import com.zaxxer.hikari.HikariDataSource;
import lombok.SneakyThrows;
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
import nu.marginalia.wmsa.edge.dbcommon.EdgeDomainBlacklistImpl;
import nu.marginalia.wmsa.edge.index.ranking.StandardPageRank;
import nu.marginalia.wmsa.edge.index.ranking.accumulator.RankingResultListAccumulator;
import nu.marginalia.wmsa.edge.index.ranking.data.RankingDomainFetcherForSimilarityData;
import org.mariadb.jdbc.Driver;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.sql.SQLException;
import java.util.concurrent.LinkedBlockingQueue;
/**
 * Command line tool that runs {@link StandardPageRank} over similarity data and stores
 * the resulting domain ids in {@code EC_RANDOM_DOMAINS} under {@code DOMAIN_SET} 3.
 *
 * <p>The main thread produces domain ids into {@link #uploadQueue}; a single
 * "Uploader" thread drains the queue and performs the inserts. The command line
 * arguments are handed to {@link StandardPageRank} as its origin domains.
 */
public class CreateBrowseDomainRanksTool {

    private static final Logger logger = LoggerFactory.getLogger(CreateBrowseDomainRanksTool.class);

    /** Bounded hand-over queue between the ranking (main) thread and the uploader thread. */
    static final LinkedBlockingQueue<Integer> uploadQueue = new LinkedBlockingQueue<>(10);

    /** Cleared by the main thread once nothing more will be queued. */
    volatile static boolean running = true;

    @SneakyThrows
    public static void main(String... args) {
        Driver driver = new Driver();

        // The uploader gets its own data source, separate from the one used for ranking below.
        var conn = new DatabaseModule().provideConnection();

        long start = System.currentTimeMillis();
        var uploader = new Thread(() -> uploadThread(conn), "Uploader");

        logger.info("Ranking");
        var ds = new DatabaseModule().provideConnection();
        var domains = new RankingDomainFetcherForSimilarityData(ds, new EdgeDomainBlacklistImpl(ds));
        var rpr = new StandardPageRank(domains, args);

        uploader.start();

        try {
            var rankData = rpr.pageRankWithPeripheralNodes(1000, RankingResultListAccumulator::new);

            // NOTE(review): put() blocks while the queue is full, so if the uploader
            // has died the producer will stall here.
            rankData.forEach(i -> {
                try {
                    uploadQueue.put(i);
                    return true;
                } catch (InterruptedException e) {
                    // Restore the flag instead of swallowing the interruption.
                    Thread.currentThread().interrupt();
                    logger.error("Interrupted while queueing domain {} for upload", i, e);
                    // presumably a Trove-style procedure where false stops the iteration; TODO confirm
                    return false;
                }
            });
        }
        finally {
            // Always release the uploader, even when ranking fails; otherwise the
            // non-daemon uploader thread would keep the JVM alive forever.
            running = false;
        }

        long end = System.currentTimeMillis();
        uploader.join();

        logger.info("Done in {}", (end - start)/1000.0);
    }

    /**
     * Drains {@link #uploadQueue} and inserts every domain id into
     * {@code EC_RANDOM_DOMAINS}. Returns once {@link #running} has been cleared and
     * the queue is empty, or when a database error or interruption occurs.
     *
     * @param dataSource source of the connection used for the inserts
     */
    public static void uploadThread(HikariDataSource dataSource) {
        try (var conn = dataSource.getConnection();
             var stmt = conn.prepareStatement("INSERT IGNORE INTO EC_RANDOM_DOMAINS(DOMAIN_SET, DOMAIN_ID) VALUES (3, ?)")) {

            while (running || !uploadQueue.isEmpty()) {
                // A timed poll (rather than take()) lets the loop re-check 'running':
                // take() would block forever if the producer finishes while the
                // queue is empty, and main() would then hang in join().
                Integer job = uploadQueue.poll(1, java.util.concurrent.TimeUnit.SECONDS);
                if (job == null) {
                    continue;
                }

                stmt.setInt(1, job);
                stmt.executeUpdate();
            }
        } catch (SQLException ex) {
            logger.error("Failed to upload domain ranks", ex);
        } catch (InterruptedException ex) {
            Thread.currentThread().interrupt();
            logger.warn("Uploader interrupted before the queue was drained", ex);
        }
    }
}

View File

@ -11,8 +11,8 @@ import it.unimi.dsi.fastutil.ints.IntComparator;
import lombok.AllArgsConstructor;
import lombok.SneakyThrows;
import nu.marginalia.wmsa.edge.index.ranking.RankingAlgorithm;
import nu.marginalia.wmsa.edge.index.ranking.RankingDomainData;
import nu.marginalia.wmsa.edge.index.ranking.RankingDomainFetcher;
import nu.marginalia.wmsa.edge.index.ranking.data.RankingDomainData;
import nu.marginalia.wmsa.edge.index.ranking.data.RankingDomainFetcher;
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
import nu.marginalia.wmsa.edge.dbcommon.EdgeDomainBlacklistImpl;
import org.jetbrains.annotations.NotNull;

View File

@ -0,0 +1,67 @@
package nu.marginalia.wmsa.edge.index.ranking.tool;
import lombok.SneakyThrows;
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
import nu.marginalia.wmsa.edge.dbcommon.EdgeDomainBlacklistImpl;
import nu.marginalia.wmsa.edge.index.ranking.StandardPageRank;
import nu.marginalia.wmsa.edge.index.ranking.accumulator.RankingResultListAccumulator;
import nu.marginalia.wmsa.edge.index.ranking.data.RankingDomainFetcher;
import nu.marginalia.wmsa.edge.index.ranking.data.RankingDomainFetcherForSimilarityData;
import org.mariadb.jdbc.Driver;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.atomic.AtomicInteger;
/**
 * Command line tool that ranks all domains with {@link StandardPageRank} and prints
 * one line per ranked domain ({@code position name state}) to standard output.
 *
 * <p>When the system property {@code use-link-data} is {@code true} the plain
 * {@link RankingDomainFetcher} is used; otherwise the ranking is based on
 * {@link RankingDomainFetcherForSimilarityData}. The command line arguments are
 * handed to {@link StandardPageRank} as its origin domains.
 */
public class PrintDomainRanksTool {

    private static final Logger logger = LoggerFactory.getLogger(PrintDomainRanksTool.class);

    private volatile static int rankMax;

    // Not read anywhere in this class; kept so the class's package-visible members stay unchanged.
    static final LinkedBlockingQueue<Integer> uploadQueue = new LinkedBlockingQueue<>(10);
    volatile static boolean running = true;

    @SneakyThrows
    public static void main(String... args) {
        Driver driver = new Driver();

        long start = System.currentTimeMillis();

        logger.info("Ranking");
        // A single data source is enough here: nothing in this tool uploads results.
        var ds = new DatabaseModule().provideConnection();
        var blacklist = new EdgeDomainBlacklistImpl(ds);

        RankingDomainFetcher domains;
        if (Boolean.getBoolean("use-link-data")) {
            domains = new RankingDomainFetcher(ds, blacklist);
        }
        else {
            domains = new RankingDomainFetcherForSimilarityData(ds, blacklist);
        }
        // Domain names are printed below, so the fetcher has to keep them.
        domains.retainNames();

        var rpr = new StandardPageRank(domains, args);

        rankMax = rpr.size();

        var rankData = rpr.pageRankWithPeripheralNodes(rankMax, RankingResultListAccumulator::new);

        AtomicInteger cnt = new AtomicInteger();
        rankData.forEach(i -> {
            var data = rpr.getDomainData(i);

            System.out.printf("%d %s %s\n", cnt.getAndIncrement(), data.name, data.state);
            return true;
        });

        long end = System.currentTimeMillis();
        running = false;

        logger.info("Done in {}", (end - start)/1000.0);
    }
}

View File

@ -2,11 +2,11 @@ package nu.marginalia.wmsa.edge.index.ranking.tool;
import com.zaxxer.hikari.HikariDataSource;
import lombok.SneakyThrows;
import nu.marginalia.wmsa.edge.index.ranking.ReversePageRank;
import nu.marginalia.wmsa.edge.index.ranking.RankingDomainFetcher;
import nu.marginalia.wmsa.edge.index.ranking.StandardPageRank;
import nu.marginalia.wmsa.edge.index.ranking.accumulator.RankingResultListAccumulator;
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
import nu.marginalia.wmsa.edge.dbcommon.EdgeDomainBlacklistImpl;
import nu.marginalia.wmsa.edge.index.ranking.data.RankingDomainFetcherForSimilarityData;
import org.mariadb.jdbc.Driver;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -14,9 +14,9 @@ import org.slf4j.LoggerFactory;
import java.sql.SQLException;
import java.util.concurrent.LinkedBlockingQueue;
public class UpdateDomainRanksTool2 {
public class UpdateDomainRanksTool {
private static final Logger logger = LoggerFactory.getLogger(UpdateDomainRanksTool2.class);
private static final Logger logger = LoggerFactory.getLogger(UpdateDomainRanksTool.class);
private volatile static int rankMax;
@ -33,8 +33,8 @@ public class UpdateDomainRanksTool2 {
logger.info("Ranking");
var ds = new DatabaseModule().provideConnection();
var domains = new RankingDomainFetcher(ds, new EdgeDomainBlacklistImpl(ds));
var rpr = new ReversePageRank(domains, "memex.marginalia.nu", "bikobatanari.art", "sadgrl.online", "wiki.xxiivv.com", "%neocities.org");
var domains = new RankingDomainFetcherForSimilarityData(ds, new EdgeDomainBlacklistImpl(ds));
var rpr = new StandardPageRank(domains, "memex.marginalia.nu", "bikobatanari.art", "sadgrl.online", "wiki.xxiivv.com");
rankMax = rpr.size();
uploader.start();

View File

@ -5,7 +5,7 @@ import com.google.inject.Singleton;
import lombok.SneakyThrows;
import nu.marginalia.wmsa.edge.index.ranking.ReversePageRank;
import nu.marginalia.wmsa.edge.index.ranking.StandardPageRank;
import nu.marginalia.wmsa.edge.index.ranking.RankingDomainFetcher;
import nu.marginalia.wmsa.edge.index.ranking.data.RankingDomainFetcher;
import nu.marginalia.wmsa.edge.index.ranking.accumulator.RankingResultBitSetAccumulator;
import nu.marginalia.wmsa.edge.index.ranking.accumulator.RankingResultHashMapAccumulator;
import nu.marginalia.wmsa.edge.index.IndexServicesFactory;