Modified the ranking algorithm so that it can compute PageRank over similarity data instead of the link graph.
This commit is contained in:
parent
04f905f3a1
commit
e963ecb4ae
@ -5,6 +5,8 @@ import gnu.trove.map.hash.TIntIntHashMap;
|
||||
import gnu.trove.map.hash.TIntObjectHashMap;
|
||||
import it.unimi.dsi.fastutil.ints.IntArrays;
|
||||
import nu.marginalia.wmsa.edge.index.ranking.accumulator.RankingResultAccumulator;
|
||||
import nu.marginalia.wmsa.edge.index.ranking.data.RankingDomainData;
|
||||
import nu.marginalia.wmsa.edge.index.ranking.data.RankingDomainFetcher;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
@ -83,6 +85,10 @@ public abstract class RankingAlgorithm {
|
||||
logger.info("Origin Domains: {}", originDomainIds.size());
|
||||
}
|
||||
|
||||
public RankingDomainData getDomainData(int id) {
|
||||
return domainsById.get(id);
|
||||
}
|
||||
|
||||
public void addPeripheralNodes() {
|
||||
|
||||
int newNodesIdxCutoff = domainIdToIndex.size();
|
||||
@ -200,9 +206,9 @@ public abstract class RankingAlgorithm {
|
||||
public void setMaxKnownUrls(int maxKnownUrls) {
|
||||
this.maxKnownUrls = maxKnownUrls;
|
||||
}
|
||||
|
||||
public class RankVector {
|
||||
private final double[] rank;
|
||||
|
||||
public RankVector(double defaultValue) {
|
||||
rank = new double[domainIndexToId.size()];
|
||||
if (defaultValue != 0.) {
|
||||
@ -259,7 +265,6 @@ public abstract class RankingAlgorithm {
|
||||
|
||||
return accumulator;
|
||||
}
|
||||
|
||||
private static int[] sortOrder(double[] values) {
|
||||
|
||||
int[] ret = new int[values.length];
|
||||
@ -268,6 +273,7 @@ public abstract class RankingAlgorithm {
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -1,6 +1,8 @@
|
||||
package nu.marginalia.wmsa.edge.index.ranking;
|
||||
|
||||
|
||||
import nu.marginalia.wmsa.edge.index.ranking.data.RankingDomainFetcher;
|
||||
|
||||
public class ReversePageRank extends RankingAlgorithm {
|
||||
|
||||
|
||||
|
@ -1,6 +1,8 @@
|
||||
package nu.marginalia.wmsa.edge.index.ranking;
|
||||
|
||||
|
||||
import nu.marginalia.wmsa.edge.index.ranking.data.RankingDomainFetcher;
|
||||
|
||||
public class StandardPageRank extends RankingAlgorithm {
|
||||
|
||||
public StandardPageRank(RankingDomainFetcher domains, String... origins) {
|
||||
|
@ -1,4 +1,4 @@
|
||||
package nu.marginalia.wmsa.edge.index.ranking;
|
||||
package nu.marginalia.wmsa.edge.index.ranking.data;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
@ -10,7 +10,7 @@ public class RankingDomainData {
|
||||
public final int id;
|
||||
public final String name;
|
||||
private int alias;
|
||||
private EdgeDomainIndexingState state;
|
||||
public EdgeDomainIndexingState state;
|
||||
public final int knownUrls;
|
||||
|
||||
public int resolveAlias() {
|
@ -1,4 +1,4 @@
|
||||
package nu.marginalia.wmsa.edge.index.ranking;
|
||||
package nu.marginalia.wmsa.edge.index.ranking.data;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
@ -12,11 +12,11 @@ import java.util.function.Consumer;
|
||||
import java.util.function.IntConsumer;
|
||||
|
||||
public class RankingDomainFetcher {
|
||||
private final HikariDataSource dataSource;
|
||||
private final EdgeDomainBlacklistImpl blacklist;
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
protected final HikariDataSource dataSource;
|
||||
protected final EdgeDomainBlacklistImpl blacklist;
|
||||
protected final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
|
||||
private final boolean getNames = false;
|
||||
protected boolean getNames = false;
|
||||
|
||||
@Inject
|
||||
public RankingDomainFetcher(HikariDataSource dataSource, EdgeDomainBlacklistImpl blacklist) {
|
||||
@ -24,6 +24,10 @@ public class RankingDomainFetcher {
|
||||
this.blacklist = blacklist;
|
||||
}
|
||||
|
||||
public void retainNames() {
|
||||
this.getNames = true;
|
||||
}
|
||||
|
||||
public void getDomains(Consumer<RankingDomainData> consumer) {
|
||||
String query;
|
||||
if (getNames) {
|
||||
@ -49,14 +53,19 @@ public class RankingDomainFetcher {
|
||||
getDomains(query, consumer);
|
||||
}
|
||||
|
||||
private void getDomains(String query, Consumer<RankingDomainData> consumer) {
|
||||
protected void getDomains(String query, Consumer<RankingDomainData> consumer) {
|
||||
try (var conn = dataSource.getConnection(); var stmt = conn.prepareStatement(query)) {
|
||||
stmt.setFetchSize(10000);
|
||||
var rsp = stmt.executeQuery();
|
||||
while (rsp.next()) {
|
||||
int id = rsp.getInt(1);
|
||||
if (!blacklist.isBlacklisted(id)) {
|
||||
consumer.accept(new RankingDomainData(id, rsp.getString(2), rsp.getInt(3), EdgeDomainIndexingState.valueOf(rsp.getString(4)), rsp.getInt(5)));
|
||||
consumer.accept(
|
||||
new RankingDomainData(id,
|
||||
rsp.getString(2),
|
||||
rsp.getInt(3),
|
||||
EdgeDomainIndexingState.valueOf(rsp.getString(4)),
|
||||
rsp.getInt(5)));
|
||||
}
|
||||
}
|
||||
}
|
@ -0,0 +1,63 @@
|
||||
package nu.marginalia.wmsa.edge.index.ranking.data;
|
||||
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import nu.marginalia.wmsa.edge.dbcommon.EdgeDomainBlacklistImpl;
|
||||
|
||||
import java.sql.SQLException;
|
||||
import java.util.function.Consumer;
|
||||
|
||||
public class RankingDomainFetcherForSimilarityData extends RankingDomainFetcher {
|
||||
public RankingDomainFetcherForSimilarityData(HikariDataSource dataSource, EdgeDomainBlacklistImpl blacklist) {
|
||||
super(dataSource, blacklist);
|
||||
}
|
||||
|
||||
public void eachDomainLink(DomainLinkConsumer consumer) {
|
||||
try (var conn = dataSource.getConnection();
|
||||
var stmt = conn.prepareStatement("SELECT DOMAIN_ID, NEIGHBOR_ID, RELATEDNESS FROM EC_DOMAIN_NEIGHBORS_2"))
|
||||
{
|
||||
stmt.setFetchSize(10000);
|
||||
|
||||
var rsp = stmt.executeQuery();
|
||||
|
||||
while (rsp.next()) {
|
||||
int src = rsp.getInt(1);
|
||||
int dst = rsp.getInt(2);
|
||||
|
||||
// these "links" are bidi
|
||||
consumer.accept(src, dst);
|
||||
consumer.accept(dst, src);
|
||||
}
|
||||
}
|
||||
catch (SQLException ex) {
|
||||
logger.error("Failed to fetch domain links", ex);
|
||||
}
|
||||
}
|
||||
|
||||
public void getDomains(Consumer<RankingDomainData> consumer) {
|
||||
// String query =
|
||||
// """
|
||||
// SELECT EC_DOMAIN.ID,DOMAIN_NAME,DOMAIN_ALIAS,STATE,COALESCE(KNOWN_URLS, 0)
|
||||
// FROM EC_DOMAIN
|
||||
// LEFT JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
|
||||
// INNER JOIN EC_DOMAIN_LINK ON DEST_DOMAIN_ID=EC_DOMAIN.ID
|
||||
// WHERE SOURCE_DOMAIN_ID!=DEST_DOMAIN_ID
|
||||
// GROUP BY EC_DOMAIN.ID
|
||||
// HAVING COUNT(SOURCE_DOMAIN_ID)>5
|
||||
// """;
|
||||
|
||||
String query =
|
||||
"""
|
||||
SELECT EC_DOMAIN.ID,DOMAIN_NAME,DOMAIN_ALIAS,STATE,COALESCE(KNOWN_URLS, 0)
|
||||
FROM EC_DOMAIN
|
||||
INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
|
||||
GROUP BY EC_DOMAIN.ID
|
||||
""";
|
||||
|
||||
getDomains(query, consumer);
|
||||
}
|
||||
|
||||
|
||||
public void getPeripheralDomains(Consumer<RankingDomainData> consumer) {
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,71 @@
|
||||
package nu.marginalia.wmsa.edge.index.ranking.tool;
|
||||
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
|
||||
import nu.marginalia.wmsa.edge.dbcommon.EdgeDomainBlacklistImpl;
|
||||
import nu.marginalia.wmsa.edge.index.ranking.StandardPageRank;
|
||||
import nu.marginalia.wmsa.edge.index.ranking.accumulator.RankingResultListAccumulator;
|
||||
import nu.marginalia.wmsa.edge.index.ranking.data.RankingDomainFetcherForSimilarityData;
|
||||
import org.mariadb.jdbc.Driver;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.sql.SQLException;
|
||||
import java.util.concurrent.LinkedBlockingQueue;
|
||||
|
||||
/**
 * Command-line tool that ranks domains with {@link StandardPageRank} over
 * similarity data and inserts the resulting domain ids into
 * {@code EC_RANDOM_DOMAINS} with {@code DOMAIN_SET = 3}.
 *
 * <p>The ranking runs on the main thread. Ids are handed to a separate
 * "Uploader" thread through a small bounded queue.
 */
public class CreateBrowseDomainRanksTool {

    private static final Logger logger = LoggerFactory.getLogger(CreateBrowseDomainRanksTool.class);

    /** Hand-off queue between the ranking thread (producer) and the uploader (consumer). */
    static final LinkedBlockingQueue<Integer> uploadQueue = new LinkedBlockingQueue<>(10);

    /** Set to false by {@link #main} once every id has been queued. */
    volatile static boolean running = true;

    /**
     * Entry point.
     *
     * @param args origin domains, passed through to {@link StandardPageRank}
     */
    @SneakyThrows
    public static void main(String... args) {
        // NOTE(review): the instance is unused; presumably created to force
        // the MariaDB driver class to load -- confirm before removing.
        Driver driver = new Driver();
        var conn = new DatabaseModule().provideConnection();

        long start = System.currentTimeMillis();
        var uploader = new Thread(() -> uploadThread(conn), "Uploader");

        logger.info("Ranking");
        var ds = new DatabaseModule().provideConnection();
        var domains = new RankingDomainFetcherForSimilarityData(ds, new EdgeDomainBlacklistImpl(ds));
        var rpr = new StandardPageRank(domains, args);

        uploader.start();

        var rankData = rpr.pageRankWithPeripheralNodes(1000, RankingResultListAccumulator::new);

        rankData.forEach(i -> {
            try {
                uploadQueue.put(i);
            } catch (InterruptedException e) {
                // Restore the flag so callers further up can observe the interrupt.
                Thread.currentThread().interrupt();
                logger.warn("Interrupted while queueing domain {} for upload", i, e);
                // NOTE(review): returning false presumably stops the iteration -- confirm
                // against the forEach contract of the accumulator's result type.
                return false;
            }
            return true;
        });

        long end = System.currentTimeMillis();
        running = false;
        uploader.join();

        logger.info("Done in {}", (end - start)/1000.0);
    }

    /**
     * Drains {@link #uploadQueue} into {@code EC_RANDOM_DOMAINS} until
     * {@link #running} is false and the queue is empty.
     *
     * <p>The queue is polled with a short timeout rather than a blocking
     * {@code take()}. With {@code take()}, the thread could block forever if
     * {@code running} flipped to false while it was waiting on an empty
     * queue, and {@code main} would then hang in {@code join()}.
     *
     * @param dataSource pool from which the upload connection is obtained
     */
    public static void uploadThread(HikariDataSource dataSource) {
        try (var conn = dataSource.getConnection();
             var stmt = conn.prepareStatement("INSERT IGNORE INTO EC_RANDOM_DOMAINS(DOMAIN_SET, DOMAIN_ID) VALUES (3, ?)")) {

            while (running || !uploadQueue.isEmpty()) {
                Integer job = uploadQueue.poll(100, java.util.concurrent.TimeUnit.MILLISECONDS);
                if (job == null) {
                    // Timed out with nothing queued; re-check the shutdown condition.
                    continue;
                }
                stmt.setInt(1, job);
                stmt.executeUpdate();
            }
        } catch (SQLException ex) {
            logger.error("Failed to upload domain ranks", ex);
        } catch (InterruptedException ex) {
            Thread.currentThread().interrupt();
            logger.warn("Uploader interrupted before the queue was drained", ex);
        }
    }
}
|
@ -11,8 +11,8 @@ import it.unimi.dsi.fastutil.ints.IntComparator;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.wmsa.edge.index.ranking.RankingAlgorithm;
|
||||
import nu.marginalia.wmsa.edge.index.ranking.RankingDomainData;
|
||||
import nu.marginalia.wmsa.edge.index.ranking.RankingDomainFetcher;
|
||||
import nu.marginalia.wmsa.edge.index.ranking.data.RankingDomainData;
|
||||
import nu.marginalia.wmsa.edge.index.ranking.data.RankingDomainFetcher;
|
||||
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
|
||||
import nu.marginalia.wmsa.edge.dbcommon.EdgeDomainBlacklistImpl;
|
||||
import org.jetbrains.annotations.NotNull;
|
||||
|
@ -0,0 +1,67 @@
|
||||
package nu.marginalia.wmsa.edge.index.ranking.tool;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
|
||||
import nu.marginalia.wmsa.edge.dbcommon.EdgeDomainBlacklistImpl;
|
||||
import nu.marginalia.wmsa.edge.index.ranking.StandardPageRank;
|
||||
import nu.marginalia.wmsa.edge.index.ranking.accumulator.RankingResultListAccumulator;
|
||||
import nu.marginalia.wmsa.edge.index.ranking.data.RankingDomainFetcher;
|
||||
import nu.marginalia.wmsa.edge.index.ranking.data.RankingDomainFetcherForSimilarityData;
|
||||
import org.mariadb.jdbc.Driver;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.util.concurrent.LinkedBlockingQueue;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
|
||||
public class PrintDomainRanksTool {
|
||||
|
||||
private static final Logger logger = LoggerFactory.getLogger(PrintDomainRanksTool.class);
|
||||
|
||||
private volatile static int rankMax;
|
||||
|
||||
static final LinkedBlockingQueue<Integer> uploadQueue = new LinkedBlockingQueue<>(10);
|
||||
volatile static boolean running = true;
|
||||
|
||||
@SneakyThrows
|
||||
public static void main(String... args) {
|
||||
Driver driver = new Driver();
|
||||
var conn = new DatabaseModule().provideConnection();
|
||||
|
||||
long start = System.currentTimeMillis();
|
||||
|
||||
logger.info("Ranking");
|
||||
var ds = new DatabaseModule().provideConnection();
|
||||
|
||||
RankingDomainFetcher domains;
|
||||
if (Boolean.getBoolean("use-link-data")) {
|
||||
domains = new RankingDomainFetcher(ds, new EdgeDomainBlacklistImpl(ds));
|
||||
domains.retainNames();
|
||||
}
|
||||
else {
|
||||
domains = new RankingDomainFetcherForSimilarityData(ds, new EdgeDomainBlacklistImpl(ds));
|
||||
domains.retainNames();
|
||||
}
|
||||
|
||||
var rpr = new StandardPageRank(domains, args);
|
||||
|
||||
rankMax = rpr.size();
|
||||
|
||||
var rankData = rpr.pageRankWithPeripheralNodes(rankMax, RankingResultListAccumulator::new);
|
||||
|
||||
AtomicInteger cnt = new AtomicInteger();
|
||||
rankData.forEach(i -> {
|
||||
|
||||
var data = rpr.getDomainData(i);
|
||||
|
||||
System.out.printf("%d %s %s\n", cnt.getAndIncrement(), data.name, data.state);
|
||||
return true;
|
||||
});
|
||||
|
||||
long end = System.currentTimeMillis();
|
||||
running = false;
|
||||
|
||||
logger.info("Done in {}", (end - start)/1000.0);
|
||||
}
|
||||
|
||||
}
|
@ -2,11 +2,11 @@ package nu.marginalia.wmsa.edge.index.ranking.tool;
|
||||
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.wmsa.edge.index.ranking.ReversePageRank;
|
||||
import nu.marginalia.wmsa.edge.index.ranking.RankingDomainFetcher;
|
||||
import nu.marginalia.wmsa.edge.index.ranking.StandardPageRank;
|
||||
import nu.marginalia.wmsa.edge.index.ranking.accumulator.RankingResultListAccumulator;
|
||||
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
|
||||
import nu.marginalia.wmsa.edge.dbcommon.EdgeDomainBlacklistImpl;
|
||||
import nu.marginalia.wmsa.edge.index.ranking.data.RankingDomainFetcherForSimilarityData;
|
||||
import org.mariadb.jdbc.Driver;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
@ -14,9 +14,9 @@ import org.slf4j.LoggerFactory;
|
||||
import java.sql.SQLException;
|
||||
import java.util.concurrent.LinkedBlockingQueue;
|
||||
|
||||
public class UpdateDomainRanksTool2 {
|
||||
public class UpdateDomainRanksTool {
|
||||
|
||||
private static final Logger logger = LoggerFactory.getLogger(UpdateDomainRanksTool2.class);
|
||||
private static final Logger logger = LoggerFactory.getLogger(UpdateDomainRanksTool.class);
|
||||
|
||||
private volatile static int rankMax;
|
||||
|
||||
@ -33,8 +33,8 @@ public class UpdateDomainRanksTool2 {
|
||||
|
||||
logger.info("Ranking");
|
||||
var ds = new DatabaseModule().provideConnection();
|
||||
var domains = new RankingDomainFetcher(ds, new EdgeDomainBlacklistImpl(ds));
|
||||
var rpr = new ReversePageRank(domains, "memex.marginalia.nu", "bikobatanari.art", "sadgrl.online", "wiki.xxiivv.com", "%neocities.org");
|
||||
var domains = new RankingDomainFetcherForSimilarityData(ds, new EdgeDomainBlacklistImpl(ds));
|
||||
var rpr = new StandardPageRank(domains, "memex.marginalia.nu", "bikobatanari.art", "sadgrl.online", "wiki.xxiivv.com");
|
||||
|
||||
rankMax = rpr.size();
|
||||
uploader.start();
|
@ -5,7 +5,7 @@ import com.google.inject.Singleton;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.wmsa.edge.index.ranking.ReversePageRank;
|
||||
import nu.marginalia.wmsa.edge.index.ranking.StandardPageRank;
|
||||
import nu.marginalia.wmsa.edge.index.ranking.RankingDomainFetcher;
|
||||
import nu.marginalia.wmsa.edge.index.ranking.data.RankingDomainFetcher;
|
||||
import nu.marginalia.wmsa.edge.index.ranking.accumulator.RankingResultBitSetAccumulator;
|
||||
import nu.marginalia.wmsa.edge.index.ranking.accumulator.RankingResultHashMapAccumulator;
|
||||
import nu.marginalia.wmsa.edge.index.IndexServicesFactory;
|
||||
|
Loading…
Reference in New Issue
Block a user