(domain-ranking) Integrate new ranking logic

This change removes the 'algorithm' field from the domain ranking set configuration. Instead, the algorithm is chosen based on whether influence domains are provided and whether similarity data is present.
This commit is contained in:
Viktor Lofgren 2024-02-16 20:22:01 +01:00
parent 64acdb5f2a
commit 9ec262ae00
13 changed files with 235 additions and 83 deletions

View file

@ -25,7 +25,7 @@ public class DomainRankingSetsService {
public Optional<DomainRankingSet> get(String name) throws SQLException {
try (var conn = dataSource.getConnection();
var stmt = conn.prepareStatement("""
SELECT NAME, DESCRIPTION, ALGORITHM, DEPTH, DEFINITION
SELECT NAME, DESCRIPTION, DEPTH, DEFINITION
FROM CONF_DOMAIN_RANKING_SET
WHERE NAME = ?
""")) {
@ -39,7 +39,6 @@ public class DomainRankingSetsService {
return Optional.of(new DomainRankingSet(
rs.getString("NAME"),
rs.getString("DESCRIPTION"),
DomainSetAlgorithm.valueOf(rs.getString("ALGORITHM")),
rs.getInt("DEPTH"),
rs.getString("DEFINITION")
));
@ -53,15 +52,14 @@ public class DomainRankingSetsService {
public void upsert(DomainRankingSet domainRankingSet) {
try (var conn = dataSource.getConnection();
var stmt = conn.prepareStatement("""
REPLACE INTO CONF_DOMAIN_RANKING_SET(NAME, DESCRIPTION, ALGORITHM, DEPTH, DEFINITION)
VALUES (?, ?, ?, ?, ?)
REPLACE INTO CONF_DOMAIN_RANKING_SET(NAME, DESCRIPTION, DEPTH, DEFINITION)
VALUES (?, ?, ?, ?)
"""))
{
stmt.setString(1, domainRankingSet.name());
stmt.setString(2, domainRankingSet.description());
stmt.setString(3, domainRankingSet.algorithm().name());
stmt.setInt(4, domainRankingSet.depth());
stmt.setString(5, domainRankingSet.definition());
stmt.setInt(3, domainRankingSet.depth());
stmt.setString(4, domainRankingSet.definition());
stmt.executeUpdate();
if (!conn.getAutoCommit())
@ -94,7 +92,7 @@ public class DomainRankingSetsService {
try (var conn = dataSource.getConnection();
var stmt = conn.prepareStatement("""
SELECT NAME, DESCRIPTION, ALGORITHM, DEPTH, DEFINITION
SELECT NAME, DESCRIPTION, DEPTH, DEFINITION
FROM CONF_DOMAIN_RANKING_SET
""")) {
var rs = stmt.executeQuery();
@ -105,7 +103,6 @@ public class DomainRankingSetsService {
new DomainRankingSet(
rs.getString("NAME"),
rs.getString("DESCRIPTION"),
DomainSetAlgorithm.valueOf(rs.getString("ALGORITHM")),
rs.getInt("DEPTH"),
rs.getString("DEFINITION"))
);
@ -118,31 +115,17 @@ public class DomainRankingSetsService {
}
}
public enum DomainSetAlgorithm {
/** Use link graph, do a pagerank */
LINKS_PAGERANK,
/** Use link graph, do a cheirank */
LINKS_CHEIRANK,
/** Use adjacency graph, do a pagerank */
ADJACENCY_PAGERANK,
/** Use adjacency graph, do a cheirank */
ADJACENCY_CHEIRANK,
/** For reserved names. Use special algorithm, function of name */
SPECIAL
};
/** Defines a domain ranking set, parameters for the ranking algorithms.
*
* @param name Key and name of the set
* @param description Human-readable description
* @param algorithm Algorithm to use
* @param depth Depth of the algorithm
* @param definition Definition of the set, typically a list of domains or globs for domain-names
* */
@With
public record DomainRankingSet(String name,
String description,
DomainSetAlgorithm algorithm,
int depth,
String definition)
{
@ -159,7 +142,7 @@ public class DomainRankingSetsService {
}
public boolean isSpecial() {
return algorithm() == DomainSetAlgorithm.SPECIAL;
return name().equals("BLOGS") || name().equals("NONE") || name().equals("RANK");
}
}

View file

@ -0,0 +1 @@
ALTER TABLE CONF_DOMAIN_RANKING_SET DROP COLUMN ALGORITHM;

View file

@ -56,14 +56,12 @@ class DomainRankingSetsServiceTest {
var newValue = new DomainRankingSetsService.DomainRankingSet(
"test",
"Test domain set",
DomainRankingSetsService.DomainSetAlgorithm.ADJACENCY_CHEIRANK,
10,
"test\\.nu"
);
var newValue2 = new DomainRankingSetsService.DomainRankingSet(
"test2",
"Test domain set 2",
DomainRankingSetsService.DomainSetAlgorithm.LINKS_PAGERANK,
20,
"test\\.nu 2"
);

View file

@ -32,13 +32,14 @@ dependencies {
implementation libs.hll
testImplementation project(':code:libraries:array')
testImplementation libs.commons.lang3
testImplementation libs.bundles.slf4j.test
testImplementation libs.bundles.junit
testImplementation libs.mockito
}
test {
minHeapSize = "128m" // initial heap size
maxHeapSize = "20G" // maximum heap size
testImplementation platform('org.testcontainers:testcontainers-bom:1.17.4')
testImplementation 'org.testcontainers:mariadb:1.17.4'
testImplementation 'org.testcontainers:junit-jupiter:1.17.4'
testImplementation project(':code:libraries:test-helpers')
}

View file

@ -25,6 +25,12 @@ public class PageRankDomainRanker implements RankingAlgorithm {
this.graph = source.getGraph();
}
/** Create a ranker whose influence set is given as domain names rather than domain ids.
 * <p>
 * The names are resolved to ids through {@link GraphSource#domainIds(List)} on the
 * same source that supplies the graph, and the resulting ids are handed to the
 * regular constructor.
 *
 * @param source       graph source, used both for the graph and for the name lookup
 * @param influenceSet domain names to resolve; whatever patterns
 *                     {@code source.domainIds} accepts are accepted here
 * @return a ranker constructed from {@code source} and the resolved domain ids
 */
public static PageRankDomainRanker forDomainNames(GraphSource source, List<String> influenceSet) {
    List<Integer> influenceDomainIds = source.domainIds(influenceSet);
    return new PageRankDomainRanker(source, influenceDomainIds);
}
@Override
public <T> T calculate(int resultCount, Supplier<RankingResultAccumulator<T>> accumulatorP) {
VertexScoringAlgorithm<Integer, Double> pageRank;

View file

@ -4,8 +4,7 @@ import com.zaxxer.hikari.HikariDataSource;
import org.jgrapht.Graph;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.List;
import java.util.*;
public abstract class AbstractGraphSource implements GraphSource {
protected final HikariDataSource dataSource;
@ -39,17 +38,23 @@ public abstract class AbstractGraphSource implements GraphSource {
var stmt = conn.prepareStatement("""
SELECT ID
FROM EC_DOMAIN
WHERE DOMAIN_NAME IN (?)
WHERE DOMAIN_NAME LIKE ?
"""))
{
stmt.setArray(1, conn.createArrayOf("VARCHAR", domainNameList.toArray()));
try (var rs = stmt.executeQuery()) {
var result = new ArrayList<Integer>();
while (rs.next()) {
result.add(rs.getInt(1));
Set<Integer> retSet = new HashSet<>();
for (String domainName : domainNameList) {
stmt.setString(1, domainName);
try (var rs = stmt.executeQuery()) {
while (rs.next()) {
retSet.add(rs.getInt(1));
}
}
return result;
}
var ret = new ArrayList<>(retSet);
ret.sort(Comparator.naturalOrder());
return ret;
}
catch (SQLException ex) {
throw new RuntimeException(ex);

View file

@ -11,5 +11,13 @@ public interface GraphSource {
/** Construct the graph */
Graph<Integer, ?> getGraph();
/** Return a list of domain ids for the given domain names.
* The function will also accept SQL-style wildcards,
* e.g. "%marginalia.nu" will match "marginalia.nu" and "memex.marginalia.nu".
* <p>
* If multiple wildcards are provided, and overlapping domains are matched,
* they will be included only once. The returned list will be sorted in
* numerical order of the domain IDs.
*
* @param domainNameList domain names or SQL-style wildcard patterns to look up
* @return the matching domain ids, without duplicates, in ascending numerical order
*/
List<Integer> domainIds(List<String> domainNameList);
}

View file

@ -1,11 +1,14 @@
package nu.marginalia.ranking;
import nu.marginalia.ranking.accumulator.RankingResultListAccumulator;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import java.util.List;
// Test the ranking algorithm with prod data.
// Test the ranking algorithm with prod data. Will not run if the data is not available.
// It's not feasible to include the data in the git repo, as it's ~6 GB of data.
@Disabled
class RankingAlgorithmWithRealDataTest {
@Test

View file

@ -0,0 +1,161 @@
package nu.marginalia.ranking;
import com.zaxxer.hikari.HikariConfig;
import com.zaxxer.hikari.HikariDataSource;
import nu.marginalia.query.client.QueryClient;
import nu.marginalia.ranking.data.InvertedLinkGraphSource;
import nu.marginalia.ranking.data.LinkGraphSource;
import nu.marginalia.ranking.data.SimilarityGraphSource;
import nu.marginalia.test.TestMigrationLoader;
import org.jgrapht.Graph;
import org.jgrapht.graph.DefaultWeightedEdge;
import org.junit.jupiter.api.*;
import org.junit.jupiter.api.parallel.Execution;
import org.mockito.Mockito;
import org.testcontainers.containers.MariaDBContainer;
import org.testcontainers.junit.jupiter.Container;
import org.testcontainers.junit.jupiter.Testcontainers;
import java.sql.SQLException;
import java.util.List;
import static org.junit.Assert.assertEquals;
import static org.junit.jupiter.api.parallel.ExecutionMode.SAME_THREAD;
import static org.mockito.Mockito.when;
/**
 * Container-backed tests for the graph sources used by the domain ranking code.
 * <p>
 * A MariaDB container is started once for the class, the schema is migrated via
 * {@link TestMigrationLoader}, and four domains are inserted into EC_DOMAIN.
 * The assertions below refer to domains by numeric id and assume the ids are
 * assigned 1..4 in insertion order (memex, search, encyclopedia, marginalia.nu).
 * <p>
 * Link data is supplied through a mocked {@link QueryClient}; similarity data is
 * written directly to EC_DOMAIN_NEIGHBORS_2, which is truncated before each test.
 */
@Tag("slow")
@Testcontainers
@Execution(SAME_THREAD)
public class RankingAlgorithmsContainerTest {
    @Container
    static MariaDBContainer<?> mariaDBContainer = new MariaDBContainer<>("mariadb")
            .withDatabaseName("WMSA_prod")
            .withUsername("wmsa")
            .withPassword("wmsa")
            .withNetworkAliases("mariadb");

    static HikariDataSource dataSource;

    QueryClient queryClient;
    QueryClient.AllLinks allLinks;

    /** Open the connection pool, run the migrations and insert the fixture domains. */
    @BeforeAll
    public static void setup() {
        HikariConfig config = new HikariConfig();
        config.setJdbcUrl(mariaDBContainer.getJdbcUrl());
        config.setUsername("wmsa");
        config.setPassword("wmsa");

        dataSource = new HikariDataSource(config);
        TestMigrationLoader.flywayMigration(dataSource);

        try (var conn = dataSource.getConnection();
             var stmt = conn.createStatement()) {
            stmt.executeUpdate("""
                    INSERT INTO EC_DOMAIN(DOMAIN_NAME, DOMAIN_TOP, NODE_AFFINITY)
                    VALUES ('memex.marginalia.nu', 'marginalia.nu', 1),
                    ('search.marginalia.nu', 'marginalia.nu', 1),
                    ('encyclopedia.marginalia.nu', 'marginalia.nu', 1),
                    ('marginalia.nu', 'marginalia.nu', 1);
                    """);
        } catch (SQLException e) {
            throw new RuntimeException(e);
        }
    }

    /** Release the connection pool so it does not outlive the test class. */
    @AfterAll
    public static void tearDown() {
        // setup() may have failed before the pool was assigned
        if (dataSource != null) {
            dataSource.close();
        }
    }

    /** Give each test a fresh link mock and an empty similarity table. */
    @BeforeEach
    public void setupQueryClient() {
        queryClient = Mockito.mock(QueryClient.class);
        allLinks = new QueryClient.AllLinks();
        when(queryClient.getAllDomainLinks()).thenReturn(allLinks);

        try (var conn = dataSource.getConnection();
             var stmt = conn.createStatement()) {
            stmt.executeUpdate("TRUNCATE TABLE EC_DOMAIN_NEIGHBORS_2");
        } catch (SQLException e) {
            throw new RuntimeException(e);
        }
    }

    /** Insert one similarity row from {@code source} to {@code dest} with the given relatedness. */
    private void addSimilarity(int source, int dest, double similarity) {
        try (var conn = dataSource.getConnection();
             var stmt = conn.prepareStatement("""
                     INSERT INTO EC_DOMAIN_NEIGHBORS_2(DOMAIN_ID, NEIGHBOR_ID, RELATEDNESS)
                     VALUES (?, ?, ?)
                     """)) {
            stmt.setInt(1, source);
            stmt.setInt(2, dest);
            stmt.setDouble(3, similarity);
            stmt.executeUpdate();
        } catch (SQLException e) {
            throw new RuntimeException(e);
        }
    }

    /** Domain name lookup: exact names and SQL-style wildcards. */
    @Test
    public void testGetDomains() {
        // should all be the same, doesn't matter which one we use
        var source = new LinkGraphSource(dataSource, queryClient);

        Assertions.assertEquals(List.of(1),
                source.domainIds(List.of("memex.marginalia.nu")));

        // Verify globbing; the pattern requires a leading dot, so the bare
        // 'marginalia.nu' row is not expected in the result
        Assertions.assertEquals(List.of(1, 2, 3),
                source.domainIds(List.of("%.marginalia.nu")));
    }

    /** A link 1 -> 3 appears as a directed edge 1 -> 3 and nothing else. */
    @Test
    public void testLinkGraphSource() {
        allLinks.add(1, 3);

        var graph = new LinkGraphSource(dataSource, queryClient).getGraph();

        Assertions.assertTrue(graph.containsVertex(1));
        Assertions.assertTrue(graph.containsVertex(2));
        Assertions.assertTrue(graph.containsVertex(3));

        Assertions.assertTrue(graph.containsEdge(1, 3));
        Assertions.assertFalse(graph.containsEdge(3, 1));
        Assertions.assertFalse(graph.containsEdge(2, 3));
        Assertions.assertFalse(graph.containsEdge(3, 2));
    }

    /** A link 1 -> 3 appears reversed, as a directed edge 3 -> 1, and nothing else. */
    @Test
    public void testInvertedLinkGraphSource() {
        allLinks.add(1, 3);

        var graph = new InvertedLinkGraphSource(dataSource, queryClient).getGraph();

        Assertions.assertTrue(graph.containsVertex(1));
        Assertions.assertTrue(graph.containsVertex(2));
        Assertions.assertTrue(graph.containsVertex(3));

        Assertions.assertTrue(graph.containsEdge(3, 1));
        Assertions.assertFalse(graph.containsEdge(1, 3));
        Assertions.assertFalse(graph.containsEdge(2, 3));
        Assertions.assertFalse(graph.containsEdge(3, 2));
    }

    /** A similarity row 1 ~ 3 is visible in both directions and carries its weight. */
    @Test
    @SuppressWarnings("unchecked")
    public void testSimilarityGraphSource() {
        addSimilarity(1, 3, 0.5);

        var graph = (Graph<Integer, DefaultWeightedEdge>) new SimilarityGraphSource(dataSource).getGraph();

        Assertions.assertTrue(graph.containsVertex(1));
        Assertions.assertTrue(graph.containsVertex(2));
        Assertions.assertTrue(graph.containsVertex(3));

        Assertions.assertTrue(graph.containsEdge(3, 1));
        Assertions.assertTrue(graph.containsEdge(1, 3));
        // expected value first, so a failure reports the numbers the right way round
        Assertions.assertEquals(0.5, graph.getEdgeWeight(graph.getEdge(1, 3)), 0.0001);

        Assertions.assertFalse(graph.containsEdge(1, 2));
        Assertions.assertFalse(graph.containsEdge(2, 3));
    }
}

View file

@ -15,15 +15,12 @@ import java.sql.SQLException;
import java.util.Map;
public class ControlDomainRankingSetsService {
private final HikariDataSource dataSource;
private final ControlRendererFactory rendererFactory;
private final DomainRankingSetsService domainRankingSetsService;
@Inject
public ControlDomainRankingSetsService(HikariDataSource dataSource,
ControlRendererFactory rendererFactory,
public ControlDomainRankingSetsService(ControlRendererFactory rendererFactory,
DomainRankingSetsService domainRankingSetsService) {
this.dataSource = dataSource;
this.rendererFactory = rendererFactory;
this.domainRankingSetsService = domainRankingSetsService;
}
@ -47,7 +44,6 @@ public class ControlDomainRankingSetsService {
domainRankingSetsService.upsert(new DomainRankingSetsService.DomainRankingSet(
id,
request.queryParams("description"),
DomainRankingSetsService.DomainSetAlgorithm.valueOf(request.queryParams("algorithm")),
Integer.parseInt(request.queryParams("depth")),
request.queryParams("definition")
));
@ -77,7 +73,6 @@ public class ControlDomainRankingSetsService {
domainRankingSetsService.upsert(new DomainRankingSetsService.DomainRankingSet(
request.queryParams("name").toUpperCase(),
request.queryParams("description"),
DomainRankingSetsService.DomainSetAlgorithm.valueOf(request.queryParams("algorithm")),
Integer.parseInt(request.queryParams("depth")),
request.queryParams("definition")
));
@ -95,17 +90,6 @@ public class ControlDomainRankingSetsService {
}
private Object rankingSetModel(Request request, Response response) throws SQLException {
var model = domainRankingSetsService.get(request.params("id")).orElseThrow();
return Map.of("rankingSet", model,
"selectedAlgo", Map.of(
"special", model.algorithm() == DomainRankingSetsService.DomainSetAlgorithm.SPECIAL,
"adjacency_cheirank", model.algorithm() == DomainRankingSetsService.DomainSetAlgorithm.ADJACENCY_CHEIRANK,
"adjacency_pagerank", model.algorithm() == DomainRankingSetsService.DomainSetAlgorithm.ADJACENCY_PAGERANK,
"links_cheirank", model.algorithm() == DomainRankingSetsService.DomainSetAlgorithm.LINKS_CHEIRANK,
"links_pagerank", model.algorithm() == DomainRankingSetsService.DomainSetAlgorithm.LINKS_PAGERANK)
);
return Map.of("rankingSet", model);
}
}

View file

@ -16,14 +16,12 @@
<tr>
<th>Name</th>
<th>Description</th>
<th>Algorithm</th>
<th>Depth</th>
</tr>
{{#each rankingSets}}
<tr>
<td><a href="/domain-ranking-sets/{{name}}">{{name}}</a></td>
<td>{{description}}</td>
<td>{{algorithm}}</td>
<td>{{depth}}</td>
</tr>
{{/each}}

View file

@ -26,15 +26,6 @@
<th><label for="algorithm">Algorithm</label></th>
<td>
{{#if special}}<input type="hidden" name="algorithm" value="{{algorithm}}" />{{/if}}
<select id="algorithm" name="algorithm" {{#if special}}disabled{{/if}}>
{{#with algorithm}}
<option value="SPECIAL" disabled {{#if selectedAlgo.special}}selected{{/if}}>SPECIAL</option>
<option value="LINKS_PAGERANK" {{#if selectedAlgo.links_pagerank}}selected{{/if}}>LINKS_PAGERANK</option>
<option value="LINKS_CHEIRANK" {{#if selectedAlgo.links_cheirank}}selected{{/if}}>LINKS_CHEIRANK</option>
<option value="ADJACENCY_PAGERANK" {{#if selectedAlgo.adjacency_pagerank}}selected{{/if}}>ADJACENCY_PAGERANK</option>
<option value="ADJACENCY_CHEIRANK" {{#if selectedAlgo.adjacency_cheirank}}selected{{/if}}>ADJACENCY_CHEIRANK</option>
{{/with}}
</select>
<div>
<small class="text-muted">
The algorithm used to rank the domains. The LINKS algorithms use the link graph, and the ADJACENCY
@ -68,7 +59,8 @@
<div>
<small class="text-muted">A list of domain names, one per line, possibly globbed with SQL-style '%' wildcards.
These are used as the origin point for the Personalized PageRank algorithm, and will be considered
the central points of the link or adjacency graph. If no domains are specified, the entire domain space is used, as per the PageRank paper.
the central points of the link or adjacency graph. If no domains are specified,
the entire domain space is used, as per the PageRank paper.
</small>
</div>
</td></tr>

View file

@ -18,7 +18,6 @@ import nu.marginalia.ranking.data.GraphSource;
import nu.marginalia.ranking.data.LinkGraphSource;
import nu.marginalia.ranking.data.SimilarityGraphSource;
import nu.marginalia.service.control.ServiceEventLog;
import nu.marginalia.service.control.ServiceHeartbeat;
import nu.marginalia.service.module.ServiceConfiguration;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -122,13 +121,13 @@ public class IndexSearchSetsService {
}
try {
if (DomainRankingSetsService.DomainSetAlgorithm.SPECIAL.equals(rankingSet.algorithm())) {
if (rankingSet.isSpecial()) {
switch (rankingSet.name()) {
case "BLOGS" -> recalculateBlogsSet(rankingSet);
case "NONE" -> {} // No-op
}
} else {
recalculateNornal(rankingSet);
recalculateNormal(rankingSet);
}
}
catch (Exception ex) {
@ -138,16 +137,17 @@ public class IndexSearchSetsService {
}
}
private void recalculateNornal(DomainRankingSetsService.DomainRankingSet rankingSet) {
String[] domains = rankingSet.domains();
private void recalculateNormal(DomainRankingSetsService.DomainRankingSet rankingSet) {
List<String> domains = List.of(rankingSet.domains());
GraphSource graphSource = switch (rankingSet.algorithm()) {
case LINKS_PAGERANK, LINKS_CHEIRANK -> linksDomains;
case ADJACENCY_PAGERANK, ADJACENCY_CHEIRANK -> similarityDomains;
default -> throw new IllegalStateException("Unexpected value: " + rankingSet.algorithm());
};
GraphSource source;
var data = new PageRankDomainRanker(linksDomains, linksDomains.domainIds(List.of(domains)))
// Similarity ranking does not behave well with an empty set of domains
if (domains.isEmpty()) source = linksDomains;
else source = similarityDomains;
var data = PageRankDomainRanker
.forDomainNames(source, domains)
.calculate(rankingSet.depth(), RankingResultHashSetAccumulator::new);
var set = new RankingSearchSet(rankingSet.name(), rankingSet.fileName(indexServicesFactory.getSearchSetsBase()), data);
@ -180,9 +180,21 @@ public class IndexSearchSetsService {
}
private void updateDomainRankings(DomainRankingSetsService.DomainRankingSet rankingSet) {
List<String> domains = List.of(rankingSet.domains());
var ranks = new PageRankDomainRanker(similarityDomains, similarityDomains.domainIds(List.of(rankingSet.domains())))
.calculate(rankingSet.depth(), () -> new RankingResultHashMapAccumulator(rankingSet.depth()));
final GraphSource source;
if (domains.isEmpty()) {
// Similarity ranking does not behave well with an empty set of domains
source = linksDomains;
}
else {
source = similarityDomains;
}
var ranks = PageRankDomainRanker
.forDomainNames(source, domains)
.calculate(rankingSet.depth(), () -> new RankingResultHashMapAccumulator(rankingSet.depth()));
synchronized (this) {
domainRankings = new DomainRankings(ranks);