Merge pull request #73 from MarginaliaSearch/configurable-search-sets
(WIP) Configurable domain ranking sets
This commit is contained in:
commit
ca80957143
33 changed files with 818 additions and 258 deletions
|
@ -15,7 +15,7 @@ public class SearchSpecification {
|
|||
/** If present and not empty, limit the search to these domain IDs */
|
||||
public List<Integer> domains;
|
||||
|
||||
public SearchSetIdentifier searchSetIdentifier;
|
||||
public String searchSetIdentifier;
|
||||
|
||||
public final String humanQuery;
|
||||
|
||||
|
|
|
@ -31,7 +31,7 @@ public class QueryProtobufCodec {
|
|||
builder.addSubqueries(IndexProtobufCodec.convertSearchSubquery(subquery));
|
||||
}
|
||||
|
||||
builder.setSearchSetIdentifier(query.specs.searchSetIdentifier.name());
|
||||
builder.setSearchSetIdentifier(query.specs.searchSetIdentifier);
|
||||
builder.setHumanQuery(request.getHumanQuery());
|
||||
|
||||
builder.setQuality(convertSpecLimit(query.specs.quality));
|
||||
|
@ -62,7 +62,7 @@ public class QueryProtobufCodec {
|
|||
convertSpecLimit(request.getDomainCount()),
|
||||
request.getDomainIdsList(),
|
||||
IndexProtobufCodec.convertQueryLimits(request.getQueryLimits()),
|
||||
SearchSetIdentifier.valueOf(request.getSearchSetIdentifier()));
|
||||
request.getSearchSetIdentifier());
|
||||
}
|
||||
|
||||
|
||||
|
@ -133,7 +133,7 @@ public class QueryProtobufCodec {
|
|||
return new SearchSpecification(
|
||||
subqueries,
|
||||
specs.getDomainsList(),
|
||||
SearchSetIdentifier.valueOf(specs.getSearchSetIdentifier()),
|
||||
specs.getSearchSetIdentifier(),
|
||||
specs.getHumanQuery(),
|
||||
IndexProtobufCodec.convertSpecLimit(specs.getQuality()),
|
||||
IndexProtobufCodec.convertSpecLimit(specs.getYear()),
|
||||
|
@ -159,7 +159,7 @@ public class QueryProtobufCodec {
|
|||
.setYear(convertSpecLimit(params.year()))
|
||||
.setSize(convertSpecLimit(params.size()))
|
||||
.setRank(convertSpecLimit(params.rank()))
|
||||
.setSearchSetIdentifier(params.identifier().name());
|
||||
.setSearchSetIdentifier(params.identifier());
|
||||
|
||||
if (params.nearDomain() != null)
|
||||
builder.setNearDomain(params.nearDomain());
|
||||
|
|
|
@ -23,10 +23,10 @@ public record QueryParams(
|
|||
SpecificationLimit domainCount,
|
||||
List<Integer> domainIds,
|
||||
QueryLimits limits,
|
||||
SearchSetIdentifier identifier
|
||||
String identifier
|
||||
)
|
||||
{
|
||||
public QueryParams(String query, QueryLimits limits, SearchSetIdentifier identifier) {
|
||||
public QueryParams(String query, QueryLimits limits, String identifier) {
|
||||
this(query, null,
|
||||
List.of(),
|
||||
List.of(),
|
||||
|
|
|
@ -0,0 +1,166 @@
|
|||
package nu.marginalia.db;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import lombok.With;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.nio.file.Path;
|
||||
import java.sql.SQLException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.Optional;
|
||||
|
||||
public class DomainRankingSetsService {
|
||||
private static final Logger logger = LoggerFactory.getLogger(DomainRankingSetsService.class);
|
||||
private final HikariDataSource dataSource;
|
||||
|
||||
@Inject
|
||||
public DomainRankingSetsService(HikariDataSource dataSource) {
|
||||
this.dataSource = dataSource;
|
||||
}
|
||||
|
||||
public Optional<DomainRankingSet> get(String name) throws SQLException {
|
||||
try (var conn = dataSource.getConnection();
|
||||
var stmt = conn.prepareStatement("""
|
||||
SELECT NAME, DESCRIPTION, ALGORITHM, DEPTH, DEFINITION
|
||||
FROM CONF_DOMAIN_RANKING_SET
|
||||
WHERE NAME = ?
|
||||
""")) {
|
||||
stmt.setString(1, name);
|
||||
var rs = stmt.executeQuery();
|
||||
|
||||
if (!rs.next()) {
|
||||
return Optional.empty();
|
||||
}
|
||||
|
||||
return Optional.of(new DomainRankingSet(
|
||||
rs.getString("NAME"),
|
||||
rs.getString("DESCRIPTION"),
|
||||
DomainSetAlgorithm.valueOf(rs.getString("ALGORITHM")),
|
||||
rs.getInt("DEPTH"),
|
||||
rs.getString("DEFINITION")
|
||||
));
|
||||
}
|
||||
catch (SQLException ex) {
|
||||
logger.error("Failed to get domain set", ex);
|
||||
return Optional.empty();
|
||||
}
|
||||
}
|
||||
|
||||
public void upsert(DomainRankingSet domainRankingSet) {
|
||||
try (var conn = dataSource.getConnection();
|
||||
var stmt = conn.prepareStatement("""
|
||||
REPLACE INTO CONF_DOMAIN_RANKING_SET(NAME, DESCRIPTION, ALGORITHM, DEPTH, DEFINITION)
|
||||
VALUES (?, ?, ?, ?, ?)
|
||||
"""))
|
||||
{
|
||||
stmt.setString(1, domainRankingSet.name());
|
||||
stmt.setString(2, domainRankingSet.description());
|
||||
stmt.setString(3, domainRankingSet.algorithm().name());
|
||||
stmt.setInt(4, domainRankingSet.depth());
|
||||
stmt.setString(5, domainRankingSet.definition());
|
||||
stmt.executeUpdate();
|
||||
|
||||
if (!conn.getAutoCommit())
|
||||
conn.commit();
|
||||
}
|
||||
catch (SQLException ex) {
|
||||
logger.error("Failed to update domain set", ex);
|
||||
}
|
||||
}
|
||||
|
||||
public void delete(DomainRankingSet domainRankingSet) {
|
||||
try (var conn = dataSource.getConnection();
|
||||
var stmt = conn.prepareStatement("""
|
||||
DELETE FROM CONF_DOMAIN_RANKING_SET
|
||||
WHERE NAME = ?
|
||||
"""))
|
||||
{
|
||||
stmt.setString(1, domainRankingSet.name());
|
||||
stmt.executeUpdate();
|
||||
|
||||
if (!conn.getAutoCommit())
|
||||
conn.commit();
|
||||
}
|
||||
catch (SQLException ex) {
|
||||
logger.error("Failed to delete domain set", ex);
|
||||
}
|
||||
}
|
||||
|
||||
public List<DomainRankingSet> getAll() {
|
||||
|
||||
try (var conn = dataSource.getConnection();
|
||||
var stmt = conn.prepareStatement("""
|
||||
SELECT NAME, DESCRIPTION, ALGORITHM, DEPTH, DEFINITION
|
||||
FROM CONF_DOMAIN_RANKING_SET
|
||||
""")) {
|
||||
var rs = stmt.executeQuery();
|
||||
List<DomainRankingSet> ret = new ArrayList<>();
|
||||
|
||||
while (rs.next()) {
|
||||
ret.add(
|
||||
new DomainRankingSet(
|
||||
rs.getString("NAME"),
|
||||
rs.getString("DESCRIPTION"),
|
||||
DomainSetAlgorithm.valueOf(rs.getString("ALGORITHM")),
|
||||
rs.getInt("DEPTH"),
|
||||
rs.getString("DEFINITION"))
|
||||
);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
catch (SQLException ex) {
|
||||
logger.error("Failed to get domain set", ex);
|
||||
return List.of();
|
||||
}
|
||||
}
|
||||
|
||||
public enum DomainSetAlgorithm {
|
||||
/** Use link graph, do a pagerank */
|
||||
LINKS_PAGERANK,
|
||||
/** Use link graph, do a cheirank */
|
||||
LINKS_CHEIRANK,
|
||||
/** Use adjacency graph, do a pagerank */
|
||||
ADJACENCY_PAGERANK,
|
||||
/** Use adjacency graph, do a cheirank */
|
||||
ADJACENCY_CHEIRANK,
|
||||
/** For reserved names. Use special algorithm, function of name */
|
||||
SPECIAL
|
||||
};
|
||||
|
||||
/** Defines a domain ranking set, parameters for the ranking algorithms.
|
||||
*
|
||||
* @param name Key and name of the set
|
||||
* @param description Human-readable description
|
||||
* @param algorithm Algorithm to use
|
||||
* @param depth Depth of the algorithm
|
||||
* @param definition Definition of the set, typically a list of domains or globs for domain-names
|
||||
* */
|
||||
@With
|
||||
public record DomainRankingSet(String name,
|
||||
String description,
|
||||
DomainSetAlgorithm algorithm,
|
||||
int depth,
|
||||
String definition)
|
||||
{
|
||||
|
||||
public Path fileName(Path base) {
|
||||
return base.resolve(name().toLowerCase() + ".dat");
|
||||
}
|
||||
public String[] domains() {
|
||||
return Arrays.stream(definition().split("\n+"))
|
||||
.map(String::trim)
|
||||
.filter(s -> !s.isBlank())
|
||||
.filter(s -> !s.startsWith("#"))
|
||||
.toArray(String[]::new);
|
||||
}
|
||||
|
||||
public boolean isSpecial() {
|
||||
return algorithm() == DomainSetAlgorithm.SPECIAL;
|
||||
}
|
||||
|
||||
}
|
||||
}
|
|
@ -0,0 +1,12 @@
|
|||
|
||||
CREATE TABLE IF NOT EXISTS CONF_DOMAIN_RANKING_SET (
|
||||
NAME VARCHAR(255) PRIMARY KEY COLLATE utf8mb4_unicode_ci,
|
||||
DESCRIPTION VARCHAR(255) NOT NULL,
|
||||
ALGORITHM VARCHAR(255) NOT NULL,
|
||||
DEPTH INT NOT NULL,
|
||||
DEFINITION LONGTEXT NOT NULL
|
||||
) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin;
|
||||
|
||||
INSERT IGNORE INTO CONF_DOMAIN_RANKING_SET(NAME, DESCRIPTION, ALGORITHM, DEPTH, DEFINITION) VALUES ('NONE', 'Reserved: No Ranking Algorithm', 'SPECIAL', 50000, '');
|
||||
INSERT IGNORE INTO CONF_DOMAIN_RANKING_SET(NAME, DESCRIPTION, ALGORITHM, DEPTH, DEFINITION) VALUES ('BLOGS', 'Reserved: Blogs Set', 'SPECIAL', 50000, '');
|
||||
INSERT IGNORE INTO CONF_DOMAIN_RANKING_SET(NAME, DESCRIPTION, ALGORITHM, DEPTH, DEFINITION) VALUES ('RANK', 'Reserved: Main Domain Ranking', 'SPECIAL', 50000, '');
|
|
@ -0,0 +1,84 @@
|
|||
package nu.marginalia.db;
|
||||
|
||||
import com.zaxxer.hikari.HikariConfig;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import nu.marginalia.test.TestMigrationLoader;
|
||||
import org.junit.jupiter.api.*;
|
||||
import org.testcontainers.containers.MariaDBContainer;
|
||||
import org.testcontainers.junit.jupiter.Container;
|
||||
import org.testcontainers.junit.jupiter.Testcontainers;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.*;
|
||||
|
||||
@Testcontainers
|
||||
@Tag("slow")
|
||||
class DomainRankingSetsServiceTest {
|
||||
|
||||
@Container
|
||||
static MariaDBContainer<?> mariaDBContainer = new MariaDBContainer<>("mariadb")
|
||||
.withDatabaseName("WMSA_prod")
|
||||
.withUsername("wmsa")
|
||||
.withPassword("wmsa")
|
||||
.withNetworkAliases("mariadb");
|
||||
|
||||
static HikariDataSource dataSource;
|
||||
|
||||
@BeforeAll
|
||||
public static void setup() {
|
||||
HikariConfig config = new HikariConfig();
|
||||
config.setJdbcUrl(mariaDBContainer.getJdbcUrl());
|
||||
config.setUsername("wmsa");
|
||||
config.setPassword("wmsa");
|
||||
|
||||
dataSource = new HikariDataSource(config);
|
||||
|
||||
TestMigrationLoader.flywayMigration(dataSource);
|
||||
}
|
||||
|
||||
@AfterAll
|
||||
static void tearDownAll() {
|
||||
dataSource.close();
|
||||
mariaDBContainer.close();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testScenarios() throws Exception {
|
||||
var service = new DomainRankingSetsService(dataSource);
|
||||
|
||||
// Clean up default values
|
||||
service.get("BLOGS").ifPresent(service::delete);
|
||||
service.get("NONE").ifPresent(service::delete);
|
||||
|
||||
var newValue = new DomainRankingSetsService.DomainRankingSet(
|
||||
"test",
|
||||
"Test domain set",
|
||||
DomainRankingSetsService.DomainSetAlgorithm.ADJACENCY_CHEIRANK,
|
||||
10,
|
||||
"test\\.nu"
|
||||
);
|
||||
var newValue2 = new DomainRankingSetsService.DomainRankingSet(
|
||||
"test2",
|
||||
"Test domain set 2",
|
||||
DomainRankingSetsService.DomainSetAlgorithm.LINKS_PAGERANK,
|
||||
20,
|
||||
"test\\.nu 2"
|
||||
);
|
||||
service.upsert(newValue);
|
||||
service.upsert(newValue2);
|
||||
assertEquals(newValue, service.get("test").orElseThrow());
|
||||
|
||||
var allValues = service.getAll();
|
||||
assertEquals(2, allValues.size());
|
||||
assertTrue(allValues.contains(newValue));
|
||||
assertTrue(allValues.contains(newValue2));
|
||||
|
||||
service.delete(newValue);
|
||||
assertFalse(service.get("test").isPresent());
|
||||
|
||||
service.delete(newValue2);
|
||||
assertFalse(service.get("test2").isPresent());
|
||||
|
||||
allValues = service.getAll();
|
||||
assertEquals(0, allValues.size());
|
||||
}
|
||||
}
|
|
@ -55,7 +55,7 @@ public class ApiSearchOperator {
|
|||
Math.min(100, count),
|
||||
150,
|
||||
8192),
|
||||
searchSet);
|
||||
searchSet.name());
|
||||
}
|
||||
|
||||
private SearchSetIdentifier selectSearchSet(int index) {
|
||||
|
|
|
@ -35,7 +35,7 @@ public class SearchQueryParamFactory {
|
|||
SpecificationLimit.none(),
|
||||
List.of(),
|
||||
new QueryLimits(1, 25, 200, 8192),
|
||||
profile.searchSetIdentifier
|
||||
profile.searchSetIdentifier.name()
|
||||
);
|
||||
|
||||
}
|
||||
|
@ -54,7 +54,7 @@ public class SearchQueryParamFactory {
|
|||
SpecificationLimit.none(),
|
||||
List.of(),
|
||||
new QueryLimits(count, count, 100, 512),
|
||||
SearchSetIdentifier.NONE
|
||||
SearchSetIdentifier.NONE.name()
|
||||
);
|
||||
}
|
||||
|
||||
|
@ -72,7 +72,7 @@ public class SearchQueryParamFactory {
|
|||
SpecificationLimit.none(),
|
||||
List.of(),
|
||||
new QueryLimits(100, 100, 100, 512),
|
||||
SearchSetIdentifier.NONE
|
||||
SearchSetIdentifier.NONE.name()
|
||||
);
|
||||
}
|
||||
|
||||
|
@ -90,7 +90,7 @@ public class SearchQueryParamFactory {
|
|||
SpecificationLimit.none(),
|
||||
List.of(),
|
||||
new QueryLimits(100, 100, 100, 512),
|
||||
SearchSetIdentifier.NONE
|
||||
SearchSetIdentifier.NONE.name()
|
||||
);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -53,7 +53,9 @@ public class ControlService extends Service {
|
|||
RandomExplorationService randomExplorationService,
|
||||
DataSetsService dataSetsService,
|
||||
ControlNodeService controlNodeService,
|
||||
ControlActorService controlActorService
|
||||
ControlDomainRankingSetsService controlDomainRankingSetsService,
|
||||
ControlActorService controlActorService,
|
||||
ControlErrorHandler errorHandler
|
||||
) throws IOException {
|
||||
|
||||
super(params);
|
||||
|
@ -66,6 +68,7 @@ public class ControlService extends Service {
|
|||
messageQueueService.register();
|
||||
sysActionsService.register();
|
||||
dataSetsService.register();
|
||||
controlDomainRankingSetsService.register();
|
||||
|
||||
// node
|
||||
controlFileStorageService.register();
|
||||
|
@ -79,6 +82,8 @@ public class ControlService extends Service {
|
|||
domainComplaintService.register();
|
||||
randomExplorationService.register();
|
||||
|
||||
errorHandler.register();
|
||||
|
||||
var indexRenderer = rendererFactory.renderer("control/index");
|
||||
var eventsRenderer = rendererFactory.renderer("control/sys/events");
|
||||
var serviceByIdRenderer = rendererFactory.renderer("control/sys/service-by-id");
|
||||
|
@ -104,6 +109,7 @@ public class ControlService extends Service {
|
|||
|
||||
Spark.get("/public/:resource", this::serveStatic);
|
||||
|
||||
|
||||
monitors.subscribe(this::logMonitorStateChange);
|
||||
|
||||
controlActorService.startDefaultActors();
|
||||
|
|
|
@ -0,0 +1,15 @@
|
|||
package nu.marginalia.control;
|
||||
|
||||
/**
 * Signals that a request to the control service failed validation.
 *
 * <p>Carries a short title (also used as the exception message), a longer
 * explanation, and a path the user can be sent back to.
 */
public class ControlValidationError extends RuntimeException {
    /** Short headline of the error; identical to {@link #getMessage()}. */
    public final String title;
    /** Longer, human-readable explanation of what went wrong. */
    public final String messageLong;
    /** Path the user should be offered to return to. */
    public final String redirect;

    /**
     * @param title       short headline, also passed on as the exception message
     * @param messageLong longer explanation of the problem
     * @param redirect    path to offer as the way back
     */
    public ControlValidationError(String title, String messageLong, String redirect) {
        super(title);

        this.redirect = redirect;
        this.messageLong = messageLong;
        this.title = title;
    }
}
|
|
@ -8,6 +8,7 @@ public class Redirects {
|
|||
public static final HtmlRedirect redirectToOverview = new HtmlRedirect("/");
|
||||
public static final HtmlRedirect redirectToBlacklist = new HtmlRedirect("/blacklist");
|
||||
public static final HtmlRedirect redirectToComplaints = new HtmlRedirect("/complaints");
|
||||
public static final HtmlRedirect redirectToRankingDataSets = new HtmlRedirect("/domain-ranking-sets");
|
||||
public static final HtmlRedirect redirectToMessageQueue = new HtmlRedirect("/message-queue");
|
||||
|
||||
public static class HtmlRedirect implements ResponseTransformer {
|
||||
|
|
|
@ -79,7 +79,7 @@ public class SearchToBanService {
|
|||
private Object executeQuery(Context ctx, String query) {
|
||||
return queryClient.search(ctx, new QueryParams(
|
||||
query, new QueryLimits(2, 200, 250, 8192),
|
||||
SearchSetIdentifier.NONE
|
||||
"NONE"
|
||||
));
|
||||
}
|
||||
}
|
||||
|
|
|
@ -360,22 +360,29 @@ public class ControlNodeService {
|
|||
}
|
||||
|
||||
private List<FileStorageBaseWithStorage> makeFileStorageBaseWithStorage(List<FileStorageId> storageIds) throws SQLException {
|
||||
|
||||
Map<FileStorageBaseId, FileStorageBase> fileStorageBaseByBaseId = new HashMap<>();
|
||||
Map<FileStorageBaseId, List<FileStorageWithActions>> fileStoragByBaseId = new HashMap<>();
|
||||
Map<FileStorageBaseId, List<FileStorageWithActions>> fileStorageByBaseId = new HashMap<>();
|
||||
|
||||
for (var id : storageIds) {
|
||||
var storage = fileStorageService.getStorage(id);
|
||||
fileStorageBaseByBaseId.computeIfAbsent(storage.base().id(), k -> storage.base());
|
||||
fileStoragByBaseId.computeIfAbsent(storage.base().id(), k -> new ArrayList<>()).add(new FileStorageWithActions(storage));
|
||||
fileStorageByBaseId.computeIfAbsent(storage.base().id(), k -> new ArrayList<>()).add(new FileStorageWithActions(storage));
|
||||
}
|
||||
|
||||
List<FileStorageBaseWithStorage> result = new ArrayList<>();
|
||||
for (var baseId : fileStorageBaseByBaseId.keySet()) {
|
||||
result.add(new FileStorageBaseWithStorage(fileStorageBaseByBaseId.get(baseId),
|
||||
fileStoragByBaseId.get(baseId)
|
||||
|
||||
));
|
||||
for (var baseId : fileStorageBaseByBaseId.keySet()) {
|
||||
var base = fileStorageBaseByBaseId.get(baseId);
|
||||
var items = fileStorageByBaseId.get(baseId);
|
||||
|
||||
// Sort by timestamp, then by relPath
|
||||
// this ensures that the newest file is listed last
|
||||
items.sort(Comparator
|
||||
.comparing(FileStorageWithActions::getTimestamp)
|
||||
.thenComparing(FileStorageWithActions::getRelPath)
|
||||
);
|
||||
|
||||
result.add(new FileStorageBaseWithStorage(base, items));
|
||||
}
|
||||
|
||||
return result;
|
||||
|
|
|
@ -0,0 +1,111 @@
|
|||
package nu.marginalia.control.sys.svc;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import nu.marginalia.control.ControlRendererFactory;
|
||||
import nu.marginalia.control.ControlValidationError;
|
||||
import nu.marginalia.control.Redirects;
|
||||
import nu.marginalia.db.DomainRankingSetsService;
|
||||
import spark.Request;
|
||||
import spark.Response;
|
||||
import spark.Spark;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.sql.SQLException;
|
||||
import java.util.Map;
|
||||
|
||||
public class ControlDomainRankingSetsService {
|
||||
private final HikariDataSource dataSource;
|
||||
private final ControlRendererFactory rendererFactory;
|
||||
private final DomainRankingSetsService domainRankingSetsService;
|
||||
|
||||
@Inject
|
||||
public ControlDomainRankingSetsService(HikariDataSource dataSource,
|
||||
ControlRendererFactory rendererFactory,
|
||||
DomainRankingSetsService domainRankingSetsService) {
|
||||
this.dataSource = dataSource;
|
||||
this.rendererFactory = rendererFactory;
|
||||
this.domainRankingSetsService = domainRankingSetsService;
|
||||
}
|
||||
|
||||
public void register() throws IOException {
|
||||
var datasetsRenderer = rendererFactory.renderer("control/sys/domain-ranking-sets");
|
||||
var updateDatasetRenderer = rendererFactory.renderer("control/sys/update-domain-ranking-set");
|
||||
var newDatasetRenderer = rendererFactory.renderer("control/sys/new-domain-ranking-set");
|
||||
|
||||
Spark.get("/public/domain-ranking-sets", this::rankingSetsModel, datasetsRenderer::render);
|
||||
Spark.get("/public/domain-ranking-sets/new", (rq,rs) -> new Object(), newDatasetRenderer::render);
|
||||
Spark.get("/public/domain-ranking-sets/:id", this::rankingSetModel, updateDatasetRenderer::render);
|
||||
Spark.post("/public/domain-ranking-sets/:id", this::alterSetModel, Redirects.redirectToRankingDataSets);
|
||||
}
|
||||
|
||||
private Object alterSetModel(Request request, Response response) throws SQLException {
|
||||
final String act = request.queryParams("act");
|
||||
final String id = request.params("id");
|
||||
|
||||
if ("update".equals(act)) {
|
||||
domainRankingSetsService.upsert(new DomainRankingSetsService.DomainRankingSet(
|
||||
id,
|
||||
request.queryParams("description"),
|
||||
DomainRankingSetsService.DomainSetAlgorithm.valueOf(request.queryParams("algorithm")),
|
||||
Integer.parseInt(request.queryParams("depth")),
|
||||
request.queryParams("definition")
|
||||
));
|
||||
return "";
|
||||
}
|
||||
else if ("delete".equals(act)) {
|
||||
var model = domainRankingSetsService.get(id).orElseThrow();
|
||||
if (model.isSpecial()) {
|
||||
throw new ControlValidationError("Cannot delete special ranking set",
|
||||
"""
|
||||
SPECIAL data sets are reserved by the system and can not be deleted.
|
||||
""",
|
||||
"/domain-ranking-sets");
|
||||
}
|
||||
domainRankingSetsService.delete(model);
|
||||
return "";
|
||||
}
|
||||
else if ("create".equals(act)) {
|
||||
if (domainRankingSetsService.get(request.queryParams("name")).isPresent()) {
|
||||
throw new ControlValidationError("Ranking set with that name already exists",
|
||||
"""
|
||||
Ensure the new data set has a unique name and try again.
|
||||
""",
|
||||
"/domain-ranking-sets");
|
||||
}
|
||||
|
||||
domainRankingSetsService.upsert(new DomainRankingSetsService.DomainRankingSet(
|
||||
request.queryParams("name").toUpperCase(),
|
||||
request.queryParams("description"),
|
||||
DomainRankingSetsService.DomainSetAlgorithm.valueOf(request.queryParams("algorithm")),
|
||||
Integer.parseInt(request.queryParams("depth")),
|
||||
request.queryParams("definition")
|
||||
));
|
||||
return "";
|
||||
}
|
||||
|
||||
throw new ControlValidationError("Unknown action", """
|
||||
An unknown action was requested and the system does not understand how to act on it.
|
||||
""",
|
||||
"/domain-ranking-sets");
|
||||
}
|
||||
|
||||
private Object rankingSetsModel(Request request, Response response) {
|
||||
return Map.of("rankingSets", domainRankingSetsService.getAll());
|
||||
}
|
||||
private Object rankingSetModel(Request request, Response response) throws SQLException {
|
||||
var model = domainRankingSetsService.get(request.params("id")).orElseThrow();
|
||||
return Map.of("rankingSet", model,
|
||||
"selectedAlgo", Map.of(
|
||||
"special", model.algorithm() == DomainRankingSetsService.DomainSetAlgorithm.SPECIAL,
|
||||
"adjacency_cheirank", model.algorithm() == DomainRankingSetsService.DomainSetAlgorithm.ADJACENCY_CHEIRANK,
|
||||
"adjacency_pagerank", model.algorithm() == DomainRankingSetsService.DomainSetAlgorithm.ADJACENCY_PAGERANK,
|
||||
"links_cheirank", model.algorithm() == DomainRankingSetsService.DomainSetAlgorithm.LINKS_CHEIRANK,
|
||||
"links_pagerank", model.algorithm() == DomainRankingSetsService.DomainSetAlgorithm.LINKS_PAGERANK)
|
||||
);
|
||||
|
||||
|
||||
|
||||
|
||||
}
|
||||
}
|
|
@ -0,0 +1,35 @@
|
|||
package nu.marginalia.control.sys.svc;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import nu.marginalia.control.ControlRendererFactory;
|
||||
import nu.marginalia.control.ControlValidationError;
|
||||
import spark.Request;
|
||||
import spark.Response;
|
||||
import spark.Spark;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
public class ControlErrorHandler {
|
||||
private final ControlRendererFactory.Renderer renderer;
|
||||
|
||||
@Inject
|
||||
public ControlErrorHandler(ControlRendererFactory rendererFactory) {
|
||||
this.renderer = rendererFactory.renderer("control/error");
|
||||
}
|
||||
|
||||
public void render(ControlValidationError error, Request request, Response response) {
|
||||
String text = renderer.render(
|
||||
Map.of(
|
||||
"title", error.title,
|
||||
"messageLong", error.messageLong,
|
||||
"redirect", error.redirect
|
||||
)
|
||||
);
|
||||
|
||||
response.body(text);
|
||||
}
|
||||
|
||||
public void register() {
|
||||
Spark.exception(ControlValidationError.class, this::render);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,25 @@
|
|||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<title>Control Service: Error</title>
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
||||
<link rel="stylesheet" href="/style.css" />
|
||||
{{> control/partials/head-includes }}
|
||||
</head>
|
||||
<body>
|
||||
{{> control/partials/nav}}
|
||||
<div class="container">
|
||||
<h1 class="my-3">Error: {{title}}</h1>
|
||||
<div class="my-3 p-3 border bg-light">
|
||||
<p>{{messageLong}}</p>
|
||||
<a href="{{redirect}}">Go back</a>
|
||||
</div>
|
||||
</div>
|
||||
</body>
|
||||
{{> control/partials/foot-includes }}
|
||||
<script>
|
||||
window.setInterval(() => {
|
||||
refresh(["processes", "services", "jobs", "events"]);
|
||||
}, 2000);
|
||||
</script>
|
||||
</html>
|
|
@ -34,7 +34,8 @@
|
|||
<a href="#" class="nav-link dropdown-toggle" data-bs-toggle="dropdown" role="button" aria-expanded="false">System</a>
|
||||
<ul class="dropdown-menu">
|
||||
<li><a class="dropdown-item" href="/actions" title="System actions">Actions</a></li>
|
||||
<li><a class="dropdown-item" href="/datasets" title="View and update the data sets">Datasets</a></li>
|
||||
<li><a class="dropdown-item" href="/datasets" title="View and update the data sets">Data Sets</a></li>
|
||||
<li><a class="dropdown-item" href="/domain-ranking-sets" title="View and update the domain ranking sets">Domain Ranking Sets</a></li>
|
||||
<li><a class="dropdown-item" href="/events" title="View the event log">Events</a></li>
|
||||
<li><a class="dropdown-item" href="/message-queue" title="View or manipulate the system message queue">Message Queue</a></li>
|
||||
</ul>
|
||||
|
|
|
@ -0,0 +1,53 @@
|
|||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<title>Control Service</title>
|
||||
{{> control/partials/head-includes }}
|
||||
</head>
|
||||
<body>
|
||||
{{> control/partials/nav}}
|
||||
<div class="container">
|
||||
<h1 class="my-3">Domain Ranking Sets</h1>
|
||||
<div class="border my-3 p-3 bg-light">
|
||||
Domain ranking sets configure the ranking algorithms used to determine the importance of a domain.
|
||||
</div>
|
||||
|
||||
<table class="table">
|
||||
<tr>
|
||||
<th>Name</th>
|
||||
<th>Description</th>
|
||||
<th>Algorithm</th>
|
||||
<th>Depth</th>
|
||||
</tr>
|
||||
{{#each rankingSets}}
|
||||
<tr>
|
||||
<td><a href="/domain-ranking-sets/{{name}}">{{name}}</a></td>
|
||||
<td>{{description}}</td>
|
||||
<td>{{algorithm}}</td>
|
||||
<td>{{depth}}</td>
|
||||
</tr>
|
||||
{{/each}}
|
||||
</table>
|
||||
|
||||
<div class="my-3">
|
||||
<a href="/domain-ranking-sets/new" class="btn btn-primary">New Domain Ranking Set</a>
|
||||
</div>
|
||||
|
||||
|
||||
<div class="border my-3 p-3 bg-light">
|
||||
<p>Several reserved ranking sets are available for use in the query parameters.</p>
|
||||
<dl>
|
||||
<dt>NONE</dt><dd>Placeholder for no restriction on the domains returned.
|
||||
Does nothing, and exists only to prevent a new ranking
|
||||
set from being created with this name.</dd>
|
||||
<dt>RANK</dt><dd>Used to calculate the domain ranking for a given domain.
|
||||
This affects the order they are stored in the index, and increases the odds they'll
|
||||
even be considered within the time restrictions of the query.</dd>
|
||||
<dt>BLOGS</dt><dd>Returns a fixed list of domains, configurable in <a href="/datasets">Datasets</a>.
|
||||
Changes to this list will not be reflected in the index until the next time the index is rebuilt.</dd>
|
||||
</dl>
|
||||
</div>
|
||||
</div>
|
||||
</body>
|
||||
{{> control/partials/foot-includes }}
|
||||
</html>
|
|
@ -0,0 +1,75 @@
|
|||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<title>Control Service</title>
|
||||
{{> control/partials/head-includes }}
|
||||
</head>
|
||||
<body>
|
||||
{{> control/partials/nav}}
|
||||
<div class="container">
|
||||
<h1 class="my-3">Create Domain Ranking Set</h1>
|
||||
<form method="post" action="?act=create">
|
||||
<table class="table">
|
||||
<tr>
|
||||
<th><label for="name">Name</label></th>
|
||||
<td>
|
||||
<input pattern="\w+" type="text" value="{{name}}" id="name" name="name" style="text-transform: uppercase" />
|
||||
<div>
|
||||
<small class="text-muted">May only contain letters, digits and underscores.
|
||||
The name is how the ranking set is identified in the query parameters,
|
||||
and also decides the file name of the persisted ranking set definition. Keep it simple.</small>
|
||||
</div>
|
||||
</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<th><label for="algorithm">Algorithm</label></th>
|
||||
<td>
|
||||
<select id="algorithm" name="algorithm">
|
||||
<option value="LINKS_PAGERANK">LINKS_PAGERANK</option>
|
||||
<option value="LINKS_CHEIRANK">LINKS_CHEIRANK</option>
|
||||
<option value="ADJACENCY_PAGERANK">ADJACENCY_PAGERANK</option>
|
||||
<option value="ADJACENCY_CHEIRANK">ADJACENCY_CHEIRANK</option>
|
||||
</select>
|
||||
<div>
|
||||
<small class="text-muted">
|
||||
The algorithm used to rank the domains. The LINKS algorithms use the link graph, and the ADJACENCY
|
||||
algorithms use the adjacency graph. CheiRank is a variant of PageRank that uses the reversed graph.
|
||||
</small>
|
||||
</div>
|
||||
</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<th><label for="description">Description</label></th>
|
||||
<td>
|
||||
<input type="text" value="{{description}}" id="description" name="description" {{#if special}}disabled{{/if}} />
|
||||
<div>
|
||||
<small class="text-muted">This is purely to help keep track of what this ranking set does.</small>
|
||||
</div>
|
||||
</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<th><label for="depth">Depth</label></th>
|
||||
<td>
|
||||
<input pattern="\d+" type="text" value="{{depth}}" id="depth" name="depth" />
|
||||
<div>
|
||||
<small class="text-muted">Number. Up to this number of domains are ranked, the rest are excluded.</small>
|
||||
</div>
|
||||
</td>
|
||||
</tr>
|
||||
<tr><th colspan="2"><label for="definition">Definition</label></th></tr>
|
||||
<tr><td colspan="2">
|
||||
<textarea name="definition" id="definition" rows="10" style="width: 100%">{{definition}}</textarea>
|
||||
<div>
|
||||
<small class="text-muted">A list of domain names, one per line, possibly globbed with SQL-style '%' wildcards.
|
||||
These are used as the origin point for the Personalized PageRank algorithm, and will be considered
|
||||
the central points of the link or adjacency graph. If no domains are specified, the entire domain space is used, as per the PageRank paper.
|
||||
</small>
|
||||
</div>
|
||||
</td></tr>
|
||||
</table>
|
||||
<button type="submit" class="btn btn-primary">Create</button>
|
||||
</form>
|
||||
</div>
|
||||
</body>
|
||||
{{> control/partials/foot-includes }}
|
||||
</html>
|
|
@ -0,0 +1,88 @@
|
|||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<title>Control Service</title>
|
||||
{{> control/partials/head-includes }}
|
||||
</head>
|
||||
<body>
|
||||
{{> control/partials/nav}}
|
||||
<div class="container">
|
||||
{{#with rankingSet}}
|
||||
<h1 class="my-3">Domain Ranking Set: {{name}}</h1>
|
||||
<form method="post" action="?act=update" id="update-form">
|
||||
<table class="table">
|
||||
<tr>
|
||||
<th><label for="name">Name</label></th>
|
||||
<td>
|
||||
{{#if special}}<input type="hidden" name="name" value="{{name}}" />{{/if}}
|
||||
<input type="text" value="{{name}}" id="name" name="name" {{#if special}}disabled{{/if}} />
|
||||
<div>
|
||||
<small class="text-muted">The name is how the ranking set is identified in the query parameters,
|
||||
and also decides the file name of the persisted ranking set definition. Keep it simple.</small>
|
||||
</div>
|
||||
</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<th><label for="algorithm">Algorithm</label></th>
|
||||
<td>
|
||||
{{#if special}}<input type="hidden" name="algorithm" value="{{algorithm}}" />{{/if}}
|
||||
<select id="algorithm" name="algorithm" {{#if special}}disabled{{/if}}>
|
||||
{{#with algorithm}}
|
||||
<option value="SPECIAL" disabled {{#if selectedAlgo.special}}selected{{/if}}>SPECIAL</option>
|
||||
<option value="LINKS_PAGERANK" {{#if selectedAlgo.links_pagerank}}selected{{/if}}>LINKS_PAGERANK</option>
|
||||
<option value="LINKS_CHEIRANK" {{#if selectedAlgo.links_cheirank}}selected{{/if}}>LINKS_CHEIRANK</option>
|
||||
<option value="ADJACENCY_PAGERANK" {{#if selectedAlgo.adjacency_pagerank}}selected{{/if}}>ADJACENCY_PAGERANK</option>
|
||||
<option value="ADJACENCY_CHEIRANK" {{#if selectedAlgo.adjacency_cheirank}}selected{{/if}}>ADJACENCY_CHEIRANK</option>
|
||||
{{/with}}
|
||||
</select>
|
||||
<div>
|
||||
<small class="text-muted">
|
||||
The algorithm used to rank the domains. The LINKS algorithms use the link graph, and the ADJACENCY
|
||||
algorithms use the adjacency graph. CheiRank is a variant of PageRank that uses the reversed graph.
|
||||
</small>
|
||||
</div>
|
||||
</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<th><label for="description">Description</label></th>
|
||||
<td>
|
||||
{{#if special}}<input type="hidden" name="description" value="{{description}}" />{{/if}}
|
||||
<input type="text" value="{{description}}" id="description" name="description" {{#if special}}disabled{{/if}} />
|
||||
<div>
|
||||
<small class="text-muted">This is purely to help keep track of what this ranking set does.</small>
|
||||
</div>
|
||||
</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<th><label for="depth">Depth</label></th>
|
||||
<td>
|
||||
<input pattern="\d+" type="text" value="{{depth}}" id="depth" name="depth" />
|
||||
<div>
|
||||
<small class="text-muted">Number. Up to this number of domains are ranked, the rest are excluded.</small>
|
||||
</div>
|
||||
</td>
|
||||
</tr>
|
||||
<tr><th colspan="2"><label for="definition">Definition</label></th></tr>
|
||||
<tr><td colspan="2">
|
||||
<textarea name="definition" id="definition" rows="10" style="width: 100%">{{definition}}</textarea>
|
||||
<div>
|
||||
<small class="text-muted">A list of domain names, one per line, possibly globbed with SQL-style '%' wildcards.
|
||||
These are used as the origin point for the Personalized PageRank algorithm, and will be considered
|
||||
the central points of the link or adjacency graph. If no domains are specified, the entire domain space is used, as per the PageRank paper.
|
||||
</small>
|
||||
</div>
|
||||
</td></tr>
|
||||
</table>
|
||||
|
||||
</form>
|
||||
<form method="post" action="?act=delete" id="delete-form"></form>
|
||||
|
||||
<button type="submit" class="btn btn-danger" form="delete-form" style="float:right" {{#if special}}disabled title="Cannot delete special sets!"{{/if}} onclick="return confirm('Confirm deletion of ranking set')">Delete</button>
|
||||
<button type="submit" class="btn btn-primary" form="update-form">Update</button>
|
||||
|
||||
|
||||
{{/with}}
|
||||
</div>
|
||||
</body>
|
||||
{{> control/partials/foot-includes }}
|
||||
</html>
|
|
@ -6,14 +6,10 @@ import com.google.inject.Singleton;
|
|||
import com.google.inject.name.Named;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import nu.marginalia.linkdb.dlinks.DomainLinkDb;
|
||||
import nu.marginalia.linkdb.dlinks.FileDomainLinkDb;
|
||||
import nu.marginalia.linkdb.dlinks.SelectingDomainLinkDb;
|
||||
import nu.marginalia.linkdb.dlinks.SqlDomainLinkDb;
|
||||
import nu.marginalia.service.module.ServiceConfiguration;
|
||||
import nu.marginalia.storage.FileStorageService;
|
||||
import nu.marginalia.IndexLocations;
|
||||
import nu.marginalia.index.config.RankingSettings;
|
||||
import nu.marginalia.WmsaHome;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
|
@ -30,12 +26,6 @@ public class IndexModule extends AbstractModule {
|
|||
public void configure() {
|
||||
}
|
||||
|
||||
@Provides
|
||||
public RankingSettings rankingSettings() {
|
||||
Path dir = WmsaHome.getHomePath().resolve("conf/ranking-settings.yaml");
|
||||
return RankingSettings.from(dir);
|
||||
}
|
||||
|
||||
@Provides
|
||||
@Singleton
|
||||
public DomainLinkDb domainLinkDb (
|
||||
|
|
|
@ -1,26 +0,0 @@
|
|||
package nu.marginalia.index.config;
|
||||
|
||||
import lombok.ToString;
|
||||
import org.yaml.snakeyaml.Yaml;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
|
||||
@ToString
|
||||
public class RankingSettings {
|
||||
public RankingSettingsEntry small;
|
||||
public RankingSettingsEntry retro;
|
||||
public RankingSettingsEntry standard;
|
||||
public RankingSettingsEntry academia;
|
||||
public RankingSettingsEntry ranking;
|
||||
|
||||
public static RankingSettings from(Path dir) {
|
||||
try {
|
||||
return new Yaml().loadAs(Files.readString(dir), RankingSettings.class);
|
||||
}
|
||||
catch (IOException ex) {
|
||||
throw new RuntimeException("Failed to load " + dir, ex);
|
||||
}
|
||||
}
|
||||
}
|
|
@ -1,11 +0,0 @@
|
|||
package nu.marginalia.index.config;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
/**
 * One named section of the ranking settings: a set of domains and a size limit.
 *
 * <p>The fields are public and mutable so they can be populated directly when
 * the settings file is bound.
 */
public class RankingSettingsEntry {
    /** Number of domains to include in ranking */
    public int max;

    /** Bias the ranking toward these domains */
    public List<String> domains;
}
|
|
@ -43,13 +43,16 @@ public class IndexOpsService {
|
|||
if (!run(searchSetService::recalculateAll)) {
|
||||
Spark.halt(503, "Operations busy");
|
||||
}
|
||||
|
||||
return "OK";
|
||||
}
|
||||
|
||||
public Object reindexEndpoint(Request request, Response response) throws Exception {
|
||||
|
||||
if (!run(index::switchIndex).isPresent()) {
|
||||
Spark.halt(503, "Operations busy");
|
||||
}
|
||||
|
||||
return "OK";
|
||||
}
|
||||
|
||||
|
|
|
@ -261,9 +261,7 @@ public class IndexQueryService extends IndexApiImplBase {
|
|||
return new SmallSearchSet(request.getDomainsList());
|
||||
}
|
||||
|
||||
return searchSetsService.getSearchSetByName(
|
||||
SearchSetIdentifier.valueOf(request.getSearchSetIdentifier())
|
||||
);
|
||||
return searchSetsService.getSearchSetByName(request.getSearchSetIdentifier());
|
||||
}
|
||||
private SearchResultSet executeSearch(SearchParameters params) throws SQLException {
|
||||
|
||||
|
|
|
@ -4,10 +4,11 @@ import com.google.inject.Inject;
|
|||
import com.google.inject.Singleton;
|
||||
import gnu.trove.list.TIntList;
|
||||
import it.unimi.dsi.fastutil.ints.IntOpenHashSet;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.db.DomainRankingSetsService;
|
||||
import nu.marginalia.db.DomainTypes;
|
||||
import nu.marginalia.index.IndexServicesFactory;
|
||||
import nu.marginalia.index.searchset.SearchSet;
|
||||
import nu.marginalia.ranking.RankingAlgorithm;
|
||||
import nu.marginalia.ranking.ReversePageRank;
|
||||
import nu.marginalia.ranking.StandardPageRank;
|
||||
import nu.marginalia.ranking.accumulator.RankingResultHashMapAccumulator;
|
||||
|
@ -16,31 +17,32 @@ import nu.marginalia.ranking.data.RankingDomainFetcher;
|
|||
import nu.marginalia.ranking.data.RankingDomainFetcherForSimilarityData;
|
||||
import nu.marginalia.index.svc.searchset.RankingSearchSet;
|
||||
import nu.marginalia.index.svc.searchset.SearchSetAny;
|
||||
import nu.marginalia.index.config.RankingSettings;
|
||||
import nu.marginalia.ranking.DomainRankings;
|
||||
import nu.marginalia.index.client.model.query.SearchSetIdentifier;
|
||||
import nu.marginalia.index.db.DbUpdateRanks;
|
||||
import nu.marginalia.service.control.ServiceEventLog;
|
||||
import nu.marginalia.service.control.ServiceHeartbeat;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.sql.SQLException;
|
||||
import java.util.Objects;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
|
||||
@Singleton
|
||||
public class IndexSearchSetsService {
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
private final DomainTypes domainTypes;
|
||||
private final ServiceHeartbeat heartbeat;
|
||||
private final IndexServicesFactory indexServicesFactory;
|
||||
private final ServiceEventLog eventLog;
|
||||
private final DomainRankingSetsService domainRankingSetsService;
|
||||
private final DbUpdateRanks dbUpdateRanks;
|
||||
private final RankingDomainFetcher similarityDomains;
|
||||
private final RankingSettings rankingSettings;
|
||||
|
||||
private final RankingDomainFetcher linksDomains;
|
||||
|
||||
private final ConcurrentHashMap<String, SearchSet> rankingSets = new ConcurrentHashMap<>();
|
||||
// Below are binary indices that are used to constrain a search
|
||||
private volatile RankingSearchSet popularSet;
|
||||
private volatile RankingSearchSet smallWebSet;
|
||||
private volatile RankingSearchSet academiaSet;
|
||||
private volatile RankingSearchSet blogsSet;
|
||||
private final SearchSet anySet = new SearchSetAny();
|
||||
|
||||
// The ranking value of the domains used in sorting the domains
|
||||
|
@ -51,83 +53,121 @@ public class IndexSearchSetsService {
|
|||
ServiceHeartbeat heartbeat,
|
||||
RankingDomainFetcher rankingDomains,
|
||||
RankingDomainFetcherForSimilarityData similarityDomains,
|
||||
RankingSettings rankingSettings,
|
||||
IndexServicesFactory servicesFactory,
|
||||
IndexServicesFactory indexServicesFactory,
|
||||
ServiceEventLog eventLog,
|
||||
DomainRankingSetsService domainRankingSetsService,
|
||||
DbUpdateRanks dbUpdateRanks) throws IOException {
|
||||
this.domainTypes = domainTypes;
|
||||
this.heartbeat = heartbeat;
|
||||
this.indexServicesFactory = indexServicesFactory;
|
||||
this.eventLog = eventLog;
|
||||
this.domainRankingSetsService = domainRankingSetsService;
|
||||
|
||||
this.dbUpdateRanks = dbUpdateRanks;
|
||||
|
||||
if (similarityDomains.hasData()) {
|
||||
this.similarityDomains = similarityDomains;
|
||||
this.linksDomains = rankingDomains;
|
||||
}
|
||||
else {
|
||||
// on test environments the cosine similarity graph may not be present
|
||||
logger.info("Domain similarity is not present, falling back on link graph");
|
||||
this.similarityDomains = rankingDomains;
|
||||
this.linksDomains = rankingDomains;
|
||||
}
|
||||
|
||||
this.rankingSettings = rankingSettings;
|
||||
|
||||
smallWebSet = new RankingSearchSet(SearchSetIdentifier.SMALLWEB, servicesFactory.getSearchSetsBase().resolve("small-web.dat"));
|
||||
academiaSet = new RankingSearchSet(SearchSetIdentifier.ACADEMIA, servicesFactory.getSearchSetsBase().resolve("academia.dat"));
|
||||
popularSet = new RankingSearchSet(SearchSetIdentifier.POPULAR, servicesFactory.getSearchSetsBase().resolve("popular.dat"));
|
||||
blogsSet = new RankingSearchSet(SearchSetIdentifier.BLOGS, servicesFactory.getSearchSetsBase().resolve("blogs.dat"));
|
||||
for (var rankingSet : domainRankingSetsService.getAll()) {
|
||||
rankingSets.put(rankingSet.name(),
|
||||
new RankingSearchSet(rankingSet.name(),
|
||||
rankingSet.fileName(indexServicesFactory.getSearchSetsBase())
|
||||
)
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
public DomainRankings getDomainRankings() {
|
||||
return domainRankings;
|
||||
}
|
||||
|
||||
public SearchSet getSearchSetByName(SearchSetIdentifier searchSetIdentifier) {
|
||||
public SearchSet getSearchSetByName(String searchSetIdentifier) {
|
||||
|
||||
if (null == searchSetIdentifier) {
|
||||
return anySet;
|
||||
}
|
||||
return switch (searchSetIdentifier) {
|
||||
case NONE -> anySet;
|
||||
case POPULAR -> popularSet;
|
||||
case ACADEMIA -> academiaSet;
|
||||
case SMALLWEB -> smallWebSet;
|
||||
case BLOGS -> blogsSet;
|
||||
};
|
||||
|
||||
if ("NONE".equals(searchSetIdentifier) || "".equals(searchSetIdentifier)) {
|
||||
return anySet;
|
||||
}
|
||||
|
||||
enum RepartitionSteps {
|
||||
UPDATE_ACADEMIA,
|
||||
UPDATE_POPULAR,
|
||||
UPDATE_SMALL_WEB,
|
||||
UPDATE_BLOGS,
|
||||
UPDATE_RANKINGS,
|
||||
FINISHED
|
||||
return Objects.requireNonNull(rankingSets.get(searchSetIdentifier), "Unknown search set");
|
||||
}
|
||||
|
||||
public void recalculateAll() {
|
||||
try (var processHeartbeat = heartbeat.createServiceTaskHeartbeat(RepartitionSteps.class, "repartitionAll")) {
|
||||
|
||||
processHeartbeat.progress(RepartitionSteps.UPDATE_ACADEMIA);
|
||||
updateAcademiaDomainsSet();
|
||||
|
||||
processHeartbeat.progress(RepartitionSteps.UPDATE_POPULAR);
|
||||
updatePopularDomainsSet();
|
||||
|
||||
processHeartbeat.progress(RepartitionSteps.UPDATE_SMALL_WEB);
|
||||
updateSmallWebDomainsSet();
|
||||
|
||||
processHeartbeat.progress(RepartitionSteps.UPDATE_BLOGS);
|
||||
updateBlogsSet();
|
||||
|
||||
processHeartbeat.progress(RepartitionSteps.UPDATE_RANKINGS);
|
||||
updateDomainRankings();
|
||||
|
||||
processHeartbeat.progress(RepartitionSteps.FINISHED);
|
||||
for (var rankingSet : domainRankingSetsService.getAll()) {
|
||||
try {
|
||||
if (DomainRankingSetsService.DomainSetAlgorithm.SPECIAL.equals(rankingSet.algorithm())) {
|
||||
switch (rankingSet.name()) {
|
||||
case "BLOGS" -> recalculateBlogsSet(rankingSet);
|
||||
case "RANK" -> updateDomainRankings(rankingSet);
|
||||
case "NONE" -> {}
|
||||
}
|
||||
} else {
|
||||
recalculateNornal(rankingSet);
|
||||
}
|
||||
}
|
||||
catch (Exception ex) {
|
||||
logger.warn("Failed to recalculate ranking set {}", rankingSet.name(), ex);
|
||||
}
|
||||
eventLog.logEvent("RANKING-SET-RECALCULATED", rankingSet.name());
|
||||
}
|
||||
}
|
||||
|
||||
private void updateDomainRankings() {
|
||||
var entry = rankingSettings.ranking;
|
||||
private void recalculateNornal(DomainRankingSetsService.DomainRankingSet rankingSet) {
|
||||
String[] domains = rankingSet.domains();
|
||||
|
||||
var spr = new StandardPageRank(similarityDomains, entry.domains.toArray(String[]::new));
|
||||
var ranks = spr.pageRankWithPeripheralNodes(entry.max, () -> new RankingResultHashMapAccumulator(100_000));
|
||||
RankingAlgorithm rankingAlgorithm = switch (rankingSet.algorithm()) {
|
||||
case LINKS_PAGERANK -> new StandardPageRank(linksDomains, domains);
|
||||
case LINKS_CHEIRANK -> new ReversePageRank(linksDomains, domains);
|
||||
case ADJACENCY_PAGERANK -> new StandardPageRank(similarityDomains, domains);
|
||||
case ADJACENCY_CHEIRANK -> new ReversePageRank(similarityDomains, domains);
|
||||
default -> throw new IllegalStateException("Unexpected value: " + rankingSet.algorithm());
|
||||
};
|
||||
|
||||
var data = rankingAlgorithm.pageRankWithPeripheralNodes(rankingSet.depth(), RankingResultHashSetAccumulator::new);
|
||||
|
||||
var set = new RankingSearchSet(rankingSet.name(), rankingSet.fileName(indexServicesFactory.getSearchSetsBase()), data);
|
||||
rankingSets.put(rankingSet.name(), set);
|
||||
|
||||
try {
|
||||
set.write();
|
||||
}
|
||||
catch (IOException ex) {
|
||||
logger.warn("Failed to write search set", ex);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
private void recalculateBlogsSet(DomainRankingSetsService.DomainRankingSet rankingSet) throws SQLException, IOException {
|
||||
TIntList knownDomains = domainTypes.getKnownDomainsByType(DomainTypes.Type.BLOG);
|
||||
|
||||
if (knownDomains.isEmpty()) {
|
||||
// FIXME: We don't want to reload the entire list every time, but we do want to do it sometimes. Actor maybe?
|
||||
domainTypes.reloadDomainsList(DomainTypes.Type.BLOG);
|
||||
knownDomains = domainTypes.getKnownDomainsByType(DomainTypes.Type.BLOG);
|
||||
}
|
||||
|
||||
synchronized (this) {
|
||||
var blogSet = new RankingSearchSet(rankingSet.name(), rankingSet.fileName(indexServicesFactory.getSearchSetsBase()), new IntOpenHashSet(knownDomains.toArray()));
|
||||
rankingSets.put(rankingSet.name(), blogSet);
|
||||
blogSet.write();
|
||||
}
|
||||
}
|
||||
|
||||
private void updateDomainRankings(DomainRankingSetsService.DomainRankingSet rankingSet) {
|
||||
|
||||
var spr = new StandardPageRank(similarityDomains, rankingSet.domains());
|
||||
var ranks = spr.pageRankWithPeripheralNodes(rankingSet.depth(), () -> new RankingResultHashMapAccumulator(rankingSet.depth()));
|
||||
|
||||
synchronized (this) {
|
||||
domainRankings = new DomainRankings(ranks);
|
||||
|
@ -138,60 +178,4 @@ public class IndexSearchSetsService {
|
|||
dbUpdateRanks.execute(ranks);
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
public void updatePopularDomainsSet() {
|
||||
var entry = rankingSettings.retro;
|
||||
|
||||
var spr = new StandardPageRank(similarityDomains, entry.domains.toArray(String[]::new));
|
||||
var data = spr.pageRankWithPeripheralNodes(entry.max, RankingResultHashSetAccumulator::new);
|
||||
|
||||
synchronized (this) {
|
||||
popularSet = new RankingSearchSet(SearchSetIdentifier.POPULAR, popularSet.source, data);
|
||||
popularSet.write();
|
||||
}
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
public void updateSmallWebDomainsSet() {
|
||||
var entry = rankingSettings.small;
|
||||
|
||||
var rpr = new ReversePageRank(similarityDomains, entry.domains.toArray(String[]::new));
|
||||
rpr.setMaxKnownUrls(750);
|
||||
var data = rpr.pageRankWithPeripheralNodes(entry.max, RankingResultHashSetAccumulator::new);
|
||||
|
||||
synchronized (this) {
|
||||
smallWebSet = new RankingSearchSet(SearchSetIdentifier.SMALLWEB, smallWebSet.source, data);
|
||||
smallWebSet.write();
|
||||
}
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
public void updateBlogsSet() {
|
||||
TIntList knownDomains = domainTypes.getKnownDomainsByType(DomainTypes.Type.BLOG);
|
||||
|
||||
if (knownDomains.isEmpty()) {
|
||||
// FIXME: We don't want to reload the entire list every time, but we do want to do it sometimes. Actor maybe?
|
||||
domainTypes.reloadDomainsList(DomainTypes.Type.BLOG);
|
||||
knownDomains = domainTypes.getKnownDomainsByType(DomainTypes.Type.BLOG);
|
||||
}
|
||||
|
||||
synchronized (this) {
|
||||
blogsSet = new RankingSearchSet(SearchSetIdentifier.BLOGS, blogsSet.source, new IntOpenHashSet(knownDomains.toArray()));
|
||||
blogsSet.write();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public void updateAcademiaDomainsSet() {
|
||||
var entry = rankingSettings.academia;
|
||||
|
||||
var spr = new StandardPageRank(similarityDomains, entry.domains.toArray(String[]::new));
|
||||
var data = spr.pageRankWithPeripheralNodes(entry.max, RankingResultHashSetAccumulator::new);
|
||||
|
||||
synchronized (this) {
|
||||
academiaSet = new RankingSearchSet(SearchSetIdentifier.ACADEMIA, academiaSet.source, data);
|
||||
academiaSet.write();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -24,17 +24,17 @@ public class RankingSearchSet implements SearchSet {
|
|||
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
private final IntOpenHashSet set;
|
||||
public final SearchSetIdentifier identifier;
|
||||
public final String name;
|
||||
public final Path source;
|
||||
|
||||
public RankingSearchSet(SearchSetIdentifier identifier, Path source, IntOpenHashSet set) {
|
||||
this.identifier = identifier;
|
||||
public RankingSearchSet(String name, Path source, IntOpenHashSet set) {
|
||||
this.name = name;
|
||||
this.source = source;
|
||||
this.set = set;
|
||||
}
|
||||
|
||||
public RankingSearchSet(SearchSetIdentifier identifier, Path source) throws IOException {
|
||||
this.identifier = identifier;
|
||||
public RankingSearchSet(String name, Path source) throws IOException {
|
||||
this.name = name;
|
||||
this.source = source;
|
||||
|
||||
if (!Files.exists(source)) {
|
||||
|
@ -45,7 +45,7 @@ public class RankingSearchSet implements SearchSet {
|
|||
}
|
||||
|
||||
if (set.isEmpty()) {
|
||||
logger.warn("Search set {} is empty", identifier);
|
||||
logger.warn("Search set {} is empty", name);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -87,6 +87,6 @@ public class RankingSearchSet implements SearchSet {
|
|||
}
|
||||
|
||||
public String toString() {
|
||||
return identifier.toString();
|
||||
return name;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,61 +0,0 @@
|
|||
package nu.marginalia.index.model;
|
||||
|
||||
import nu.marginalia.index.config.RankingSettings;
|
||||
import org.junit.jupiter.api.AfterEach;
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.List;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
|
||||
class RankingSettingsTest {
|
||||
|
||||
Path tempFile;
|
||||
@BeforeEach
|
||||
void setUp() throws IOException {
|
||||
tempFile = Files.createTempFile(getClass().getSimpleName(), ".tmp");
|
||||
}
|
||||
|
||||
@AfterEach
|
||||
void tearDown() throws IOException {
|
||||
Files.delete(tempFile);
|
||||
}
|
||||
|
||||
@Test
|
||||
void testParseRankingSettings() throws IOException {
|
||||
Files.writeString(tempFile, """
|
||||
retro:
|
||||
max: 50
|
||||
domains:
|
||||
- "www.rep.routledge.com"
|
||||
- "www.personal.kent.edu"
|
||||
small:
|
||||
max: 10
|
||||
domains:
|
||||
- "bikobatanari.art"
|
||||
- "wiki.xxiivv.com"
|
||||
academia:
|
||||
max: 101
|
||||
domains:
|
||||
- "%edu"
|
||||
standard:
|
||||
max: 23
|
||||
domains:
|
||||
- "memex.marginalia.nu"
|
||||
""");
|
||||
|
||||
var settings = RankingSettings.from(tempFile);
|
||||
assertEquals(List.of("www.rep.routledge.com","www.personal.kent.edu"), settings.retro.domains);
|
||||
assertEquals(50, settings.retro.max);
|
||||
assertEquals(List.of("bikobatanari.art","wiki.xxiivv.com"), settings.small.domains);
|
||||
assertEquals(10, settings.small.max);
|
||||
assertEquals(List.of("bikobatanari.art","wiki.xxiivv.com"), settings.small.domains);
|
||||
assertEquals(List.of("%edu"), settings.academia.domains);
|
||||
assertEquals(List.of("memex.marginalia.nu"), settings.standard.domains);
|
||||
|
||||
}
|
||||
}
|
|
@ -129,7 +129,7 @@ public class IndexQueryServiceIntegrationSmokeTest {
|
|||
.domainCount(SpecificationLimit.none())
|
||||
.rankingParams(ResultRankingParameters.sensibleDefaults())
|
||||
.domains(new ArrayList<>())
|
||||
.searchSetIdentifier(SearchSetIdentifier.NONE)
|
||||
.searchSetIdentifier("NONE")
|
||||
.subqueries(List.of(new SearchSubquery(
|
||||
List.of("3", "5", "2"), List.of("4"), Collections.emptyList(), Collections.emptyList(),
|
||||
Collections.emptyList()))).build());
|
||||
|
@ -207,7 +207,7 @@ public class IndexQueryServiceIntegrationSmokeTest {
|
|||
.rank(SpecificationLimit.none())
|
||||
.domainCount(SpecificationLimit.none())
|
||||
.queryStrategy(QueryStrategy.SENTENCE)
|
||||
.searchSetIdentifier(SearchSetIdentifier.NONE)
|
||||
.searchSetIdentifier("NONE")
|
||||
.rankingParams(ResultRankingParameters.sensibleDefaults())
|
||||
.subqueries(List.of(new SearchSubquery(
|
||||
List.of("4"), Collections.emptyList(), Collections.emptyList(), Collections.emptyList(),
|
||||
|
|
|
@ -431,7 +431,7 @@ public class IndexQueryServiceIntegrationTest {
|
|||
.domainCount(SpecificationLimit.none())
|
||||
.rankingParams(ResultRankingParameters.sensibleDefaults())
|
||||
.domains(new ArrayList<>())
|
||||
.searchSetIdentifier(SearchSetIdentifier.NONE)
|
||||
.searchSetIdentifier("NONE")
|
||||
.subqueries(List.of());
|
||||
|
||||
return mutator.apply(builder).build();
|
||||
|
|
|
@ -69,7 +69,7 @@ public class IndexQueryServiceIntegrationTestModule extends AbstractModule {
|
|||
bind(ProcessHeartbeat.class).toInstance(new FakeProcessHeartbeat());
|
||||
|
||||
IndexSearchSetsService setsServiceMock = Mockito.mock(IndexSearchSetsService.class);
|
||||
when(setsServiceMock.getSearchSetByName(SearchSetIdentifier.NONE)).thenReturn(new SearchSetAny());
|
||||
when(setsServiceMock.getSearchSetByName("NONE")).thenReturn(new SearchSetAny());
|
||||
when(setsServiceMock.getDomainRankings()).thenReturn(new DomainRankings());
|
||||
bind(IndexSearchSetsService.class).toInstance(setsServiceMock);
|
||||
|
||||
|
|
|
@ -44,9 +44,13 @@ public class QueryBasicInterface {
|
|||
if (queryParam == null) {
|
||||
return renderer.render(new Object());
|
||||
}
|
||||
|
||||
int count = request.queryParams("count") == null ? 10 : Integer.parseInt(request.queryParams("count"));
|
||||
String set = request.queryParams("set") == null ? "" : request.queryParams("set");
|
||||
|
||||
var query = queryFactory.createQuery(new QueryParams(queryParam, new QueryLimits(
|
||||
1, 10, 250, 8192
|
||||
), SearchSetIdentifier.NONE));
|
||||
1, count, 250, 8192
|
||||
), set));
|
||||
|
||||
var rsp = indexClient.query(
|
||||
Context.fromRequest(request),
|
||||
|
|
|
@ -49,7 +49,7 @@ public class QueryFactoryTest {
|
|||
SpecificationLimit.none(),
|
||||
null,
|
||||
new QueryLimits(100, 100, 100, 100),
|
||||
SearchSetIdentifier.BLOGS)).specs;
|
||||
"NONE")).specs;
|
||||
}
|
||||
|
||||
@Test
|
||||
|
|
Loading…
Reference in a new issue