(db) Retire the EC_DOMAIN_LINK table

Retire the EC_DOMAIN_LINK table, as the data has been migrated off to a file instead.
This commit is contained in:
Viktor Lofgren 2024-02-08 15:52:30 +01:00
parent ef261cbbd7
commit b15f47d80e
6 changed files with 45 additions and 178 deletions
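
For context, a minimal sketch of what retiring the table means for callers: link lookups that previously went to EC_DOMAIN_LINK over SQL now go through the DomainLinkDb interface (findDestinations, findSources, countDestinations, countSources, all visible in the diffs below). The class and variable names here are illustrative, not taken from the codebase.

// Illustrative caller only: the injected DomainLinkDb replaces direct SQL
// queries against the retired EC_DOMAIN_LINK table.
import gnu.trove.list.array.TIntArrayList;
import nu.marginalia.linkdb.dlinks.DomainLinkDb;

class DomainLinkLookupExample {
    private final DomainLinkDb domainLinkDb;

    DomainLinkLookupExample(DomainLinkDb domainLinkDb) {
        this.domainLinkDb = domainLinkDb;
    }

    void printNeighbours(int domainId) {
        // Outgoing links, formerly SELECT DEST_DOMAIN_ID FROM EC_DOMAIN_LINK WHERE SOURCE_DOMAIN_ID=?
        TIntArrayList destinations = domainLinkDb.findDestinations(domainId);

        // Inbound link count, formerly a COUNT(*) against the same table
        int inbound = domainLinkDb.countSources(domainId);

        System.out.println("out=" + destinations.size() + " in=" + inbound);
    }
}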

@@ -0,0 +1 @@
DROP TABLE EC_DOMAIN_LINK;

@@ -1,9 +1,7 @@
package nu.marginalia.linkdb.dlinks;
import com.google.inject.name.Named;
import com.zaxxer.hikari.HikariDataSource;
import gnu.trove.list.array.TIntArrayList;
import nu.marginalia.service.module.ServiceConfiguration;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -12,18 +10,17 @@ import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardCopyOption;
/** DomainLinkDb that delegates to either a FileDomainLinkDb or a SqlDomainLinkDb,
* depending on whether the file exists. This is part of the migration path to
* always using FileDomainLinkDb.
/** DomainLinkDb that delegates to a FileDomainLinkDb, but handles the case where the database
* is not yet loaded. This speeds up the startup of the index service, as the database is
* loaded in a separate thread.
*/
public class SelectingDomainLinkDb implements DomainLinkDb {
private final static Logger logger = LoggerFactory.getLogger(SelectingDomainLinkDb.class);
public class DelayingDomainLinkDb implements DomainLinkDb {
private final static Logger logger = LoggerFactory.getLogger(DelayingDomainLinkDb.class);
private volatile DomainLinkDb currentDb;
private final Path filename;
public SelectingDomainLinkDb(@Named("domain-linkdb-file") Path filename,
ServiceConfiguration serviceConfiguration,
HikariDataSource dataSource) {
public DelayingDomainLinkDb(@Named("domain-linkdb-file") Path filename) {
this.filename = filename;
// Load the database in a separate thread, so that the constructor can return
@@ -32,12 +29,7 @@ public class SelectingDomainLinkDb implements DomainLinkDb {
Thread.ofPlatform().start(() -> {
try {
if (Files.exists(filename)) {
currentDb = new FileDomainLinkDb(filename);
}
else {
currentDb = new SqlDomainLinkDb(filename, dataSource, serviceConfiguration);
}
currentDb = new FileDomainLinkDb(filename);
logger.info("Loaded linkdb");
} catch (Exception e) {
logger.error("Failed to load linkdb", e);

@@ -23,7 +23,9 @@ public class FileDomainLinkDb implements DomainLinkDb {
public FileDomainLinkDb(@Named("domain-linkdb-file") Path filename) throws IOException {
this.filename = filename;
loadInput(filename);
if (Files.exists(filename)) {
loadInput(filename);
}
}
@Override

@@ -1,150 +0,0 @@
package nu.marginalia.linkdb.dlinks;
import com.google.inject.name.Named;
import com.zaxxer.hikari.HikariDataSource;
import gnu.trove.list.array.TIntArrayList;
import gnu.trove.list.array.TLongArrayList;
import nu.marginalia.service.module.ServiceConfiguration;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardCopyOption;
import java.util.Arrays;
/** DomainLinkDb implementation that goes through the motions of
* being a File-backed DomainLinkDb, but actually uses the legacy SQL database
* for loading the data.
* <p>
* This is part of the migration path to using FileDomainLinkDb.
*/
public class SqlDomainLinkDb implements DomainLinkDb {
private volatile long[] sourceToDest = new long[0];
private volatile long[] destToSource = new long[0];
private static final Logger logger = LoggerFactory.getLogger(SqlDomainLinkDb.class);
private final Path filename;
private final HikariDataSource dataSource;
private final int node;
public SqlDomainLinkDb(@Named("domain-linkdb-file") Path filename,
HikariDataSource dataSource,
ServiceConfiguration configuration)
{
this.filename = filename;
this.dataSource = dataSource;
node = configuration.node();
loadDb();
}
@Override
public void switchInput(Path newFilename) throws IOException {
throw new UnsupportedEncodingException();
}
public void loadDb() {
try (var conn = dataSource.getConnection();
var stmt = conn.prepareStatement(
STR."""
SELECT
SOURCE_DOMAIN_ID,
DEST_DOMAIN_ID
FROM EC_DOMAIN_LINK
INNER JOIN EC_DOMAIN
ON EC_DOMAIN.ID = EC_DOMAIN_LINK.SOURCE_DOMAIN_ID
WHERE NODE_AFFINITY=\{node}
""");
var rs = stmt.executeQuery())
{
TLongArrayList sourceToDest = new TLongArrayList(10_000_000);
TLongArrayList destToSource = new TLongArrayList(10_000_000);
while (rs.next()) {
long source = Integer.toUnsignedLong(rs.getInt(1));
long dest = Integer.toUnsignedLong(rs.getInt(2));
sourceToDest.add((source << 32) | dest);
destToSource.add((dest << 32) | source);
}
sourceToDest.sort();
destToSource.sort();
this.sourceToDest = sourceToDest.toArray();
this.destToSource = destToSource.toArray();
}
catch (Exception ex) {
logger.error("Failed to load linkdb", ex);
}
logger.info("LinkDB loaded, size = {}", sourceToDest.length);
}
@Override
public TIntArrayList findDestinations(int source) {
return findRelated(sourceToDest, source);
}
@Override
public TIntArrayList findSources(int dest) {
return findRelated(destToSource, dest);
}
@Override
public int countDestinations(int source) {
return countRelated(sourceToDest, source);
}
@Override
public int countSources(int dest) {
return countRelated(destToSource, dest);
}
@Override
public void forEach(SourceDestConsumer consumer) {
for (long val : sourceToDest) {
consumer.accept((int) (val >>> 32), (int) (val & 0xFFFF_FFFFL));
}
}
private TIntArrayList findRelated(long[] range, int key) {
long keyLong = Integer.toUnsignedLong(key) << 32;
long nextKeyLong = Integer.toUnsignedLong(key + 1) << 32;
int start = Arrays.binarySearch(range, keyLong);
if (start < 0) {
// Key is not found, get the insertion point
start = -start - 1;
}
TIntArrayList result = new TIntArrayList();
for (int i = start; i < range.length && range[i] < nextKeyLong; i++) {
result.add((int) (range[i] & 0xFFFF_FFFFL));
}
return result;
}
private int countRelated(long[] range, int key) {
long keyLong = Integer.toUnsignedLong(key) << 32;
long nextKeyLong = Integer.toUnsignedLong(key + 1) << 32;
int start = Arrays.binarySearch(range, keyLong);
if (start < 0) {
// Key is not found, get the insertion point
start = -start - 1;
}
int num = 0;
for (int i = start; i < range.length && range[i] < nextKeyLong; i++, num++);
return num;
}
}
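
The deleted class also documents the in-memory layout the file-backed implementation presumably keeps: each edge is packed into one long as (source << 32) | dest, the arrays are sorted, and a binary search for source << 32 locates the start of that source's contiguous block. A small worked example of the same range scan, with made-up ids:

// Illustrative only: the packing and range-scan idea from findRelated/countRelated.
long[] sourceToDest = {
    (2L << 32) | 7,   // edge 2 -> 7
    (2L << 32) | 9,   // edge 2 -> 9
    (5L << 32) | 1,   // edge 5 -> 1
};                    // already sorted ascending

int key = 2;
long lo = Integer.toUnsignedLong(key) << 32;        // smallest possible packed edge for source 2
long hi = Integer.toUnsignedLong(key + 1) << 32;    // smallest packed edge for source 3

int start = java.util.Arrays.binarySearch(sourceToDest, lo);
if (start < 0) {
    start = -start - 1;                             // insertion point when edge 2 -> 0 is absent
}

for (int i = start; i < sourceToDest.length && sourceToDest[i] < hi; i++) {
    int dest = (int) (sourceToDest[i] & 0xFFFF_FFFFL);
    System.out.println(key + " -> " + dest);        // prints 2 -> 7, then 2 -> 9
}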

@@ -38,10 +38,22 @@ public class RankingDomainFetcher {
public void getDomains(Consumer<RankingDomainData> consumer) {
String query;
if (getNames) {
query = "SELECT EC_DOMAIN.ID,DOMAIN_NAME,DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID WHERE NODE_AFFINITY>0 GROUP BY EC_DOMAIN.ID";
query = """
SELECT EC_DOMAIN.ID,DOMAIN_NAME,DOMAIN_ALIAS,STATE,KNOWN_URLS
FROM EC_DOMAIN
INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
WHERE NODE_AFFINITY>0
GROUP BY EC_DOMAIN.ID
""";
}
else {
query = "SELECT EC_DOMAIN.ID,\"\",DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID WHERE NODE_AFFINITY>0 GROUP BY EC_DOMAIN.ID";
query = """
SELECT EC_DOMAIN.ID,\"\",DOMAIN_ALIAS,STATE,KNOWN_URLS
FROM EC_DOMAIN
INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
WHERE NODE_AFFINITY>0
GROUP BY EC_DOMAIN.ID
""";
}
getDomains(query, consumer);
@@ -51,10 +63,24 @@
public void getPeripheralDomains(Consumer<RankingDomainData> consumer) {
String query;
if (getNames) {
query = "SELECT EC_DOMAIN.ID,DOMAIN_NAME,DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID LEFT JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND IS_ALIVE) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND EC_DOMAIN_LINK.ID IS NULL GROUP BY EC_DOMAIN.ID";
query = """
SELECT EC_DOMAIN.ID,DOMAIN_NAME,DOMAIN_ALIAS,STATE,KNOWN_URLS
FROM EC_DOMAIN
INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
WHERE ((INDEXED>1 AND IS_ALIVE)
OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0))
GROUP BY EC_DOMAIN.ID
""";
}
else {
query = "SELECT EC_DOMAIN.ID,\"\",DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID LEFT JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND IS_ALIVE) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND EC_DOMAIN_LINK.ID IS NULL GROUP BY EC_DOMAIN.ID";
query = """
SELECT EC_DOMAIN.ID,\"\",DOMAIN_ALIAS,STATE,KNOWN_URLS
FROM EC_DOMAIN
INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
WHERE ((INDEXED>1 AND IS_ALIVE)
OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0))
GROUP BY EC_DOMAIN.ID
""";
}
getDomains(query, consumer);
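
Note that the old one-line query for peripheral domains anti-joined EC_DOMAIN_LINK (LEFT JOIN ... AND EC_DOMAIN_LINK.ID IS NULL) to keep only domains without inbound links; the rewritten query drops that condition along with the table. If that filter is still wanted, it would presumably have to move to the Java side, roughly along these lines (a sketch, not code from this commit; the DomainLinkDb handle and the id accessor on RankingDomainData are assumed):

// Hypothetical post-filter replacing the dropped EC_DOMAIN_LINK anti-join.
getDomains(query, domainData -> {
    if (domainLinkDb.countSources(domainData.id()) == 0) {  // no inbound links
        consumer.accept(domainData);
    }
});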

@@ -4,10 +4,8 @@ import com.google.inject.AbstractModule;
import com.google.inject.Provides;
import com.google.inject.Singleton;
import com.google.inject.name.Named;
import com.zaxxer.hikari.HikariDataSource;
import nu.marginalia.linkdb.dlinks.DomainLinkDb;
import nu.marginalia.linkdb.dlinks.SelectingDomainLinkDb;
import nu.marginalia.service.module.ServiceConfiguration;
import nu.marginalia.linkdb.dlinks.DelayingDomainLinkDb;
import nu.marginalia.storage.FileStorageService;
import nu.marginalia.IndexLocations;
import org.slf4j.Logger;
@@ -29,14 +27,12 @@ public class IndexModule extends AbstractModule {
@Provides
@Singleton
public DomainLinkDb domainLinkDb (
FileStorageService storageService,
HikariDataSource dataSource,
ServiceConfiguration serviceConfiguration
FileStorageService storageService
)
{
Path path = IndexLocations.getLinkdbLivePath(storageService).resolve(DOMAIN_LINKS_FILE_NAME);
return new SelectingDomainLinkDb(path, serviceConfiguration, dataSource);
return new DelayingDomainLinkDb(path);
}
@Provides
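
With the SQL dependencies gone, the provider needs only the storage service to resolve the live linkdb path, and consumers keep injecting the DomainLinkDb interface as before. A minimal sketch of such a consumer (the class name is illustrative):

// Guice resolves DomainLinkDb via the provider above, which now returns a
// DelayingDomainLinkDb instead of a SelectingDomainLinkDb.
import com.google.inject.Inject;
import nu.marginalia.linkdb.dlinks.DomainLinkDb;

public class LinkGraphUser {
    private final DomainLinkDb linkDb;

    @Inject
    public LinkGraphUser(DomainLinkDb linkDb) {
        this.linkDb = linkDb;
    }

    public int outDegree(int domainId) {
        return linkDb.countDestinations(domainId);
    }
}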