Merge pull request #69 from MarginaliaSearch/converter-optimizations

Refactor the DomainProcessor to take advantage of the new crawl data format
This commit is contained in:
Viktor 2024-01-10 09:46:54 +01:00 committed by GitHub
commit fad9575154
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
91 changed files with 2377 additions and 1402 deletions

View File

@ -4,6 +4,28 @@ package actorapi;
option java_package="nu.marginalia.index.api";
option java_multiple_files=true;
// Lookup service for the domain-to-domain link graph held by the index.
service IndexDomainLinksApi {
// Streams every known source->destination domain id pair.
rpc getAllLinks(Empty) returns (stream RpcDomainIdPairs) {}
// Outgoing links from the given domain.
rpc getLinksFromDomain(RpcDomainId) returns (RpcDomainIdList) {}
// Incoming links to the given domain.
rpc getLinksToDomain(RpcDomainId) returns (RpcDomainIdList) {}
// Number of outgoing links from the given domain.
rpc countLinksFromDomain(RpcDomainId) returns (RpcDomainIdCount) {}
// Number of incoming links to the given domain.
rpc countLinksToDomain(RpcDomainId) returns (RpcDomainIdCount) {}
}
// A single domain id.
message RpcDomainId {
int32 domainId = 1;
}
// A list of domain ids.
message RpcDomainIdList {
repeated int32 domainId = 1 [packed=true];
}
// A count of domain ids.
message RpcDomainIdCount {
int32 idCount = 1;
}
// Parallel arrays of link endpoints: sourceIds[i] links to destIds[i].
message RpcDomainIdPairs {
repeated int32 sourceIds = 1 [packed=true];
repeated int32 destIds = 2 [packed=true];
}
// Search query service.
service QueryApi {
// Executes a search query and returns the result set.
rpc query(RpcQsQuery) returns (RpcQsResponse) {}
}

View File

@ -20,8 +20,10 @@ dependencies {
implementation libs.bundles.slf4j
implementation libs.roaringbitmap
implementation libs.prometheus
implementation libs.notnull
implementation libs.trove
implementation libs.guice
implementation libs.rxjava
implementation libs.gson

View File

@ -2,24 +2,33 @@ package nu.marginalia.query.client;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import gnu.trove.list.array.TIntArrayList;
import io.grpc.ManagedChannel;
import io.grpc.ManagedChannelBuilder;
import io.prometheus.client.Summary;
import nu.marginalia.client.AbstractDynamicClient;
import nu.marginalia.client.Context;
import nu.marginalia.index.api.Empty;
import nu.marginalia.index.api.IndexDomainLinksApiGrpc;
import nu.marginalia.index.api.QueryApiGrpc;
import nu.marginalia.index.api.RpcDomainId;
import nu.marginalia.index.client.model.query.SearchSpecification;
import nu.marginalia.index.client.model.results.SearchResultSet;
import nu.marginalia.model.gson.GsonFactory;
import nu.marginalia.query.QueryProtobufCodec;
import nu.marginalia.query.model.QueryParams;
import nu.marginalia.query.model.QueryResponse;
import nu.marginalia.service.descriptor.ServiceDescriptor;
import nu.marginalia.service.descriptor.ServiceDescriptors;
import nu.marginalia.service.id.ServiceId;
import org.roaringbitmap.PeekableCharIterator;
import org.roaringbitmap.longlong.PeekableLongIterator;
import org.roaringbitmap.longlong.Roaring64Bitmap;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.annotation.CheckReturnValue;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
@ -36,13 +45,15 @@ public class QueryClient extends AbstractDynamicClient {
.register();
private final Map<ServiceAndNode, ManagedChannel> channels = new ConcurrentHashMap<>();
private final Map<ServiceAndNode, QueryApiGrpc.QueryApiBlockingStub > queryApis = new ConcurrentHashMap<>();
private final Map<ServiceAndNode, QueryApiGrpc.QueryApiBlockingStub > queryIndexApis = new ConcurrentHashMap<>();
private final Map<ServiceAndNode, IndexDomainLinksApiGrpc.IndexDomainLinksApiBlockingStub> domainLinkApis = new ConcurrentHashMap<>();
record ServiceAndNode(String service, int node) {
public String getHostName() {
return service;
}
}
private ManagedChannel getChannel(ServiceAndNode serviceAndNode) {
return channels.computeIfAbsent(serviceAndNode,
san -> ManagedChannelBuilder
@ -52,13 +63,21 @@ public class QueryClient extends AbstractDynamicClient {
}
public QueryApiGrpc.QueryApiBlockingStub queryApi(int node) {
return queryApis.computeIfAbsent(new ServiceAndNode("query-service", node), n ->
return queryIndexApis.computeIfAbsent(new ServiceAndNode("query-service", node), n ->
QueryApiGrpc.newBlockingStub(
getChannel(n)
)
);
}
/** Returns (creating and caching on first use) a blocking stub for the
 * domain links API on the query-service instance for the given node. */
public IndexDomainLinksApiGrpc.IndexDomainLinksApiBlockingStub domainApi(int node) {
    var key = new ServiceAndNode("query-service", node);
    return domainLinkApis.computeIfAbsent(key,
            san -> IndexDomainLinksApiGrpc.newBlockingStub(getChannel(san)));
}
private final Logger logger = LoggerFactory.getLogger(getClass());
@Inject
@ -66,6 +85,9 @@ public class QueryClient extends AbstractDynamicClient {
super(descriptors.forId(ServiceId.Query), GsonFactory::get);
}
public QueryClient() {
super(new ServiceDescriptor(ServiceId.Query, "query-service"), GsonFactory::get);
}
/** Delegate an Index API style query directly to the index service */
@CheckReturnValue
@ -82,4 +104,101 @@ public class QueryClient extends AbstractDynamicClient {
);
}
/** Fetches the entire domain link graph from node 0 and collects it
 * into an AllLinks bitmap of packed source/dest pairs. */
public AllLinks getAllDomainLinks() {
    var allLinks = new AllLinks();

    var pairStream = domainApi(0).getAllLinks(Empty.newBuilder().build());
    pairStream.forEachRemaining(batch -> {
        int n = batch.getDestIdsCount();
        for (int idx = 0; idx < n; idx++) {
            allLinks.add(batch.getSourceIds(idx), batch.getDestIds(idx));
        }
    });

    return allLinks;
}
/** Returns the ids of domains linking TO the given domain.
 * On any API failure, logs and returns an empty list. */
public List<Integer> getLinksToDomain(int domainId) {
    var request = RpcDomainId.newBuilder().setDomainId(domainId).build();
    try {
        return domainApi(0).getLinksToDomain(request).getDomainIdList();
    }
    catch (Exception e) {
        logger.error("API Exception", e);
        return List.of();
    }
}
/** Returns the ids of domains linked FROM the given domain.
 * On any API failure, logs and returns an empty list. */
public List<Integer> getLinksFromDomain(int domainId) {
    var request = RpcDomainId.newBuilder().setDomainId(domainId).build();
    try {
        return domainApi(0).getLinksFromDomain(request).getDomainIdList();
    }
    catch (Exception e) {
        logger.error("API Exception", e);
        return List.of();
    }
}
/** Counts the domains linking TO the given domain.
 * On any API failure, logs and returns 0. */
public int countLinksToDomain(int domainId) {
    var request = RpcDomainId.newBuilder().setDomainId(domainId).build();
    try {
        return domainApi(0).countLinksToDomain(request).getIdCount();
    }
    catch (Exception e) {
        logger.error("API Exception", e);
        return 0;
    }
}
/** Counts the domains linked FROM the given domain.
 * On any API failure, logs and returns 0. */
public int countLinksFromDomain(int domainId) {
    var request = RpcDomainId.newBuilder().setDomainId(domainId).build();
    try {
        return domainApi(0).countLinksFromDomain(request).getIdCount();
    }
    catch (Exception e) {
        logger.error("API Exception", e);
        return 0;
    }
}
/** In-memory set of domain link pairs, each packed into one long
 * (high 32 bits = source id, low 32 bits = dest id). */
public static class AllLinks {
    private final Roaring64Bitmap sourceToDest = new Roaring64Bitmap();

    /** Record a source->dest link, treating both ids as unsigned. */
    public void add(int source, int dest) {
        long packed = (Integer.toUnsignedLong(source) << 32) | Integer.toUnsignedLong(dest);
        sourceToDest.add(packed);
    }

    public Iterator iterator() {
        return new Iterator();
    }

    /** Cursor over the packed pairs; call advance() before reading source()/dest(). */
    public class Iterator {
        private final PeekableLongIterator base = sourceToDest.getLongIterator();
        long val = Long.MIN_VALUE;

        public boolean advance() {
            if (!base.hasNext()) {
                return false;
            }
            val = base.next();
            return true;
        }

        public int source() {
            return (int) (val >>> 32);
        }

        public int dest() {
            return (int) (val & 0xFFFF_FFFFL);
        }
    }
}
}

View File

@ -16,6 +16,7 @@ configurations {
dependencies {
implementation project(':code:common:model')
implementation project(':code:common:service')
implementation libs.bundles.slf4j
@ -23,6 +24,7 @@ dependencies {
implementation libs.bundles.gson
implementation libs.notnull
implementation libs.bundles.mariadb
implementation libs.sqlite
implementation libs.commons.lang3

View File

@ -1,11 +1,30 @@
The link database contains information about links,
## Domain Link Database
The domain link database contains information about links
between domains. It is a static in-memory database loaded
from a binary file.
* [DomainLinkDb](src/main/java/nu/marginalia/linkdb/DomainLinkDb.java)
* * [FileDomainLinkDb](src/main/java/nu/marginalia/linkdb/FileDomainLinkDb.java)
* * [SqlDomainLinkDb](src/main/java/nu/marginalia/linkdb/SqlDomainLinkDb.java)
* [DomainLinkDbWriter](src/main/java/nu/marginalia/linkdb/DomainLinkDbWriter.java)
* [DomainLinkDbLoader](src/main/java/nu/marginalia/linkdb/DomainLinkDbLoader.java)
## Document Database
The document database contains information about documents,
such as their ID, their URL, their title, their description,
and so forth.
The link database is a sqlite file. The reason this information
The document database is a sqlite file. The reason this information
is not in the MariaDB database is that this would make updates to
this information take effect in production immediately, even before
the information was searchable.
It is constructed by the [loading-process](../../processes/loading-process), and consumed
by the [index-service](../../services-core/index-service).
* [DocumentLinkDbWriter](src/main/java/nu/marginalia/linkdb/DocumentDbWriter.java)
* [DocumentLinkDbLoader](src/main/java/nu/marginalia/linkdb/DocumentDbReader.java)
## See Also
These databases are constructed by the [loading-process](../../processes/loading-process), and consumed by the [index-service](../../services-core/index-service).

View File

@ -0,0 +1,7 @@
package nu.marginalia.linkdb;
/** Well-known file names for the document and domain-link databases.
 * Constants made final (they were mutable public statics) and the
 * utility class given a private constructor. */
public class LinkdbFileNames {
    /** Legacy name of the document db, kept for migration of old data. */
    public static final String DEPRECATED_LINKDB_FILE_NAME = "links.db";

    /** Sqlite database with per-document URL details. */
    public static final String DOCDB_FILE_NAME = "documents.db";

    /** Flat binary file of packed domain-to-domain link pairs. */
    public static final String DOMAIN_LINKS_FILE_NAME = "domain-links.dat";

    private LinkdbFileNames() {}
}

View File

@ -0,0 +1,39 @@
package nu.marginalia.linkdb.dlinks;
import gnu.trove.list.array.TIntArrayList;
import java.io.IOException;
import java.nio.file.Path;
import java.sql.SQLException;
import java.util.Arrays;
/** A database of source-destination pairs of domain IDs. The database is loaded into memory from
 * a source. The database is then kept in memory, reloading it upon switchInput().
 */
public interface DomainLinkDb {
    /** Replace the current db file with the provided file. The provided file will be deleted.
     * The in-memory database MAY be updated to reflect the change.
     * */
    void switchInput(Path filename) throws Exception;

    /** Find all destinations for the given source. */
    TIntArrayList findDestinations(int source);

    /** Count the number of destinations for the given source. */
    int countDestinations(int source);

    /** Find all sources for the given destination. */
    TIntArrayList findSources(int dest);

    /** Count the number of sources for the given destination.
     * (Parameter renamed from 'source' to 'dest' -- it is the destination id.) */
    int countSources(int dest);

    /** Iterate over all source-destination pairs. */
    void forEach(SourceDestConsumer consumer);

    interface SourceDestConsumer {
        void accept(int source, int dest);
    }
}

View File

@ -0,0 +1,45 @@
package nu.marginalia.linkdb.dlinks;
import java.io.DataInputStream;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
/** Sequential reader for the packed domain-link file produced by DomainLinkDbWriter.
 * Each record is one big-endian long: high 32 bits = source domain id,
 * low 32 bits = destination domain id.
 */
public class DomainLinkDbLoader implements AutoCloseable {
    private final DataInputStream stream;
    private final Path filename;
    private long nextVal;

    public DomainLinkDbLoader(Path filename) throws IOException {
        this.stream = new DataInputStream(Files.newInputStream(filename));
        this.filename = filename;
    }

    /** Number of records, derived from the file size (8 bytes per record). */
    public int size() throws IOException {
        return (int) (Files.size(filename) / 8);
    }

    /** Advance to the next record.
     * @return false at end of input (or on any read error). */
    public boolean next() {
        try {
            nextVal = stream.readLong();
            return true;
        }
        catch (IOException ex) {
            // readLong signals end-of-stream with EOFException (an IOException);
            // any other read failure likewise terminates iteration here.
            return false;
        }
    }

    public int getSource() {
        return (int) (nextVal >>> 32);
    }

    public int getDest() {
        return (int) (nextVal & 0xffff_ffffL);
    }

    @Override
    public void close() throws IOException {
        stream.close();
    }
}

View File

@ -0,0 +1,29 @@
package nu.marginalia.linkdb.dlinks;
import java.io.DataOutputStream;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
/** Writes the packed domain-link file consumed by DomainLinkDbLoader.
 * Each pair is stored as a single big-endian long with the source id
 * in the high 32 bits and the destination id in the low 32 bits.
 */
public class DomainLinkDbWriter implements AutoCloseable {
    private final DataOutputStream stream;

    public DomainLinkDbWriter(Path fileName) throws IOException {
        var outputStream = Files.newOutputStream(fileName,
                StandardOpenOption.CREATE,
                StandardOpenOption.WRITE,
                StandardOpenOption.TRUNCATE_EXISTING);
        this.stream = new DataOutputStream(outputStream);
    }

    /** Append one source->dest pair; both ids are treated as unsigned. */
    public void write(int sourceDomainId, int destDomainId) throws IOException {
        long hi = Integer.toUnsignedLong(sourceDomainId) << 32;
        long lo = Integer.toUnsignedLong(destDomainId);
        stream.writeLong(hi | lo);
    }

    @Override
    public void close() throws IOException {
        stream.close();
    }
}

View File

@ -0,0 +1,124 @@
package nu.marginalia.linkdb.dlinks;
import com.google.inject.name.Named;
import gnu.trove.list.array.TIntArrayList;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardCopyOption;
import java.util.Arrays;
/** Canonical DomainLinkDb implementation. The database is loaded into memory from
 * a file. The database is then kept in memory, reloading it upon switchInput().
 */
public class FileDomainLinkDb implements DomainLinkDb {
    private static final Logger logger = LoggerFactory.getLogger(FileDomainLinkDb.class);
    private final Path filename;

    // Sorted arrays of packed pairs (high 32 bits = key, low 32 bits = value);
    // volatile so lookups observe fully-built arrays after a reload swaps them in.
    private volatile long[] sourceToDest = new long[0];
    private volatile long[] destToSource = new long[0];

    public FileDomainLinkDb(@Named("domain-linkdb-file") Path filename) throws IOException {
        this.filename = filename;
        loadInput(filename);
    }

    @Override
    public void switchInput(Path newFilename) throws IOException {
        Files.move(newFilename, filename, StandardCopyOption.REPLACE_EXISTING);
        loadInput(filename);
    }

    /** Rebuild both in-memory indexes from the given packed-pairs file. */
    public void loadInput(Path filename) throws IOException {
        try (var loader = new DomainLinkDbLoader(filename)) {
            int size = loader.size();

            var newSourceToDest = new long[size];
            var newDestToSource = new long[size];

            int i = 0;
            while (loader.next()) {
                // BUGFIX: use unsigned widening. Plain int->long widening sign-extends
                // negative ids, setting the high 32 bits and corrupting the packed key
                // (DomainLinkDbWriter packs ids with Integer.toUnsignedLong).
                long source = Integer.toUnsignedLong(loader.getSource());
                long dest = Integer.toUnsignedLong(loader.getDest());

                newSourceToDest[i] = (source << 32) | dest;
                newDestToSource[i] = (dest << 32) | source;

                i++;
            }

            Arrays.sort(newSourceToDest);
            Arrays.sort(newDestToSource);

            sourceToDest = newSourceToDest;
            destToSource = newDestToSource;
        }
    }

    @Override
    public TIntArrayList findDestinations(int source) {
        return findRelated(sourceToDest, source);
    }

    @Override
    public TIntArrayList findSources(int dest) {
        return findRelated(destToSource, dest);
    }

    @Override
    public int countDestinations(int source) {
        return countRelated(sourceToDest, source);
    }

    @Override
    public int countSources(int dest) {
        return countRelated(destToSource, dest);
    }

    @Override
    public void forEach(SourceDestConsumer consumer) {
        for (long val : sourceToDest) {
            consumer.accept((int) (val >>> 32), (int) (val & 0xFFFF_FFFFL));
        }
    }

    /** Index of the first entry >= keyLong (binarySearch insertion point if absent). */
    private static int lowerBound(long[] range, long keyLong) {
        int start = Arrays.binarySearch(range, keyLong);
        return start < 0 ? -start - 1 : start;
    }

    /** All low-32-bit values of entries whose high 32 bits equal key. */
    private TIntArrayList findRelated(long[] range, int key) {
        long keyLong = Integer.toUnsignedLong(key) << 32;
        long nextKeyLong = Integer.toUnsignedLong(key + 1) << 32;

        TIntArrayList result = new TIntArrayList();
        for (int i = lowerBound(range, keyLong); i < range.length && range[i] < nextKeyLong; i++) {
            result.add((int) (range[i] & 0xFFFF_FFFFL));
        }
        return result;
    }

    /** Number of entries whose high 32 bits equal key. */
    private int countRelated(long[] range, int key) {
        long keyLong = Integer.toUnsignedLong(key) << 32;
        long nextKeyLong = Integer.toUnsignedLong(key + 1) << 32;

        int num = 0;
        for (int i = lowerBound(range, keyLong); i < range.length && range[i] < nextKeyLong; i++, num++);
        return num;
    }
}

View File

@ -0,0 +1,104 @@
package nu.marginalia.linkdb.dlinks;
import com.google.inject.name.Named;
import com.zaxxer.hikari.HikariDataSource;
import gnu.trove.list.array.TIntArrayList;
import nu.marginalia.service.module.ServiceConfiguration;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardCopyOption;
/** DomainLinkDb that delegates to either a FileDomainLinkDb or a SqlDomainLinkDb,
 * depending on whether the file exists. This is part of the migration path to
 * always using FileDomainLinkDb.
 */
public class SelectingDomainLinkDb implements DomainLinkDb {
    private final static Logger logger = LoggerFactory.getLogger(SelectingDomainLinkDb.class);

    // Set once by the loader thread (and again on switchInput). Until the load
    // completes, queries fall back to empty results / zero counts.
    private volatile DomainLinkDb currentDb;
    private final Path filename;

    public SelectingDomainLinkDb(@Named("domain-linkdb-file") Path filename,
                                 ServiceConfiguration serviceConfiguration,
                                 HikariDataSource dataSource) {
        this.filename = filename;

        // Load the database off-thread so the constructor returns immediately;
        // a synchronous load would add a lot of time to index-service startup.
        Thread.ofPlatform().start(() -> {
            try {
                currentDb = Files.exists(filename)
                        ? new FileDomainLinkDb(filename)
                        : new SqlDomainLinkDb(filename, dataSource, serviceConfiguration);
                logger.info("Loaded linkdb");
            } catch (Exception e) {
                logger.error("Failed to load linkdb", e);
            }
        });
    }

    @Override
    public void switchInput(Path newFilename) throws Exception {
        Files.move(newFilename, filename, StandardCopyOption.REPLACE_EXISTING);

        Thread.ofPlatform().start(() -> {
            try {
                currentDb = new FileDomainLinkDb(filename);
            } catch (IOException e) {
                logger.error("Failed to load linkdb", e);
            }
        });
    }

    @Override
    public TIntArrayList findDestinations(int source) {
        // Snapshot the volatile once; it only ever moves from null to non-null.
        var db = currentDb;
        return db == null ? new TIntArrayList() : db.findDestinations(source);
    }

    @Override
    public int countDestinations(int source) {
        var db = currentDb;
        return db == null ? 0 : db.countDestinations(source);
    }

    @Override
    public TIntArrayList findSources(int dest) {
        var db = currentDb;
        return db == null ? new TIntArrayList() : db.findSources(dest);
    }

    @Override
    public int countSources(int source) {
        var db = currentDb;
        return db == null ? 0 : db.countSources(source);
    }

    @Override
    public void forEach(SourceDestConsumer consumer) {
        var db = currentDb;
        if (db == null)
            throw new IllegalStateException("No linkdb loaded");
        db.forEach(consumer);
    }
}

View File

@ -0,0 +1,150 @@
package nu.marginalia.linkdb.dlinks;
import com.google.inject.name.Named;
import com.zaxxer.hikari.HikariDataSource;
import gnu.trove.list.array.TIntArrayList;
import gnu.trove.list.array.TLongArrayList;
import nu.marginalia.service.module.ServiceConfiguration;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardCopyOption;
import java.util.Arrays;
/** DomainLinkDb implementation that goes through the motions of
* being a File-backed DomainLinkDb, but actually uses the legacy SQL database
* for loading the data.
* <p>
* This is part of the migration path to using FileDomainLinkDb.
*/
public class SqlDomainLinkDb implements DomainLinkDb {
private volatile long[] sourceToDest = new long[0];
private volatile long[] destToSource = new long[0];
private static final Logger logger = LoggerFactory.getLogger(SqlDomainLinkDb.class);
private final Path filename;
private final HikariDataSource dataSource;
private final int node;
public SqlDomainLinkDb(@Named("domain-linkdb-file") Path filename,
HikariDataSource dataSource,
ServiceConfiguration configuration)
{
this.filename = filename;
this.dataSource = dataSource;
node = configuration.node();
loadDb();
}
@Override
public void switchInput(Path newFilename) throws IOException {
throw new UnsupportedEncodingException();
}
public void loadDb() {
try (var conn = dataSource.getConnection();
var stmt = conn.prepareStatement(
STR."""
SELECT
SOURCE_DOMAIN_ID,
DEST_DOMAIN_ID
FROM EC_DOMAIN_LINK
INNER JOIN EC_DOMAIN
ON EC_DOMAIN.ID = EC_DOMAIN_LINK.SOURCE_DOMAIN_ID
WHERE NODE_AFFINITY=\{node}
""");
var rs = stmt.executeQuery())
{
TLongArrayList sourceToDest = new TLongArrayList(10_000_000);
TLongArrayList destToSource = new TLongArrayList(10_000_000);
while (rs.next()) {
long source = Integer.toUnsignedLong(rs.getInt(1));
long dest = Integer.toUnsignedLong(rs.getInt(2));
sourceToDest.add((source << 32) | dest);
destToSource.add((dest << 32) | source);
}
sourceToDest.sort();
destToSource.sort();
this.sourceToDest = sourceToDest.toArray();
this.destToSource = destToSource.toArray();
}
catch (Exception ex) {
logger.error("Failed to load linkdb", ex);
}
logger.info("LinkDB loaded, size = {}", sourceToDest.length);
}
@Override
public TIntArrayList findDestinations(int source) {
return findRelated(sourceToDest, source);
}
@Override
public TIntArrayList findSources(int dest) {
return findRelated(destToSource, dest);
}
@Override
public int countDestinations(int source) {
return countRelated(sourceToDest, source);
}
@Override
public int countSources(int dest) {
return countRelated(destToSource, dest);
}
@Override
public void forEach(SourceDestConsumer consumer) {
for (long val : sourceToDest) {
consumer.accept((int) (val >>> 32), (int) (val & 0xFFFF_FFFFL));
}
}
private TIntArrayList findRelated(long[] range, int key) {
long keyLong = Integer.toUnsignedLong(key) << 32;
long nextKeyLong = Integer.toUnsignedLong(key + 1) << 32;
int start = Arrays.binarySearch(range, keyLong);
if (start < 0) {
// Key is not found, get the insertion point
start = -start - 1;
}
TIntArrayList result = new TIntArrayList();
for (int i = start; i < range.length && range[i] < nextKeyLong; i++) {
result.add((int) (range[i] & 0xFFFF_FFFFL));
}
return result;
}
private int countRelated(long[] range, int key) {
long keyLong = Integer.toUnsignedLong(key) << 32;
long nextKeyLong = Integer.toUnsignedLong(key + 1) << 32;
int start = Arrays.binarySearch(range, keyLong);
if (start < 0) {
// Key is not found, get the insertion point
start = -start - 1;
}
int num = 0;
for (int i = start; i < range.length && range[i] < nextKeyLong; i++, num++);
return num;
}
}

View File

@ -1,10 +1,10 @@
package nu.marginalia.linkdb;
package nu.marginalia.linkdb.docs;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import com.google.inject.name.Named;
import gnu.trove.list.TLongList;
import nu.marginalia.linkdb.model.LdbUrlDetail;
import nu.marginalia.linkdb.model.DocdbUrlDetail;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.id.UrlIdCodec;
import org.slf4j.Logger;
@ -23,21 +23,21 @@ import java.util.ArrayList;
import java.util.List;
@Singleton
public class LinkdbReader {
public class DocumentDbReader {
private final Path dbFile;
private volatile Connection connection;
private final Logger logger = LoggerFactory.getLogger(getClass());
@Inject
public LinkdbReader(@Named("linkdb-file") Path dbFile) throws SQLException {
public DocumentDbReader(@Named("docdb-file") Path dbFile) throws SQLException {
this.dbFile = dbFile;
if (Files.exists(dbFile)) {
connection = createConnection();
}
else {
logger.warn("No linkdb file {}", dbFile);
logger.warn("No docdb file {}", dbFile);
}
}
@ -107,8 +107,8 @@ public class LinkdbReader {
return ret;
}
public List<LdbUrlDetail> getUrlDetails(TLongList ids) throws SQLException {
List<LdbUrlDetail> ret = new ArrayList<>(ids.size());
public List<DocdbUrlDetail> getUrlDetails(TLongList ids) throws SQLException {
List<DocdbUrlDetail> ret = new ArrayList<>(ids.size());
if (connection == null ||
connection.isClosed())
@ -126,7 +126,7 @@ public class LinkdbReader {
var rs = stmt.executeQuery();
if (rs.next()) {
var url = new EdgeUrl(rs.getString("URL"));
ret.add(new LdbUrlDetail(
ret.add(new DocdbUrlDetail(
rs.getLong("ID"),
url,
rs.getString("TITLE"),

View File

@ -1,24 +1,23 @@
package nu.marginalia.linkdb;
package nu.marginalia.linkdb.docs;
import nu.marginalia.linkdb.model.LdbUrlDetail;
import nu.marginalia.linkdb.model.DocdbUrlDetail;
import java.io.IOException;
import java.nio.file.Path;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.SQLException;
import java.sql.Types;
import java.util.List;
public class LinkdbWriter {
public class DocumentDbWriter {
private final Connection connection;
public LinkdbWriter(Path outputFile) throws SQLException {
public DocumentDbWriter(Path outputFile) throws SQLException {
String connStr = "jdbc:sqlite:" + outputFile.toString();
connection = DriverManager.getConnection(connStr);
try (var stream = ClassLoader.getSystemResourceAsStream("db/linkdb-document.sql");
try (var stream = ClassLoader.getSystemResourceAsStream("db/docdb-document.sql");
var stmt = connection.createStatement()
) {
var sql = new String(stream.readAllBytes());
@ -31,11 +30,11 @@ public class LinkdbWriter {
}
}
public void add(LdbUrlDetail ldbUrlDetail) throws SQLException {
add(List.of(ldbUrlDetail));
public void add(DocdbUrlDetail docdbUrlDetail) throws SQLException {
add(List.of(docdbUrlDetail));
}
public void add(List<LdbUrlDetail> ldbUrlDetail) throws SQLException {
public void add(List<DocdbUrlDetail> docdbUrlDetail) throws SQLException {
try (var stmt = connection.prepareStatement("""
INSERT OR IGNORE INTO DOCUMENT(ID, URL, TITLE, DESCRIPTION, WORDS_TOTAL, FORMAT, FEATURES, DATA_HASH, QUALITY, PUB_YEAR)
@ -43,7 +42,7 @@ public class LinkdbWriter {
""")) {
int i = 0;
for (var document : ldbUrlDetail) {
for (var document : docdbUrlDetail) {
var url = document.url();
stmt.setLong(1, document.urlId());

View File

@ -0,0 +1,18 @@
package nu.marginalia.linkdb.model;
import nu.marginalia.model.EdgeUrl;
/** Per-document URL details as stored in the document (docdb) sqlite database.
 * NOTE(review): field semantics beyond the names are not visible here --
 * e.g. dataHash presumably fingerprints the document content and features is
 * presumably a bit mask; confirm against the docdb writer/reader.
 */
public record DocdbUrlDetail(long urlId,
EdgeUrl url,
String title,
String description,
double urlQuality,
String format,
int features,
Integer pubYear,
long dataHash,
int wordsTotal
)
{
}

View File

@ -1,18 +0,0 @@
package nu.marginalia.linkdb.model;
import nu.marginalia.model.EdgeUrl;
public record LdbUrlDetail(long urlId,
EdgeUrl url,
String title,
String description,
double urlQuality,
String format,
int features,
Integer pubYear,
long dataHash,
int wordsTotal
)
{
}

View File

@ -1,7 +1,9 @@
package nu.marginalia.linkdb;
import gnu.trove.list.array.TLongArrayList;
import nu.marginalia.linkdb.model.LdbUrlDetail;
import nu.marginalia.linkdb.docs.DocumentDbReader;
import nu.marginalia.linkdb.docs.DocumentDbWriter;
import nu.marginalia.linkdb.model.DocdbUrlDetail;
import nu.marginalia.model.EdgeDomain;
import org.junit.jupiter.api.Test;
@ -10,13 +12,13 @@ import java.nio.file.Files;
import java.nio.file.Path;
import java.sql.SQLException;
public class LinkdbWriterTest {
public class DocumentDbWriterTest {
@Test
public void testCreate() throws IOException {
Path tempPath = Files.createTempFile("linkdb", ".db");
Path tempPath = Files.createTempFile("docdb", ".db");
try {
var writer = new LinkdbWriter(tempPath);
writer.add(new LdbUrlDetail(
var writer = new DocumentDbWriter(tempPath);
writer.add(new DocdbUrlDetail(
1,
new nu.marginalia.model.EdgeUrl("http", new EdgeDomain("example.com"), null, "/", null),
"Test",
@ -30,7 +32,7 @@ public class LinkdbWriterTest {
));
writer.close();
var reader = new LinkdbReader(tempPath);
var reader = new DocumentDbReader(tempPath);
var deets = reader.getUrlDetails(new TLongArrayList(new long[]{1}));
System.out.println(deets);
} catch (SQLException e) {

View File

@ -0,0 +1,52 @@
package nu.marginalia.linkdb;
import nu.marginalia.linkdb.dlinks.DomainLinkDbLoader;
import nu.marginalia.linkdb.dlinks.DomainLinkDbWriter;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
/** Round-trip test for the packed domain-link file format:
 * pairs written via DomainLinkDbWriter must come back in the same
 * order from DomainLinkDbLoader. */
public class DomainLinkDbTest {
// Temp file holding the packed link data under test
Path fileName;
@BeforeEach
public void setUp() throws IOException {
fileName = Files.createTempFile("test", ".db");
}
@AfterEach
public void tearDown() throws IOException {
Files.deleteIfExists(fileName);
}
@Test
public void testWriteRead() {
// Write three source->dest pairs
try (var writer = new DomainLinkDbWriter(fileName)) {
writer.write(1, 2);
writer.write(2, 3);
writer.write(3, 4);
}
catch (IOException ex) {
throw new RuntimeException(ex);
}
// Read them back and verify order and contents
try (var reader = new DomainLinkDbLoader(fileName)) {
Assertions.assertTrue(reader.next());
Assertions.assertEquals(1, reader.getSource());
Assertions.assertEquals(2, reader.getDest());
Assertions.assertTrue(reader.next());
Assertions.assertEquals(2, reader.getSource());
Assertions.assertEquals(3, reader.getDest());
Assertions.assertTrue(reader.next());
Assertions.assertEquals(3, reader.getSource());
Assertions.assertEquals(4, reader.getDest());
Assertions.assertFalse(reader.next());
}
catch (IOException ex) {
throw new RuntimeException(ex);
}
}
}

View File

@ -1,12 +1,18 @@
package nu.marginalia.keyword.extractors;
import com.google.common.collect.Sets;
import lombok.SneakyThrows;
import nu.marginalia.WmsaHome;
import nu.marginalia.converting.processor.logic.dom.DomPruningFilter;
import nu.marginalia.keyword.KeywordExtractor;
import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.test.util.TestLanguageModels;
import org.jsoup.Jsoup;
import org.junit.jupiter.api.Test;
import java.nio.charset.Charset;
import java.util.Collections;
import java.util.Objects;
import java.util.Set;
import java.util.stream.Collectors;
@ -50,4 +56,39 @@ class NameLikeKeywordsTest {
assertEquals(Collections.emptySet(), Sets.symmetricDifference(actual, expected));
}
/** Smoke test: extract name-like keywords from a real-world HTML article
 * (the Java Wikipedia page) loaded from test resources. Prints the result
 * rather than asserting; exists to catch crashes on realistic input. */
@Test
@SneakyThrows
public void testWikiArticle() {
    // BUGFIX: the requireNonNull message claimed a "word frequency table" was
    // being loaded; the resource is actually the test HTML document.
    var resource = Objects.requireNonNull(ClassLoader.getSystemResourceAsStream("test-data/java.html"),
            "Could not load test resource test-data/java.html");
    String html = new String(resource.readAllBytes(), Charset.defaultCharset());
    var doc = Jsoup.parse(html);
    doc.filter(new DomPruningFilter(0));
    SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels());
    var ke = new KeywordExtractor();
    var nameWords = new NameLikeKeywords(ke, se.extractSentences(doc), 2);
    System.out.println("Names: " + nameWords.words());
}
// Same extraction pipeline as testWikiArticle, but on an inline excerpt of the
// article's lead paragraphs; prints the extracted names rather than asserting.
@Test
@SneakyThrows
public void testWikiArticleP1() {
String html = """
<p><b>Java</b> is a high-level, class-based, object-oriented programming language that is designed to have as few implementation dependencies as possible. It is a general-purpose programming language intended to let programmers <i>write once, run anywhere</i> (WORA), meaning that compiled Java code can run on all platforms that support Java without the need to recompile. Java applications are typically compiled to bytecode that can run on any Java virtual machine (JVM) regardless of the underlying computer architecture. The syntax of Java is similar to C and C++, but has fewer low-level facilities than either of them. The Java runtime provides dynamic capabilities (such as reflection and runtime code modification) that are typically not available in traditional compiled languages. As of 2019 , Java was one of the most popular programming languages in use according to GitHub, particularly for clientserver web applications, with a reported 9 million developers.</p>
<p>Java was originally developed by James Gosling at Sun Microsystems. It was released in May 1995 as a core component of Sun Microsystems' Java platform. The original and reference implementation Java compilers, virtual machines, and class libraries were originally released by Sun under proprietary licenses. As of May 2007, in compliance with the specifications of the Java Community Process, Sun had relicensed most of its Java technologies under the GPL-2.0-only license. Oracle offers its own HotSpot Java Virtual Machine, however the official reference implementation is the OpenJDK JVM which is free open-source software and used by most developers and is the default JVM for almost all Linux distributions.</p>
<p>As of September 2023 , Java 21 is the latest version, while Java 17, 11 and 8 are the current long-term support (LTS) versions.</p>""";
var doc = Jsoup.parse(html);
doc.filter(new DomPruningFilter(0));
SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels());
var ke = new KeywordExtractor();
var nameWords = new NameLikeKeywords(ke, se.extractSentences(doc), 2);
System.out.println("Names: " + nameWords.words());
}
}

View File

@ -0,0 +1,348 @@
<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>Java (programming language)</title>
<link rel="stylesheet" href="/style.css">
<meta name="viewport" content="width=device-width, initial-scale=1">
</head>
<body>
<article class="article">
<section id="body">
<h1>Java (programming language)</h1>
<div>
<section data-mw-section-id="0">
<p><b>Java</b> is a high-level, class-based, object-oriented programming language that is designed to have as few implementation dependencies as possible. It is a general-purpose programming language intended to let programmers <i>write once, run anywhere</i> (WORA), meaning that compiled Java code can run on all platforms that support Java without the need to recompile. Java applications are typically compiled to bytecode that can run on any Java virtual machine (JVM) regardless of the underlying computer architecture. The syntax of Java is similar to C and C++, but has fewer low-level facilities than either of them. The Java runtime provides dynamic capabilities (such as reflection and runtime code modification) that are typically not available in traditional compiled languages. As of 2019, Java was one of the most popular programming languages in use according to GitHub, particularly for client–server web applications, with a reported 9 million developers.</p>
<p>Java was originally developed by James Gosling at Sun Microsystems. It was released in May 1995 as a core component of Sun Microsystems' Java platform. The original and reference implementation Java compilers, virtual machines, and class libraries were originally released by Sun under proprietary licenses. As of May 2007, in compliance with the specifications of the Java Community Process, Sun had relicensed most of its Java technologies under the GPL-2.0-only license. Oracle offers its own HotSpot Java Virtual Machine, however the official reference implementation is the OpenJDK JVM which is free open-source software and used by most developers and is the default JVM for almost all Linux distributions.</p>
<p>As of September 2023, Java 21 is the latest version, while Java 17, 11 and 8 are the current long-term support (LTS) versions.</p>
</section>
<section data-mw-section-id="1">
<h2>History</h2>
<p>James Gosling, Mike Sheridan, and Patrick Naughton initiated the Java language project in June 1991. Java was originally designed for interactive television, but it was too advanced for the digital cable television industry at the time. The language was initially called <i>Oak</i> after an oak tree that stood outside Gosling's office. Later the project went by the name <i>Green</i> and was finally renamed <i>Java</i>, from Java coffee, a type of coffee from Indonesia. Gosling designed Java with a C/C++-style syntax that system and application programmers would find familiar.</p>
<p>Sun Microsystems released the first public implementation as Java 1.0 in 1996. It promised write once, run anywhere (WORA) functionality, providing no-cost run-times on popular platforms. Fairly secure and featuring configurable security, it allowed network- and file-access restrictions. Major web browsers soon incorporated the ability to run Java applets within web pages, and Java quickly became popular. The Java 1.0 compiler was re-written in Java by Arthur van Hoff to comply strictly with the Java 1.0 language specification. With the advent of Java 2 (released initially as J2SE 1.2 in December 1998 – 1999), new versions had multiple configurations built for different types of platforms. J2EE included technologies and APIs for enterprise applications typically run in server environments, while J2ME featured APIs optimized for mobile applications. The desktop version was renamed J2SE. In 2006, for marketing purposes, Sun renamed new J2 versions as <i>Java EE</i>, <i>Java ME</i>, and <i>Java SE</i>, respectively.</p>
<p>In 1997, Sun Microsystems approached the ISO/IEC JTC 1 standards body and later the Ecma International to formalize Java, but it soon withdrew from the process. Java remains a de facto standard, controlled through the Java Community Process. At one time, Sun made most of its Java implementations available without charge, despite their proprietary software status. Sun generated revenue from Java through the selling of licenses for specialized products such as the Java Enterprise System.</p>
<p>On November 13, 2006, Sun released much of its Java virtual machine (JVM) as free and open-source software (FOSS), under the terms of the GPL-2.0-only license. On May 8, 2007, Sun finished the process, making all of its JVM's core code available under free software/open-source distribution terms, aside from a small portion of code to which Sun did not hold the copyright.</p>
<p>Sun's vice-president Rich Green said that Sun's ideal role with regard to Java was as an <i>evangelist</i>. Following Oracle Corporation's acquisition of Sun Microsystems in 2009–10, Oracle has described itself as the steward of Java technology with a relentless commitment to fostering a community of participation and transparency. This did not prevent Oracle from filing a lawsuit against Google shortly after that for using Java inside the Android SDK (see the <i>Android</i> section).</p>
<p>On April 2, 2010, James Gosling resigned from Oracle.</p>
<p>In January 2016, Oracle announced that Java run-time environments based on JDK 9 will discontinue the browser plugin.</p>
<p>Java software runs on everything from laptops to data centers, game consoles to scientific supercomputers.</p>
<p>Oracle (and others) highly recommend uninstalling outdated and unsupported versions of Java, due to unresolved security issues in older versions.</p>
<section data-mw-section-id="2">
<h3>Principles</h3>
<p>There were five primary goals in the creation of the Java language:</p>
<blockquote>
<ol>
<li>It must be simple, <a href="Object-oriented" class="mw-redirect">object-oriented</a>, and familiar.</li>
<li>It must be <a href="Robustness_(computer_science)">robust</a> and secure.</li>
<li>It must be architecture-neutral and portable.</li>
<li>It must execute with high performance.</li>
<li>It must be <a href="Interpreted_language" class="mw-redirect">interpreted</a>, <a href="Thread_(computing)">threaded</a>, and <a href="Dynamic_programming_language">dynamic</a>.</li>
</ol>
</blockquote>
</section>
<section data-mw-section-id="3">
<h3>Versions</h3>
<p>As of September 2023, Java 8, 11, 17 and 21 are supported as Long-Term Support (LTS) versions.</p>
<p>Oracle released the last zero-cost public update for the legacy version Java 8 LTS in January 2019 for commercial use, although it will otherwise still support Java 8 with public updates for personal use indefinitely. Other vendors have begun to offer zero-cost builds of OpenJDK 18 and 8, 11 and 17 that are still receiving security and other upgrades.</p>
<p>Major release versions of Java, along with their release dates:</p>
</section>
</section>
<section data-mw-section-id="4">
<h2>Editions</h2>
<p>Sun has defined and supports four editions of Java targeting different application environments and segmented many of its APIs so that they belong to one of the platforms. The platforms are:</p>
<ul>
<li><a href="Java_Card">Java Card</a> for smart-cards.</li>
<li><a href="Java_Platform%2C_Micro_Edition">Java Platform, Micro Edition</a> (Java ME) targeting environments with limited resources.</li>
<li><a href="Java_Platform%2C_Standard_Edition">Java Platform, Standard Edition</a> (Java SE) targeting workstation environments.</li>
<li><a href="Java_Platform%2C_Enterprise_Edition" class="mw-redirect">Java Platform, Enterprise Edition</a> (Java EE) targeting large distributed enterprise or Internet environments.</li>
</ul>
<p>The classes in the Java APIs are organized into separate groups called packages. Each package contains a set of related interfaces, classes, subpackages and exceptions.</p>
<p>Sun also provided an edition called Personal Java that has been superseded by later, standards-based Java ME configuration-profile pairings.</p>
</section>
<section data-mw-section-id="5">
<h2>Execution system</h2>
<section data-mw-section-id="6">
<h3>Java JVM and bytecode</h3>
<p>One design goal of Java is portability, which means that programs written for the Java platform must run similarly on any combination of hardware and operating system with adequate run time support. This is achieved by compiling the Java language code to an intermediate representation called Java bytecode, instead of directly to architecture-specific machine code. Java bytecode instructions are analogous to machine code, but they are intended to be executed by a virtual machine (VM) written specifically for the host hardware. End-users commonly use a Java Runtime Environment (JRE) installed on their device for standalone Java applications or a web browser for Java applets.</p>
<p>Standard libraries provide a generic way to access host-specific features such as graphics, threading, and networking.</p>
<p>The use of universal bytecode makes porting simple. However, the overhead of interpreting bytecode into machine instructions made interpreted programs almost always run more slowly than native executables. Just-in-time (JIT) compilers that compile byte-codes to machine code during runtime were introduced from an early stage. Java's Hotspot compiler is actually two compilers in one; and with GraalVM (included in e.g. Java 11, but removed as of Java 16) allowing tiered compilation. Java itself is platform-independent and is adapted to the particular platform it is to run on by a Java virtual machine (JVM), which translates the Java bytecode into the platform's machine language.</p>
<section data-mw-section-id="7">
<h4>Performance</h4>
<p>Programs written in Java have a reputation for being slower and requiring more memory than those written in C++. However, Java programs' execution speed improved significantly with the introduction of just-in-time compilation in 1997/1998 for Java 1.1, the addition of language features supporting better code analysis (such as inner classes, the StringBuilder class, optional assertions, etc.), and optimizations in the Java virtual machine, such as HotSpot becoming Sun's default JVM in 2000. With Java 1.5, the performance was improved with the addition of the <code class="mw-highlight mw-highlight-lang-text mw-content-ltr">java.util.concurrent</code> package, including lock-free implementations of the ConcurrentMaps and other multi-core collections, and it was improved further with Java 1.6.</p>
</section>
</section>
<section data-mw-section-id="8">
<h3>Non-JVM</h3>
<p>Some platforms offer direct hardware support for Java; there are micro controllers that can run Java bytecode in hardware instead of a software Java virtual machine, and some ARM-based processors could have hardware support for executing Java bytecode through their Jazelle option, though support has mostly been dropped in current implementations of ARM.</p>
</section>
<section data-mw-section-id="9">
<h3>Automatic memory management</h3>
<p>Java uses an automatic garbage collector to manage memory in the object lifecycle. The programmer determines when objects are created, and the Java runtime is responsible for recovering the memory once objects are no longer in use. Once no references to an object remain, the unreachable memory becomes eligible to be freed automatically by the garbage collector. Something similar to a memory leak may still occur if a programmer's code holds a reference to an object that is no longer needed, typically when objects that are no longer needed are stored in containers that are still in use. If methods for a non-existent object are called, a null pointer exception is thrown.</p>
<p>One of the ideas behind Java's automatic memory management model is that programmers can be spared the burden of having to perform manual memory management. In some languages, memory for the creation of objects is implicitly allocated on the stack or explicitly allocated and deallocated from the heap. In the latter case, the responsibility of managing memory resides with the programmer. If the program does not deallocate an object, a memory leak occurs. If the program attempts to access or deallocate memory that has already been deallocated, the result is undefined and difficult to predict, and the program is likely to become unstable or crash. This can be partially remedied by the use of smart pointers, but these add overhead and complexity. Garbage collection does not prevent logical memory leaks, i.e. those where the memory is still referenced but never used.</p>
<p>Garbage collection may happen at any time. Ideally, it will occur when a program is idle. It is guaranteed to be triggered if there is insufficient free memory on the heap to allocate a new object; this can cause a program to stall momentarily. Explicit memory management is not possible in Java.</p>
<p>Java does not support C/C++ style pointer arithmetic, where object addresses can be arithmetically manipulated (e.g. by adding or subtracting an offset). This allows the garbage collector to relocate referenced objects and ensures type safety and security.</p>
<p>As in C++ and some other object-oriented languages, variables of Java's primitive data types are either stored directly in fields (for objects) or on the stack (for methods) rather than on the heap, as is commonly true for non-primitive data types (but see escape analysis). This was a conscious decision by Java's designers for performance reasons.</p>
<p>Java contains multiple types of garbage collectors. Since Java 9, HotSpot uses the Garbage First Garbage Collector (G1GC) as the default. However, there are also several other garbage collectors that can be used to manage the heap. For most applications in Java, G1GC is sufficient. Previously, the Parallel Garbage Collector was used in Java 8.</p>
<p>Having solved the memory management problem does not relieve the programmer of the burden of handling properly other kinds of resources, like network or database connections, file handles, etc., especially in the presence of exceptions.</p>
</section>
</section>
<section data-mw-section-id="10">
<h2>Syntax</h2>
<p>The syntax of Java is largely influenced by C++ and C. Unlike C++, which combines the syntax for structured, generic, and object-oriented programming, Java was built almost exclusively as an object-oriented language. All code is written inside classes, and every data item is an object, with the exception of the primitive data types, (i.e. integers, floating-point numbers, boolean values, and characters), which are not objects for performance reasons. Java reuses some popular aspects of C++ (such as the <code class="mw-highlight mw-highlight-lang-java mw-content-ltr"> printf </code> method).</p>
<p>Unlike C++, Java does not support operator overloading or multiple inheritance for classes, though multiple inheritance is supported for interfaces.</p>
<p>Java uses comments similar to those of C++. There are three different styles of comments: a single line style marked with two slashes (<code>//</code>), a multiple line style opened with <code>/*</code> and closed with <code>*/</code>, and the Javadoc commenting style opened with <code>/**</code> and closed with <code>*/</code>. The Javadoc style of commenting allows the user to run the Javadoc executable to create documentation for the program and can be read by some integrated development environments (IDEs) such as Eclipse to allow developers to access documentation within the IDE.</p>
<section data-mw-section-id="11">
<h3>Hello world example</h3>
<p>The traditional Hello world program can be written in Java as:</p>
<div class="mw-highlight mw-highlight-lang-java mw-content-ltr mw-highlight-lines">
<pre>public class Main {
    public static void main(String[] args) {
        System.out.println("Hello World!"); // Prints the string to the console.
    }
}
</pre>
</div>
<p>All source files must be named after the public class they contain, appending the suffix <code>.java</code>, for example, <code>HelloWorldApp.java</code>. It must first be compiled into bytecode, using a Java compiler, producing a file with the <code>.class</code> suffix (<code>Main.class</code>, in this case). Only then can it be executed or launched. The Java source file may only contain one public class, but it can contain multiple classes with a non-public access modifier and any number of public inner classes. When the source file contains multiple classes, it is necessary to make one class (introduced by the <code><b>class</b></code> keyword) public (preceded by the <code><b>public</b></code> keyword) and name the source file with that public class name.</p>
<p>A class that is not declared public may be stored in any <code>.java</code> file. The compiler will generate a class file for each class defined in the source file. The name of the class file is the name of the class, with <i>.class</i> appended. For class file generation, anonymous classes are treated as if their name were the concatenation of the name of their enclosing class, a <i>$</i>, and an integer.</p>
<p>The keyword <code><b>public</b></code> denotes that a method can be called from code in other classes, or that a class may be used by classes outside the class hierarchy. The class hierarchy is related to the name of the directory in which the .java file is located. This is called an access level modifier. Other access level modifiers include the keywords <code><b>private</b></code> (a method that can only be accessed in the same class) and <code><b>protected</b></code> (which allows code from the same package to access). If a piece of code attempts to access private methods or protected methods, the JVM will throw a <code>SecurityException</code>.</p>
<p>The keyword <code><b>static</b></code> in front of a method indicates a static method, which is associated only with the class and not with any specific instance of that class. Only static methods can be invoked without a reference to an object. Static methods cannot access any class members that are not also static. Methods that are not designated static are instance methods and require a specific instance of a class to operate.</p>
<p>The keyword <code><b>void</b></code> indicates that the main method does not return any value to the caller. If a Java program is to exit with an error code, it must call <code>System.exit()</code> explicitly.</p>
<p>The method name <code>main</code> is not a keyword in the Java language. It is simply the name of the method the Java launcher calls to pass control to the program. Java classes that run in managed environments such as applets and Enterprise JavaBeans do not use or need a <code>main()</code> method. A Java program may contain multiple classes that have <code>main</code> methods, which means that the VM needs to be explicitly told which class to launch from.</p>
<p>The main method must accept an array of <b><code>String</code></b> objects. By convention, it is referenced as <code><b>args</b></code> although any other legal identifier name can be used. Since Java 5, the main method can also use variable arguments, in the form of <code>public static void main(String... args)</code>, allowing the main method to be invoked with an arbitrary number of <code>String</code> arguments. The effect of this alternate declaration is semantically identical (to the <code>args</code> parameter which is still an array of <code>String</code> objects), but it allows an alternative syntax for creating and passing the array.</p>
<p>The Java launcher launches Java by loading a given class (specified on the command line or as an attribute in a JAR) and starting its <code>public static void main(String[])</code> method. Stand-alone programs must declare this method explicitly. The <code>String[] args</code> parameter is an array of <code>String</code> objects containing any arguments passed to the class. The parameters to <code>main</code> are often passed by means of a command line.</p>
<p>Printing is part of a Java standard library: The <b><code>System</code></b> class defines a public static field called <b><code>out</code></b>. The <code>out</code> object is an instance of the <code>PrintStream</code> class and provides many methods for printing data to standard out, including <b><code>println(String)</code></b> which also appends a new line to the passed string.</p>
<p>The string <code>"Hello World!"</code> is automatically converted to a String object by the compiler.</p>
</section>
<section data-mw-section-id="12">
<h3>Example with methods</h3>
<div class="mw-highlight mw-highlight-lang-java mw-content-ltr mw-highlight-lines">
<pre>// This is an example of a single line comment using two slashes
/*
* This is an example of a multiple line comment using the slash and asterisk.
* This type of comment can be used to hold a lot of information or deactivate
* code, but it is very important to remember to close the comment.
*/
package fibsandlies;
import java.util.Map;
import java.util.HashMap;
/**
* This is an example of a Javadoc comment; Javadoc can compile documentation
* from this text. Javadoc comments must immediately precede the class, method,
* or field being documented.
* @author Wikipedia Volunteers
*/
public class FibCalculator extends Fibonacci implements Calculator {
    private static Map&lt;Integer, Integer&gt; memoized = new HashMap&lt;&gt;();
/*
* The main method written as follows is used by the JVM as a starting point
* for the program.
*/
    public static void main(String[] args) {
        memoized.put(1, 1);
        memoized.put(2, 1);
        System.out.println(fibonacci(12)); // Get the 12th Fibonacci number and print to console
    }
/**
* An example of a method written in Java, wrapped in a class.
* Given a non-negative number FIBINDEX, returns
* the Nth Fibonacci number, where N equals FIBINDEX.
*
* @param fibIndex The index of the Fibonacci number
* @return the Fibonacci number
*/
    public static int fibonacci(int fibIndex) {
        if (memoized.containsKey(fibIndex)) {
            return memoized.get(fibIndex);
        }
        int answer = fibonacci(fibIndex - 1) + fibonacci(fibIndex - 2);
        memoized.put(fibIndex, answer);
        return answer;
    }
}
</pre>
</div>
</section>
</section>
<section data-mw-section-id="13">
<h2>Special classes</h2>
<section data-mw-section-id="14">
<h3>Applet</h3>
<p>Java applets were programs that were embedded in other applications, typically in a Web page displayed in a web browser. The Java applet API is now deprecated since Java 9 in 2017.</p>
</section>
<section data-mw-section-id="15">
<h3>Servlet</h3>
<p>Java servlet technology provides Web developers with a simple, consistent mechanism for extending the functionality of a Web server and for accessing existing business systems. Servlets are server-side Java EE components that generate responses to requests from clients. Most of the time, this means generating HTML pages in response to HTTP requests, although there are a number of other standard servlet classes available, for example for WebSocket communication.</p>
<p>The Java servlet API has to some extent been superseded (but still used under the hood) by two standard Java technologies for web services:</p>
<ul>
<li>the <a href="Java_API_for_RESTful_Web_Services" class="mw-redirect">Java API for RESTful Web Services</a> (JAX-RS 2.0) useful for AJAX, JSON and REST services, and</li>
<li>the <a href="Java_API_for_XML_Web_Services" class="mw-redirect">Java API for XML Web Services</a> (JAX-WS) useful for <a href="SOAP">SOAP</a> <a href="Web_Service" class="mw-redirect">Web Services</a>.</li>
</ul>
<p>Typical implementations of these APIs on Application Servers or Servlet Containers use a standard servlet for handling all interactions with the HTTP requests and responses that delegate to the web service methods for the actual business logic.</p>
</section>
<section data-mw-section-id="16">
<h3>JavaServer Pages</h3>
<p>JavaServer Pages (JSP) are server-side Java EE components that generate responses, typically HTML pages, to HTTP requests from clients. JSPs embed Java code in an HTML page by using the special delimiters <code>&lt;%</code> and <code>%&gt;</code>. A JSP is compiled to a Java <i>servlet</i>, a Java application in its own right, the first time it is accessed. After that, the generated servlet creates the response.</p>
</section>
<section data-mw-section-id="17">
<h3>Swing application</h3>
<p>Swing is a graphical user interface library for the Java SE platform. It is possible to specify a different look and feel through the pluggable look and feel system of Swing. Clones of Windows, GTK+, and Motif are supplied by Sun. Apple also provides an Aqua look and feel for macOS. Where prior implementations of these looks and feels may have been considered lacking, Swing in Java SE 6 addresses this problem by using more native GUI widget drawing routines of the underlying platforms.</p>
</section>
<section data-mw-section-id="18">
<h3>JavaFX application</h3>
<p>JavaFX is a software platform for creating and delivering desktop applications, as well as rich web applications that can run across a wide variety of devices. JavaFX is intended to replace Swing as the standard GUI library for Java SE, but since JDK 11 JavaFX has not been in the core JDK and instead in a separate module. JavaFX has support for desktop computers and web browsers on Microsoft Windows, Linux, and macOS. JavaFX does not have support for native OS look and feels.</p>
</section>
<section data-mw-section-id="19">
<h3>Generics</h3>
<p>In 2004, generics were added to the Java language, as part of J2SE 5.0. Prior to the introduction of generics, each variable declaration had to be of a specific type. For container classes, for example, this is a problem because there is no easy way to create a container that accepts only specific types of objects. Either the container operates on all subtypes of a class or interface, usually <code>Object</code>, or a different container class has to be created for each contained class. Generics allow compile-time type checking without having to create many container classes, each containing almost identical code. In addition to enabling more efficient code, certain runtime exceptions are prevented from occurring, by issuing compile-time errors. If Java prevented all runtime type errors (<code>ClassCastException</code>s) from occurring, it would be type safe.</p>
<p>In 2016, the type system of Java was proven unsound in that it is possible to use generics to construct classes and methods that allow assignment of an instance one class to a variable of another unrelated class. Such code is accepted by the compiler, but fails at run time with a class cast exception.</p>
</section>
</section>
<section data-mw-section-id="20">
<h2>Criticism</h2>
<p>Criticisms directed at Java include the implementation of generics, speed, the handling of unsigned numbers, the implementation of floating-point arithmetic, and a history of security vulnerabilities in the primary Java VM implementation HotSpot.</p>
</section>
<section data-mw-section-id="21">
<h2>Class libraries</h2>
<p>The Java Class Library is the standard library, developed to support application development in Java. It is controlled by Oracle in cooperation with others through the Java Community Process program. Companies or individuals participating in this process can influence the design and development of the APIs. This process has been a subject of controversy during the 2010s. The class library contains features such as:</p>
<ul>
<li>The core libraries, which include:
<ul>
<li>IO/<a href="https://docs.oracle.com/javase/8/docs/api/java/nio/package-summary.html" class="external text external">NIO</a></li>
<li><a href="https://docs.oracle.com/javase/8/docs/technotes/guides/net/index.html" class="external text external">Networking</a> (NOTE: new <a href="https://docs.oracle.com/en/java/javase/11/docs/api/java.net.http/java/net/http/HttpClient.html" class="external text external">HTTP Client</a> since Java 11)</li>
<li><a href="Reflection_(computer_programming)" class="mw-redirect">Reflection</a></li>
<li><a href="Concurrent_computing">Concurrency</a></li>
<li><a href="Generics_in_Java">Generics</a></li>
<li>Scripting/Compiler</li>
<li><a href="Functional_programming">Functional programming</a> (Lambda, Streaming)</li>
<li><a href="Java_collections_framework">Collection libraries</a> that implement <a href="Data_structure">data structures</a> such as <a href="List_(abstract_data_type)">lists</a>, <a href="Associative_array">dictionaries</a>, <a href="Tree_structure">trees</a>, <a href="Set_(abstract_data_type)">sets</a>, <a href="Queue_(abstract_data_type)">queues</a> and <a href="Double-ended_queue">double-ended queue</a>, or <a href="Stack_(abstract_data_type)">stacks</a></li>
<li><a href="XML">XML</a> Processing (Parsing, Transforming, Validating) libraries</li>
<li><a href="Computer_security">Security</a></li>
<li><a href="Internationalization_and_localization">Internationalization and localization</a> libraries</li>
</ul></li>
<li>The integration libraries, which allow the application writer to communicate with external systems. These libraries include:
<ul>
<li>The <a href="Java_Database_Connectivity">Java Database Connectivity</a> (JDBC) <a href="Application_programming_interface" class="mw-redirect">API</a> for database access</li>
<li><a href="Java_Naming_and_Directory_Interface">Java Naming and Directory Interface</a> (JNDI) for lookup and discovery</li>
<li><a href="Java_remote_method_invocation">Java remote method invocation</a> (RMI) and <a href="Common_Object_Request_Broker_Architecture">Common Object Request Broker Architecture</a> (CORBA) for distributed application development</li>
<li><a href="Java_Management_Extensions">Java Management Extensions</a> (JMX) for managing and monitoring applications</li>
</ul></li>
<li><a href="User_interface">User interface</a> libraries, which include:
<ul>
<li>The (heavyweight, or <a href="Native_(computing)">native</a>) <a href="Abstract_Window_Toolkit">Abstract Window Toolkit</a> (AWT), which provides <a href="Graphical_user_interface">GUI</a> components, the means for laying out those components and the means for handling events from those components</li>
<li>The (lightweight) <a href="Swing_(Java)">Swing</a> libraries, which are built on AWT but provide (non-native) implementations of the AWT widgetry</li>
<li>APIs for audio capture, processing, and playback</li>
<li><a href="JavaFX">JavaFX</a></li>
</ul></li>
<li>A platform dependent implementation of the Java virtual machine that is the means by which the bytecodes of the Java libraries and third party applications are executed</li>
<li>Plugins, which enable <a href="Java_applet">applets</a> to be run in web browsers</li>
<li><a href="Java_Web_Start">Java Web Start</a>, which allows Java applications to be efficiently distributed to <a href="End_user">end users</a> across the Internet</li>
<li>Licensing and documentation</li>
</ul>
</section>
<section data-mw-section-id="22">
<h2>Documentation</h2>
<p>Javadoc is a comprehensive documentation system, created by Sun Microsystems. It provides developers with an organized system for documenting their code. Javadoc comments have an extra asterisk at the beginning, i.e. the delimiters are <code>/**</code> and <code>*/</code>, whereas the normal multi-line comments in Java are delimited by <code>/*</code> and <code>*/</code>, and single-line comments start with <code>//</code>.</p>
</section>
<section data-mw-section-id="23">
<h2>Implementations</h2>
<p>Oracle Corporation is the current owner of the official implementation of the Java SE platform, following their acquisition of Sun Microsystems on January 27, 2010. This implementation is based on the original implementation of Java by Sun. The Oracle implementation is available for Microsoft Windows (still works for XP, while only later versions are currently officially supported), macOS, Linux, and Solaris. Because Java lacks any formal standardization recognized by Ecma International, ISO/IEC, ANSI, or other third-party standards organizations, the Oracle implementation is the de facto standard.</p>
<p>The Oracle implementation is packaged into two different distributions: The Java Runtime Environment (JRE) which contains the parts of the Java SE platform required to run Java programs and is intended for end users, and the Java Development Kit (JDK), which is intended for software developers and includes development tools such as the Java compiler, Javadoc, Jar, and a debugger. Oracle has also released GraalVM, a high performance Java dynamic compiler and interpreter.</p>
<p>OpenJDK is another notable Java SE implementation that is licensed under the GNU GPL. The implementation started when Sun began releasing the Java source code under the GPL. As of Java SE 7, OpenJDK is the official Java reference implementation.</p>
<p>The goal of Java is to make all implementations of Java compatible. Historically, Sun's trademark license for usage of the Java brand insists that all implementations be <i>compatible</i>. This resulted in a legal dispute with Microsoft after Sun claimed that the Microsoft implementation did not support Java remote method invocation (RMI) or Java Native Interface (JNI) and had added platform-specific features of their own. Sun sued in 1997, and, in 2001, won a settlement of US$20 million, as well as a court order enforcing the terms of the license from Sun. As a result, Microsoft no longer ships Java with Windows.</p>
<p>Platform-independent Java is essential to Java EE, and an even more rigorous validation is required to certify an implementation. This environment enables portable server-side applications.</p>
</section>
<section data-mw-section-id="24">
<h2>Use outside the Java platform</h2>
<p>The Java programming language requires the presence of a software platform in order for compiled programs to be executed.</p>
<p>Oracle supplies the Java platform for use with Java. The Android SDK is an alternative software platform, used primarily for developing Android applications with its own GUI system.</p>
<section data-mw-section-id="25">
<h3>Android</h3>
<p>The Java language is a key pillar in Android, an open source mobile operating system. Although Android, built on the Linux kernel, is written largely in C, the Android SDK uses the Java language as the basis for Android applications but does not use any of its standard GUI, SE, ME or other established Java standards. The bytecode language supported by the Android SDK is incompatible with Java bytecode and runs on its own virtual machine, optimized for low-memory devices such as smartphones and tablet computers. Depending on the Android version, the bytecode is either interpreted by the Dalvik virtual machine or compiled into native code by the Android Runtime.</p>
<p>Android does not provide the full Java SE standard library, although the Android SDK does include an independent implementation of a large subset of it. It supports Java 6 and some Java 7 features, offering an implementation compatible with the standard library (Apache Harmony).</p>
<section data-mw-section-id="26">
<h4>Controversy</h4>
<p>The use of Java-related technology in Android led to a legal dispute between Oracle and Google. On May 7, 2012, a San Francisco jury found that if APIs could be copyrighted, then Google had infringed Oracle's copyrights by the use of Java in Android devices. District Judge William Alsup ruled on May 31, 2012, that APIs cannot be copyrighted, but this was reversed by the United States Court of Appeals for the Federal Circuit in May 2014. On May 26, 2016, the district court decided in favor of Google, ruling that the copyright infringement of the Java API in Android constituted fair use. In March 2018, this ruling was overturned by the Appeals Court, which remanded the case to the federal court in San Francisco to determine damages. Google filed a petition for writ of certiorari with the Supreme Court of the United States in January 2019 to challenge the two rulings that were made by the Appeals Court in Oracle's favor. On April 5, 2021, the Court ruled 6-2 in Google's favor, that its use of Java APIs should be considered fair use. However, the court refused to rule on the copyrightability of APIs, choosing instead to determine their ruling by considering Java's API copyrightable "purely for argument's sake."</p>
</section>
</section>
</section>
<section data-mw-section-id="27">
<h2>See also</h2>
<ul>
<li><a href="C_Sharp_(programming_language)">C#</a></li>
<li><a href="C%2B%2B">C++</a></li>
<li><a href="Dalvik_(software)">Dalvik</a>, used in old Android versions, replaced by non-JIT <a href="Android_Runtime">Android Runtime</a></li>
<li><a href="Java_Heterogeneous_Distributed_Computing">Java Heterogeneous Distributed Computing</a></li>
<li><a href="List_of_Java_virtual_machines">List of Java virtual machines</a></li>
<li><a href="List_of_Java_APIs">List of Java APIs</a></li>
<li><a href="List_of_JVM_languages">List of JVM languages</a></li>
</ul>
<section data-mw-section-id="28">
<h3>Comparison of Java with other languages</h3>
<ul>
<li><a href="Comparison_of_C_Sharp_and_Java">Comparison of C# and Java</a></li>
<li><a href="Comparison_of_Java_and_C%2B%2B">Comparison of Java and C++</a></li>
<li><a href="Comparison_of_programming_languages">Comparison of programming languages</a></li>
</ul>
</section>
</section>
<section data-mw-section-id="29">
</section>
<section data-mw-section-id="30">
</section>
<section data-mw-section-id="31">
<h2>External links</h2>
<ul>
<li><a href="https://www.oracle.com/java/" class="external text external">Official Website</a></li>
<li> The dictionary definition of <a href="https://en.wiktionary.org/wiki/Java" class="extiw external"><i>Java</i></a> at Wiktionary</li>
<li> Media related to <a href="https://commons.wikimedia.org/wiki/Category:Java%20(programming%20language)" class="extiw external">Java</a> at Wikimedia Commons</li>
<li> <a href="https://en.wikibooks.org/wiki/Java%20Programming" class="extiw external">Java Programming</a> at Wikibooks</li>
<li> Learning materials related to <a href="https://en.wikiversity.org/wiki/Java" class="extiw external">Java</a> at Wikiversity</li>
<li><a href="https://discu.eu/weekly/java/" class="external text external">Java Weekly</a></li>
</ul>
</section>
</div>
</section>
<section id="side">
<div id="links">
<h2>Search</h2>
<form action="/search" method="get">
<div class="search-form">
<input type="text" name="q" placeholder="Search" value="">
<input type="submit" value="Search">
</div>
</form>
<h2>Index</h2>
<ul>
<li><a href="/article/Java_Pony">Java Pony</a></li>
<li><a href="/article/Java_Portlet_Specification">Java Portlet Specification</a></li>
<li><a href="/article/Java_processor">Java processor</a></li>
<li><b>Java (programming language)</b></li>
<li><a href="/article/Java_razorfish">Java razorfish</a></li>
<li><a href="/article/Java_remote_method_invocation">Java remote method invocation</a></li>
<li><a href="/article/Java_Research_License">Java Research License</a></li>
</ul>
</div>
</section>
</article>
<footer>
This encyclopedia contains articles issued from <a rel="nofollow" href="https://en.wikipedia.org/">Wikipedia</a>.
The text is licensed under <a rel="nofollow" href="https://en.wikipedia.org/wiki/Wikipedia:Text_of_the_Creative_Commons_Attribution-ShareAlike_3.0_Unported_License">CC BY-SA 3.0</a>.
The Wikipedia contents are from OpenZIM dumps, which typically lag behind the main Wikipedia project by up to a year.
</footer>
</body>
</html>

View File

@ -8,6 +8,9 @@ import org.apache.commons.lang3.StringUtils;
*/
public record ContentType(String contentType, String charset) {
public static ContentType parse(String contentTypeHeader) {
if (contentTypeHeader == null || contentTypeHeader.isBlank())
return new ContentType(null, null);
String[] parts = StringUtils.split(contentTypeHeader, ";", 2);
String contentType = parts[0].trim();
String charset = parts.length > 1 ? parts[1].trim() : "UTF-8";

View File

@ -8,7 +8,11 @@ public class DocumentBodyToString {
public static String getStringData(ContentType type, byte[] data) {
Charset charset;
try {
charset = Charset.forName(type.charset());
if (type.charset() == null || type.charset().isBlank())
charset = StandardCharsets.UTF_8;
else {
charset = Charset.forName(type.charset());
}
}
catch (IllegalCharsetNameException ex) {
// Fall back to UTF-8 if we don't understand what this is. It's *probably* fine? Maybe?

View File

@ -17,6 +17,8 @@ dependencies {
implementation project(':code:common:db')
implementation project(':code:common:model')
implementation project(':code:common:service')
implementation project(':code:common:service-client')
implementation project(':code:api:query-api')
implementation libs.bundles.slf4j
implementation libs.bundles.mariadb

View File

@ -5,6 +5,7 @@ import com.google.inject.Singleton;
import com.zaxxer.hikari.HikariDataSource;
import nu.marginalia.db.DomainBlacklistImpl;
import nu.marginalia.model.crawl.DomainIndexingState;
import nu.marginalia.query.client.QueryClient;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -15,14 +16,18 @@ import java.util.function.IntConsumer;
@Singleton
public class RankingDomainFetcher {
protected final HikariDataSource dataSource;
private final QueryClient queryClient;
protected final DomainBlacklistImpl blacklist;
protected final Logger logger = LoggerFactory.getLogger(getClass());
protected boolean getNames = false;
@Inject
public RankingDomainFetcher(HikariDataSource dataSource, DomainBlacklistImpl blacklist) {
public RankingDomainFetcher(HikariDataSource dataSource,
QueryClient queryClient,
DomainBlacklistImpl blacklist) {
this.dataSource = dataSource;
this.queryClient = queryClient;
this.blacklist = blacklist;
}
@ -33,10 +38,10 @@ public class RankingDomainFetcher {
public void getDomains(Consumer<RankingDomainData> consumer) {
String query;
if (getNames) {
query = "SELECT EC_DOMAIN.ID,DOMAIN_NAME,DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID INNER JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND IS_ALIVE) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND SOURCE_DOMAIN_ID!=DEST_DOMAIN_ID GROUP BY EC_DOMAIN.ID";
query = "SELECT EC_DOMAIN.ID,DOMAIN_NAME,DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID WHERE NODE_AFFINITY>0 GROUP BY EC_DOMAIN.ID";
}
else {
query = "SELECT EC_DOMAIN.ID,\"\",DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID INNER JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND IS_ALIVE) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND SOURCE_DOMAIN_ID!=DEST_DOMAIN_ID GROUP BY EC_DOMAIN.ID";
query = "SELECT EC_DOMAIN.ID,\"\",DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID WHERE NODE_AFFINITY>0 GROUP BY EC_DOMAIN.ID";
}
getDomains(query, consumer);
@ -77,23 +82,14 @@ public class RankingDomainFetcher {
}
public void eachDomainLink(DomainLinkConsumer consumer) {
try (var conn = dataSource.getConnection();
var stmt = conn.prepareStatement("SELECT SOURCE_DOMAIN_ID, DEST_DOMAIN_ID FROM EC_DOMAIN_LINK"))
{
stmt.setFetchSize(10000);
var rsp = stmt.executeQuery();
var allLinks = queryClient.getAllDomainLinks();
var iter = allLinks.iterator();
while (rsp.next()) {
int src = rsp.getInt(1);
int dst = rsp.getInt(2);
consumer.accept(src, dst);
}
}
catch (SQLException ex) {
logger.error("Failed to fetch domain links", ex);
while (iter.advance()) {
consumer.accept(iter.source(), iter.dest());
}
}
public void domainsByPattern(String pattern, IntConsumer idConsumer) {

View File

@ -4,6 +4,7 @@ import com.google.inject.Inject;
import com.google.inject.Singleton;
import com.zaxxer.hikari.HikariDataSource;
import nu.marginalia.db.DomainBlacklistImpl;
import nu.marginalia.query.client.QueryClient;
import org.slf4j.LoggerFactory;
import java.sql.SQLException;
@ -14,8 +15,8 @@ public class RankingDomainFetcherForSimilarityData extends RankingDomainFetcher
final boolean hasData;
@Inject
public RankingDomainFetcherForSimilarityData(HikariDataSource dataSource, DomainBlacklistImpl blacklist) {
super(dataSource, blacklist);
public RankingDomainFetcherForSimilarityData(HikariDataSource dataSource, QueryClient queryClient, DomainBlacklistImpl blacklist) {
super(dataSource, queryClient, blacklist);
hasData = isDomainNeighborTablePopulated(dataSource);
}
@ -61,17 +62,6 @@ public class RankingDomainFetcherForSimilarityData extends RankingDomainFetcher
}
public void getDomains(Consumer<RankingDomainData> consumer) {
// String query =
// """
// SELECT EC_DOMAIN.ID,DOMAIN_NAME,DOMAIN_ALIAS,STATE,COALESCE(KNOWN_URLS, 0)
// FROM EC_DOMAIN
// LEFT JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
// INNER JOIN EC_DOMAIN_LINK ON DEST_DOMAIN_ID=EC_DOMAIN.ID
// WHERE SOURCE_DOMAIN_ID!=DEST_DOMAIN_ID
// GROUP BY EC_DOMAIN.ID
// HAVING COUNT(SOURCE_DOMAIN_ID)>5
// """;
String query;
if (getNames) {
query =

View File

@ -1,71 +0,0 @@
package nu.marginalia.ranking.tool;
import com.zaxxer.hikari.HikariDataSource;
import lombok.SneakyThrows;
import nu.marginalia.db.DomainBlacklistImpl;
import nu.marginalia.ranking.StandardPageRank;
import nu.marginalia.ranking.accumulator.RankingResultListAccumulator;
import nu.marginalia.ranking.data.RankingDomainFetcherForSimilarityData;
import nu.marginalia.service.module.DatabaseModule;
import org.mariadb.jdbc.Driver;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.sql.SQLException;
import java.util.concurrent.LinkedBlockingQueue;
/** One-off CLI tool: runs a similarity-data-driven personalized PageRank and
 *  stores the top 1000 domain ids as "browse" candidates (DOMAIN_SET 3) in
 *  EC_RANDOM_DOMAINS.  Database writes happen on a separate uploader thread
 *  fed through a bounded hand-off queue. */
public class CreateBrowseDomainRanksTool {
    private static final Logger logger = LoggerFactory.getLogger(CreateBrowseDomainRanksTool.class);

    // Hand-off between the ranking producer (main) and the uploader consumer;
    // bounded to 10 so ranking cannot run arbitrarily far ahead of the DB writes.
    static final LinkedBlockingQueue<Integer> uploadQueue = new LinkedBlockingQueue<>(10);
    // Flipped to false by main() once all results have been queued.
    volatile static boolean running = true;

    @SneakyThrows
    public static void main(String... args) {
        // Instantiated for its side effect of registering the MariaDB JDBC driver.
        Driver driver = new Driver();
        var conn = new DatabaseModule().provideConnection();
        long start = System.currentTimeMillis();
        var uploader = new Thread(() -> uploadThread(conn), "Uploader");
        logger.info("Ranking");
        // NOTE(review): a second, independent data source is created here even though
        // 'conn' above is still in use by the uploader -- looks like unintended duplication.
        var ds = new DatabaseModule().provideConnection();
        var domains = new RankingDomainFetcherForSimilarityData(ds, new DomainBlacklistImpl(ds));
        // Program arguments are passed through as origin domains for the personalized pagerank.
        var rpr = new StandardPageRank(domains, args);
        uploader.start();
        var rankData = rpr.pageRankWithPeripheralNodes(1000, RankingResultListAccumulator::new);
        // Feed ranked domain ids to the uploader; put() blocks while the queue is full.
        rankData.forEach(i -> {
            try {
                uploadQueue.put(i);
            } catch (InterruptedException e) {
                e.printStackTrace();
            }
            return true;
        });
        long end = System.currentTimeMillis();
        running = false;
        // NOTE(review): if the uploader has drained the queue and is blocked in take()
        // when 'running' flips to false, this join() can hang forever -- verify shutdown path.
        uploader.join();
        logger.info("Done in {}", (end - start)/1000.0);
    }

    /** Consumer loop: drains uploadQueue and inserts each domain id into
     *  EC_RANDOM_DOMAINS under DOMAIN_SET=3 until the producer signals completion. */
    public static void uploadThread(HikariDataSource dataSource) {
        try (var conn = dataSource.getConnection()) {
            try (var stmt = conn.prepareStatement("INSERT IGNORE INTO EC_RANDOM_DOMAINS(DOMAIN_SET, DOMAIN_ID) VALUES (3, ?)")) {
                // Keep consuming while the producer is active, then drain whatever remains.
                while (running || (!running && !uploadQueue.isEmpty())) {
                    var job = uploadQueue.take();
                    stmt.setInt(1, job);
                    stmt.executeUpdate();
                }
            }
        } catch (SQLException | InterruptedException throwables) {
            throwables.printStackTrace();
        }
    }
}

View File

@ -1,264 +0,0 @@
package nu.marginalia.ranking.tool;
import com.zaxxer.hikari.HikariDataSource;
import gnu.trove.list.TIntList;
import gnu.trove.list.array.TIntArrayList;
import gnu.trove.map.hash.TIntIntHashMap;
import gnu.trove.map.hash.TIntObjectHashMap;
import it.unimi.dsi.fastutil.ints.IntArrays;
import it.unimi.dsi.fastutil.ints.IntComparator;
import lombok.AllArgsConstructor;
import lombok.SneakyThrows;
import nu.marginalia.ranking.RankingAlgorithm;
import nu.marginalia.ranking.data.RankingDomainData;
import nu.marginalia.ranking.data.RankingDomainFetcher;
import nu.marginalia.db.DomainBlacklistImpl;
import nu.marginalia.service.module.DatabaseModule;
import org.jetbrains.annotations.NotNull;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.sql.SQLException;
import java.util.Arrays;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.stream.IntStream;
/** Standalone tool that computes, for each domain, a personalized PageRank
 *  seeded on that domain and stores the domain's top 25 ranking neighbors in
 *  EC_DOMAIN_NEIGHBORS.  All domain metadata and link data is loaded into
 *  memory up front; domains are addressed through a dense index to keep the
 *  rank vectors as flat double arrays. */
public class PerusePageRankV2 {
    // Domain metadata keyed on database domain id.
    final TIntObjectHashMap<RankingDomainData> domainsById = new TIntObjectHashMap<>();
    // Bidirectional mapping between dense array indices and database domain ids.
    final TIntIntHashMap domainIndexToId = new TIntIntHashMap();
    final TIntIntHashMap domainIdToIndex = new TIntIntHashMap();
    // Adjacency lists indexed by dense domain index; a null slot means "no links".
    TIntArrayList[] linkDataSrc2Dest;
    TIntArrayList[] linkDataDest2Src;
    private final Logger logger = LoggerFactory.getLogger(getClass());

    // Hand-off between the parallel ranking workers and the single uploader thread.
    static final LinkedBlockingQueue<LinkAdjacencies> uploadQueue = new LinkedBlockingQueue<>(10);
    // Flipped to false by main() once all domains have been ranked.
    volatile static boolean running = true;

    /** Number of domains loaded (= size of the dense index space). */
    public int indexMax() {
        return domainIndexToId.size();
    }

    /** Translates a dense index back to the database domain id. */
    public int getDomainId(int idx) {
        return domainIndexToId.get(idx);
    }

    @SneakyThrows
    public static void main(String... args) {
        var ds = new DatabaseModule().provideConnection();
        var blacklist = new DomainBlacklistImpl(ds);
        var rank = new PerusePageRankV2(new RankingDomainFetcher(ds, blacklist));
        long start = System.currentTimeMillis();
        var uploader = new Thread(() -> uploadThread(ds));
        uploader.start();
        // Rank every domain in parallel; each worker queues its neighbor list for upload.
        IntStream.range(0, rank.indexMax()).parallel().forEach(i -> {
            int[] ids = rank.pageRank(i, 25).toArray();
            try {
                uploadQueue.put(new LinkAdjacencies(rank.getDomainId(i), ids));
            } catch (InterruptedException e) {
                e.printStackTrace();
            }
        });
        long end = System.currentTimeMillis();
        running = false;
        // NOTE(review): if the uploader is blocked in take() on an empty queue when
        // 'running' flips to false, this join() can hang -- verify shutdown path.
        uploader.join();
        System.out.printf("%2.2f", (end - start)/1000.0);
    }

    /** A domain id together with its ranked neighbor ids, in rank order. */
    @AllArgsConstructor
    static class LinkAdjacencies {
        public final int id;
        public final int[] neighbors;
    }

    /** Consumer loop: batches each neighbor list into EC_DOMAIN_NEIGHBORS,
     *  using the array position as ADJ_IDX (the neighbor's rank). */
    public static void uploadThread(HikariDataSource dataSource) {
        try (var conn = dataSource.getConnection()) {
            try (var stmt = conn.prepareStatement("INSERT INTO EC_DOMAIN_NEIGHBORS(DOMAIN_ID, NEIGHBOR_ID, ADJ_IDX) VALUES (?,?,?) ON DUPLICATE KEY UPDATE NEIGHBOR_ID=VALUES(NEIGHBOR_ID)")) {
                while (running || (!running && !uploadQueue.isEmpty())) {
                    var job = uploadQueue.take();
                    for (int i = 0; i < job.neighbors.length; i++) {
                        stmt.setInt(1, job.id);
                        stmt.setInt(2, job.neighbors[i]);
                        stmt.setInt(3, i);
                        stmt.addBatch();
                    }
                    stmt.executeBatch();
                }
            }
        } catch (SQLException | InterruptedException throwables) {
            throwables.printStackTrace();
        }
    }

    /** Loads all (main and peripheral) domains, assigns each a dense index in
     *  load order, then builds forward and reverse adjacency lists from the
     *  link data.  Destination domains are resolved through their alias. */
    public PerusePageRankV2(RankingDomainFetcher domainFetcher) {
        domainFetcher.getDomains(domainData -> {
            int id = domainData.id;
            domainsById.put(id, domainData);
            domainIndexToId.put(domainIndexToId.size(), id);
            domainIdToIndex.put(id, domainIdToIndex.size());
        });
        domainFetcher.getPeripheralDomains(domainData -> {
            int id = domainData.id;
            domainsById.put(id, domainData);
            domainIndexToId.put(domainIndexToId.size(), id);
            domainIdToIndex.put(id, domainIdToIndex.size());
        });
        linkDataSrc2Dest = new TIntArrayList[domainIndexToId.size()];
        linkDataDest2Src = new TIntArrayList[domainIndexToId.size()];
        domainFetcher.eachDomainLink((src, dst) -> {
            if (src == dst) return;  // ignore self-links
            if (domainsById.contains(src) && domainsById.contains(dst)) {
                int srcIdx = domainIdToIndex.get(src);
                // Links pointing at an aliased domain are credited to the alias target.
                int dstIdx = domainIdToIndex.get(domainsById.get(dst).resolveAlias());
                if (linkDataSrc2Dest[srcIdx] == null) {
                    linkDataSrc2Dest[srcIdx] = new TIntArrayList();
                }
                linkDataSrc2Dest[srcIdx].add(dstIdx);
                if (linkDataDest2Src[dstIdx] == null) {
                    linkDataDest2Src[dstIdx] = new TIntArrayList();
                }
                linkDataDest2Src[dstIdx].add(srcIdx);
            }
        });
    }

    /** Runs 10 power iterations of a PageRank personalized on 'origin' (the
     *  dense index of the seed domain); the rank mass lost to damping each
     *  iteration is re-injected at the origin.  The origin's own rank is
     *  subtracted at the end so it does not appear in its own result list. */
    public TIntList pageRank(int origin, int resultCount) {
        RankVector rank = new RankVector(1.d / domainsById.size());
        int iter_max = 10;
        for (int i = 0; i < iter_max; i++) {
            RankVector newRank = createNewRankVector(rank);
            double oldNorm = rank.norm();
            double newNorm = newRank.norm();
            double dNorm = oldNorm - newNorm ;
            // Redistribute the lost rank mass to the origin (personalization step).
            newRank.increment(origin, dNorm/oldNorm);
            rank = newRank;
        }
        rank.increment(origin, -1);
        return rank.getRanking(resultCount);
    }

    /** One power-iteration step: each domain's new rank is the damped (0.85)
     *  sum of its out-neighbors' ranks, each divided by that neighbor's
     *  in-degree, normalized by the old vector's norm. */
    @NotNull
    private RankVector createNewRankVector(RankVector rank) {
        double rankNorm = rank.norm();
        RankVector newRank = new RankVector(0);
        for (int domainId = 0; domainId < domainIndexToId.size(); domainId++) {
            var links = linkDataSrc2Dest[domainId];
            double newRankValue = 0;
            if (links != null && links.size() > 0) {
                for (int j = 0; j < links.size(); j++) {
                    var revLinks = linkDataDest2Src[links.getQuick(j)];
                    newRankValue += rank.get(links.getQuick(j)) / revLinks.size();
                }
            }
            newRank.set(domainId, 0.85*newRankValue/rankNorm);
        }
        return newRank;
    }

    /** Dense rank vector over the domain index space, with L1-norm helpers
     *  and extraction of the top-N ranked domain ids. */
    public class RankVector {
        private final double[] rank;

        // defaultValue != 0 yields a uniform initial distribution.
        public RankVector(double defaultValue) {
            rank = new double[domainIndexToId.size()];
            if (defaultValue != 0.) {
                Arrays.fill(rank, defaultValue);
            }
        }

        public void set(int id, double value) {
            rank[id] = value;
        }

        public void increment(int id, double value) {
            rank[id] += value;
        }

        // Out-of-range reads return 0 rather than throwing.
        public double get(int id) {
            if (id >= rank.length) return 0.;
            return rank[id];
        }

        /** L1 norm (sum of absolute values). */
        public double norm() {
            double v = 0.;
            for (int i = 0; i < rank.length; i++) {
                if (rank[i] > 0) { v+=rank[i]; }
                else { v -= rank[i]; }
            }
            return v;
        }

        /** L1 distance to another rank vector. */
        public double norm(RankingAlgorithm.RankVector other) {
            double v = 0.;
            for (int i = 0; i < rank.length; i++) {
                double dv = rank[i] - other.get(i);
                if (dv > 0) { v+=dv; }
                else { v -= dv; }
            }
            return v;
        }

        /** Returns up to numResults domain ids sorted by descending rank,
         *  skipping aliased domains; numResults < 0 means "all domains". */
        public TIntList getRanking(int numResults) {
            if (numResults < 0) {
                numResults = domainIdToIndex.size();
            }
            TIntArrayList list = new TIntArrayList(numResults);
            int[] nodes = new int[rank.length];
            Arrays.setAll(nodes, i->i);
            IntComparator comp = (i,j) -> (int) Math.signum(rank[j] - rank[i]);
            IntArrays.quickSort(nodes, comp);
            int i;
            for (i = 0; i < numResults; i++) {
                int id = domainIndexToId.get(nodes[i]);
                if (!domainsById.get(id).isAlias())
                    list.add(id);
            }
            // NOTE(review): this top-up loop compares domainsById.size() (constant)
            // against numResults -- probably meant list.size(); verify intent.
            for (; i < nodes.length && domainsById.size() < numResults; i++) {
                int id = domainIndexToId.get(nodes[i]);
                if (!domainsById.get(id).isAlias())
                    list.add(id);
            }
            return list;
        }
    }
}

View File

@ -1,67 +0,0 @@
package nu.marginalia.ranking.tool;
import lombok.SneakyThrows;
import nu.marginalia.ranking.accumulator.RankingResultListAccumulator;
import nu.marginalia.ranking.data.RankingDomainFetcher;
import nu.marginalia.db.DomainBlacklistImpl;
import nu.marginalia.ranking.StandardPageRank;
import nu.marginalia.ranking.data.RankingDomainFetcherForSimilarityData;
import nu.marginalia.service.module.DatabaseModule;
import org.mariadb.jdbc.Driver;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.atomic.AtomicInteger;
/** One-off CLI tool: computes the domain ranking and prints it to stdout as
 *  "position name state" lines, one domain per line.  The data source is
 *  selected by the system property {@code use-link-data}: raw link data when
 *  true, similarity data otherwise.  Program arguments are passed through as
 *  origin domains for the personalized pagerank. */
public class PrintDomainRanksTool {
    private static final Logger logger = LoggerFactory.getLogger(PrintDomainRanksTool.class);

    // Total number of ranked domains; published via volatile since it is set in main.
    private volatile static int rankMax;

    @SneakyThrows
    public static void main(String... args) {
        // Instantiated for its side effect of registering the MariaDB JDBC driver.
        Driver driver = new Driver();

        long start = System.currentTimeMillis();
        logger.info("Ranking");

        // Fix: the original created a second, unused connection pool ('conn')
        // alongside 'ds', and carried dead uploadQueue/running fields copied
        // from the uploader-style sibling tools; this tool only prints.
        var ds = new DatabaseModule().provideConnection();

        RankingDomainFetcher domains;
        if (Boolean.getBoolean("use-link-data")) {
            domains = new RankingDomainFetcher(ds, new DomainBlacklistImpl(ds));
            domains.retainNames();
        }
        else {
            domains = new RankingDomainFetcherForSimilarityData(ds, new DomainBlacklistImpl(ds));
            domains.retainNames();
        }

        var rpr = new StandardPageRank(domains, args);
        rankMax = rpr.size();

        var rankData = rpr.pageRankWithPeripheralNodes(rankMax, RankingResultListAccumulator::new);

        // Print "position name state", numbering domains in rank order.
        AtomicInteger cnt = new AtomicInteger();
        rankData.forEach(i -> {
            var data = rpr.getDomainData(i);
            System.out.printf("%d %s %s\n", cnt.getAndIncrement(), data.name, data.state);
            return true;
        });

        long end = System.currentTimeMillis();
        logger.info("Done in {}", (end - start)/1000.0);
    }
}

View File

@ -1,85 +0,0 @@
package nu.marginalia.ranking.tool;
import com.zaxxer.hikari.HikariDataSource;
import lombok.SneakyThrows;
import nu.marginalia.ranking.StandardPageRank;
import nu.marginalia.ranking.accumulator.RankingResultListAccumulator;
import nu.marginalia.ranking.data.RankingDomainFetcherForSimilarityData;
import nu.marginalia.db.DomainBlacklistImpl;
import nu.marginalia.service.module.DatabaseModule;
import org.mariadb.jdbc.Driver;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.sql.SQLException;
import java.util.concurrent.LinkedBlockingQueue;
/** One-off CLI tool: recomputes the global domain ranking (a personalized
 *  PageRank seeded on a fixed set of known-good domains) and writes the
 *  normalized rank back to EC_DOMAIN.RANK through a separate uploader thread. */
public class UpdateDomainRanksTool {
    private static final Logger logger = LoggerFactory.getLogger(UpdateDomainRanksTool.class);

    // Total number of ranked domains; used by the uploader to normalize ranks into [0,1).
    private volatile static int rankMax;

    // Hand-off between the ranking producer (main) and the uploader consumer;
    // bounded to 10 so ranking cannot run arbitrarily far ahead of the DB writes.
    static final LinkedBlockingQueue<Integer> uploadQueue = new LinkedBlockingQueue<>(10);
    // Flipped to false by main() once all results have been queued.
    volatile static boolean running = true;

    @SneakyThrows
    public static void main(String... args) {
        // Instantiated for its side effect of registering the MariaDB JDBC driver.
        Driver driver = new Driver();
        var conn = new DatabaseModule().provideConnection();
        long start = System.currentTimeMillis();
        var uploader = new Thread(() -> uploadThread(conn), "Uploader");
        logger.info("Ranking");
        var domains = new RankingDomainFetcherForSimilarityData(conn, new DomainBlacklistImpl(conn));
        // Seed domains for the personalized pagerank are hard-coded here.
        var rpr = new StandardPageRank(domains, "memex.marginalia.nu", "bikobatanari.art", "sadgrl.online", "wiki.xxiivv.com");
        rankMax = rpr.size();
        uploader.start();
        var rankData = rpr.pageRankWithPeripheralNodes(rankMax, RankingResultListAccumulator::new);
        // Feed ranked domain ids to the uploader in rank order; put() blocks when full.
        rankData.forEach(i -> {
            try {
                uploadQueue.put(i);
            } catch (InterruptedException e) {
                e.printStackTrace();
            }
            return true;
        });
        long end = System.currentTimeMillis();
        running = false;
        // NOTE(review): if the uploader drained the queue and is blocked in take()
        // when 'running' flips to false, this join() can hang -- verify shutdown path.
        uploader.join();
        logger.info("Done in {}", (end - start)/1000.0);
    }

    /** Consumer loop: resets all ranks to 1, then assigns each dequeued domain
     *  a rank of position/rankMax.  Since the queue is FIFO with a single
     *  producer, dequeue order equals the producer's rank order. */
    public static void uploadThread(HikariDataSource dataSource) {
        int i = 0;  // position in the ranking, incremented per processed domain
        try (var conn = dataSource.getConnection()) {
            logger.info("Resetting rank");
            try (var stmt = conn.prepareStatement("UPDATE EC_DOMAIN SET RANK=1")) {
                stmt.executeUpdate();
            }
            logger.info("Updating ranks");
            try (var stmt = conn.prepareStatement("UPDATE EC_DOMAIN SET RANK=? WHERE ID=?")) {
                // Keep consuming while the producer is active, then drain the remainder.
                while (running || (!running && !uploadQueue.isEmpty())) {
                    var job = uploadQueue.take();
                    stmt.setDouble(1, i++ / (double) rankMax);
                    stmt.setInt(2, job);
                    stmt.executeUpdate();
                }
            }
            logger.info("Recalculating quality");
        } catch (SQLException | InterruptedException throwables) {
            throwables.printStackTrace();
        }
    }
}

View File

@ -3,7 +3,6 @@ package nu.marginalia.browse;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import com.zaxxer.hikari.HikariDataSource;
import gnu.trove.set.hash.TIntHashSet;
import nu.marginalia.browse.model.BrowseResult;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.db.DomainBlacklist;

View File

@ -1,132 +0,0 @@
package nu.marginalia.browse;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import com.zaxxer.hikari.HikariDataSource;
import nu.marginalia.browse.model.BrowseResult;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.db.DomainBlacklist;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.sql.SQLException;
import java.util.*;
/** Legacy "similar domains" lookup for the browse feature.  Primarily reads
 *  precomputed neighbors from EC_DOMAIN_NEIGHBORS; when too few results are
 *  found it falls back to link-graph queries (domains linked from, then
 *  domains linking to, the given domain).  Blacklisted domains are filtered
 *  out of every stage. */
@Singleton
public class DbBrowseDomainsSimilarOldAlgo {
    private final Logger logger = LoggerFactory.getLogger(getClass());
    private final HikariDataSource dataSource;

    @Inject
    public DbBrowseDomainsSimilarOldAlgo(HikariDataSource dataSource) {
        this.dataSource = dataSource;
    }

    /** Returns up to 'count' browse results adjacent to 'domainId'.
     *  Ordering within the returned list is unspecified (backed by a HashSet). */
    public List<BrowseResult> getDomainNeighborsAdjacent(int domainId, DomainBlacklist blacklist, int count) {
        final Set<BrowseResult> domains = new HashSet<>(count*3);

        // Stage 1: precomputed neighbors, in adjacency-rank order.
        final String q = """
            SELECT EC_DOMAIN.ID AS NEIGHBOR_ID, DOMAIN_NAME, COUNT(*) AS CNT, INDEXED
            FROM EC_DOMAIN_NEIGHBORS
            INNER JOIN EC_DOMAIN ON NEIGHBOR_ID=EC_DOMAIN.ID
            INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
            INNER JOIN EC_DOMAIN_LINK ON DEST_DOMAIN_ID=EC_DOMAIN.ID
            WHERE
            STATE<2
            AND KNOWN_URLS<1000
            AND DOMAIN_ALIAS IS NULL
            AND EC_DOMAIN_NEIGHBORS.DOMAIN_ID = ?
            GROUP BY EC_DOMAIN.ID
            HAVING CNT < 100
            ORDER BY ADJ_IDX
            LIMIT ?
            """;

        try (var connection = dataSource.getConnection()) {
            try (var stmt = connection.prepareStatement(q)) {
                stmt.setFetchSize(count);
                stmt.setInt(1, domainId);
                stmt.setInt(2, count);
                var rsp = stmt.executeQuery();
                while (rsp.next()) {
                    int id = rsp.getInt(1);
                    String domain = rsp.getString(2);
                    if (!blacklist.isBlacklisted(id)) {
                        domains.add(new BrowseResult(new EdgeDomain(domain).toRootUrl(), id, 0, rsp.getBoolean("INDEXED")));
                    }
                }
            }

            // Stage 2 fallback: domains that 'domainId' links out to,
            // topping the set up to count/2 results.
            if (domains.size() < count/2) {
                final String q2 = """
                        SELECT EC_DOMAIN.ID, DOMAIN_NAME, INDEXED
                        FROM EC_DOMAIN
                        INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
                        INNER JOIN EC_DOMAIN_LINK B ON DEST_DOMAIN_ID=EC_DOMAIN.ID
                        INNER JOIN EC_DOMAIN_LINK O ON O.DEST_DOMAIN_ID=EC_DOMAIN.ID
                        WHERE B.SOURCE_DOMAIN_ID=?
                        AND STATE<2
                        AND KNOWN_URLS<1000
                        AND DOMAIN_ALIAS IS NULL
                        GROUP BY EC_DOMAIN.ID
                        HAVING COUNT(*) < 100 ORDER BY RANK ASC LIMIT ?""";
                try (var stmt = connection.prepareStatement(q2)) {
                    stmt.setFetchSize(count/2);
                    stmt.setInt(1, domainId);
                    // NOTE(review): count/2 - domains.size() can be zero or negative
                    // when the set filled up between the check and here is impossible,
                    // but it can be <= 0 at the boundary -- verify LIMIT 0 semantics.
                    stmt.setInt(2, count/2 - domains.size());
                    var rsp = stmt.executeQuery();
                    while (rsp.next() && domains.size() < count/2) {
                        int id = rsp.getInt(1);
                        String domain = rsp.getString(2);
                        if (!blacklist.isBlacklisted(id)) {
                            domains.add(new BrowseResult(new EdgeDomain(domain).toRootUrl(), id, 0, rsp.getBoolean("INDEXED")));
                        }
                    }
                }
            }

            // Stage 3 fallback: domains that link in to 'domainId'.
            if (domains.size() < count/2) {
                final String q3 = """
                    SELECT EC_DOMAIN.ID, DOMAIN_NAME, INDEXED
                    FROM EC_DOMAIN
                    INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
                    INNER JOIN EC_DOMAIN_LINK B ON B.SOURCE_DOMAIN_ID=EC_DOMAIN.ID
                    INNER JOIN EC_DOMAIN_LINK O ON O.DEST_DOMAIN_ID=EC_DOMAIN.ID
                    WHERE B.DEST_DOMAIN_ID=?
                    AND STATE<2
                    AND KNOWN_URLS<1000
                    AND DOMAIN_ALIAS IS NULL
                    GROUP BY EC_DOMAIN.ID
                    HAVING COUNT(*) < 100
                    ORDER BY RANK ASC
                    LIMIT ?""";
                try (var stmt = connection.prepareStatement(q3)) {
                    stmt.setFetchSize(count/2);
                    stmt.setInt(1, domainId);
                    stmt.setInt(2, count/2 - domains.size());
                    var rsp = stmt.executeQuery();
                    while (rsp.next() && domains.size() < count/2) {
                        int id = rsp.getInt(1);
                        String domain = rsp.getString(2);
                        if (!blacklist.isBlacklisted(id)) {
                            domains.add(new BrowseResult(new EdgeDomain(domain).toRootUrl(), id, 0, rsp.getBoolean("INDEXED")));
                        }
                    }
                }
            }
        } catch (SQLException throwables) {
            // Best-effort: on DB errors, return whatever was collected so far.
            throwables.printStackTrace();
        }

        return new ArrayList<>(domains);
    }
}

View File

@ -19,21 +19,20 @@ public class ProcessingIterator<T> implements Iterator<T> {
private final LinkedBlockingQueue<T> queue;
private final AtomicBoolean isFinished = new AtomicBoolean(false);
private final ExecutorService executorService;
private final Semaphore sem;
private final SimpleBlockingThreadPool pool;
private T next = null;
private final int parallelism;
public ProcessingIterator(int queueSize, int parallelism, ProcessingJob<T> task) {
this.parallelism = parallelism;
@SneakyThrows
ProcessingIterator(SimpleBlockingThreadPool pool, int queueSize, ProcessingJob<T> task) {
queue = new LinkedBlockingQueue<>(queueSize);
executorService = Executors.newFixedThreadPool(parallelism);
sem = new Semaphore(parallelism);
this.pool = pool;
executorService.submit(() -> executeJob(task));
pool.submit(() -> executeJob(task));
}
public static Factory factory(int queueSize, int parallelism) {
return new Factory(queueSize, parallelism);
}
private void executeJob(ProcessingJob<T> job) {
@ -46,20 +45,15 @@ public class ProcessingIterator<T> implements Iterator<T> {
}
}
@SneakyThrows
private void executeTask(Task<T> task) {
try {
sem.acquire();
} catch (InterruptedException e) {
return;
}
try {
queue.put(task.get());
} catch (Exception e) {
logger.warn("Exception while processing", e);
} finally {
sem.release();
}
pool.submit(() -> {
try {
queue.put(task.get());
} catch (Exception e) {
logger.warn("Exception while processing", e);
}
});
}
/** Returns true if there are more documents to be processed.
@ -75,16 +69,12 @@ public class ProcessingIterator<T> implements Iterator<T> {
return true;
do {
next = queue.poll(1, TimeUnit.SECONDS);
next = queue.poll(50, TimeUnit.MILLISECONDS);
if (next != null) {
return true;
}
} while (expectMore());
if (!executorService.isShutdown()) {
executorService.shutdown();
}
return false;
}
@ -95,7 +85,7 @@ public class ProcessingIterator<T> implements Iterator<T> {
private boolean expectMore() {
return !isFinished.get() // we are still reading from the database
|| !queue.isEmpty() // ... or we have documents in the queue
|| sem.availablePermits() < parallelism; // ... or we are still processing documents
|| pool.getActiveCount() > 0; // ... or we are still processing documents
}
/** Returns the next document to be processed.
@ -126,14 +116,32 @@ public class ProcessingIterator<T> implements Iterator<T> {
* performed in parallel
*/
public interface ProcessingJob<T2> {
void run(Consumer<Task<T2>> output) throws Exception;
}
/**
* A single task that produces a result to be iterable via the Iterator interface
* (along with other tasks' outputs)
*/
public interface Task<T> {
T get() throws Exception;
}
public static class Factory {
private final int queueSize;
private final SimpleBlockingThreadPool pool;
Factory(int queueSize, int parallelism) {
this.queueSize = queueSize;
this.pool = new SimpleBlockingThreadPool("sideload", parallelism, 4);
}
public <T> ProcessingIterator<T> create(ProcessingJob<T> task) {
return new ProcessingIterator<>(pool, queueSize, task);
}
}
}

View File

@ -3,6 +3,7 @@ package nu.marginalia.util;
import org.junit.jupiter.api.Test;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;
import java.util.concurrent.TimeUnit;
@ -14,7 +15,7 @@ class ProcessingIteratorTest {
@Test
public void test() {
Set<Integer> output = new HashSet<>();
var iter = new ProcessingIterator<Integer>(2, 2, q -> {
Iterator<Integer> iter = ProcessingIterator.factory(2, 2).create(q -> {
for (int i = 0; i < 10_000; i++) {
int j = i;
q.accept(() -> task(j));

View File

@ -42,6 +42,7 @@ public class SentenceExtractor {
* that might otherwise use an undue amount of processing power. 250 words is about 10X longer than
* this comment. */
private static final int MAX_SENTENCE_LENGTH = 250;
private static final int MAX_TEXT_LENGTH = 65536;
@SneakyThrows @Inject
public SentenceExtractor(LanguageModels models) {
@ -95,7 +96,7 @@ public class SentenceExtractor {
title = doc.getElementsByTag("h2").text();
}
if (title.trim().length() < 3 && textSentences.length > 0) {
if (title.trim().length() < 3) {
for (DocumentSentence textSentence : textSentences) {
if (textSentence.length() > 0) {
title = textSentence.originalSentence.toLowerCase();
@ -136,6 +137,7 @@ public class SentenceExtractor {
String[] sentences;
String textNormalizedSpaces = SentenceExtractorStringUtils.normalizeSpaces(text);
try {
sentences = sentenceDetector.sentDetect(textNormalizedSpaces);
}
@ -215,7 +217,12 @@ public class SentenceExtractor {
public String asText(Document dc) {
String text = dc.getElementsByTag("body").text();
return text.substring(0, (int) (text.length()*0.95));
if (text.length() > MAX_TEXT_LENGTH) {
return text.substring(0, MAX_TEXT_LENGTH);
}
else {
return text.substring(0, (int) (text.length() * 0.95));
}
}

View File

@ -1,7 +1,8 @@
package nu.marginalia.crawling.io;
import com.google.gson.Gson;
import nu.marginalia.crawling.io.format.LegacySerializableCrawlDataStream;
import nu.marginalia.crawling.io.format.CompatibleLegacySerializableCrawlDataStream;
import nu.marginalia.crawling.io.format.FastLegacySerializableCrawlDataStream;
import nu.marginalia.crawling.io.format.ParquetSerializableCrawlDataStream;
import nu.marginalia.model.gson.GsonFactory;
@ -15,11 +16,24 @@ public class CrawledDomainReader {
public CrawledDomainReader() {
}
public enum CompatibilityLevel {
/** Data order emulates the ordering of the new format. This is slower */
COMPATIBLE,
/** Data order is not compatible with the new format, but the data itself is */
FAST,
/** Alias for FAST */
ANY
}
/** An iterator-like access to domain data. This must be closed, otherwise it will leak off-heap memory! */
public static SerializableCrawlDataStream createDataStream(Path fullPath) throws IOException {
public static SerializableCrawlDataStream createDataStream(CompatibilityLevel compatibilityLevel,
Path fullPath) throws IOException
{
String fileName = fullPath.getFileName().toString();
if (fileName.endsWith(".zstd")) {
return new LegacySerializableCrawlDataStream(gson, fullPath.toFile());
if (compatibilityLevel == CompatibilityLevel.COMPATIBLE)
return new CompatibleLegacySerializableCrawlDataStream(gson, fullPath.toFile());
else // if (compatibilityLevel == CompatibilityLevel.FAST or ANY)
return new FastLegacySerializableCrawlDataStream(gson, fullPath.toFile());
}
else if (fileName.endsWith(".parquet")) {
return new ParquetSerializableCrawlDataStream(fullPath);
@ -30,14 +44,14 @@ public class CrawledDomainReader {
}
/** An iterator-like access to domain data. This must be closed otherwise it will leak off-heap memory! */
public static SerializableCrawlDataStream createDataStream(Path basePath, String domain, String id) throws IOException {
public static SerializableCrawlDataStream createDataStream(CompatibilityLevel level, Path basePath, String domain, String id) throws IOException {
Path parquetPath = CrawlerOutputFile.getParquetPath(basePath, id, domain);
if (Files.exists(parquetPath)) {
return createDataStream(parquetPath);
return createDataStream(level, parquetPath);
}
else {
return createDataStream(CrawlerOutputFile.getLegacyOutputFile(basePath, id, domain));
return createDataStream(level, CrawlerOutputFile.getLegacyOutputFile(basePath, id, domain));
}
}

View File

@ -17,6 +17,10 @@ public interface SerializableCrawlDataStream extends AutoCloseable {
SerializableCrawlData next() throws IOException;
/** Return a size hint for the stream. 0 is returned if the hint is not available,
* or if the file is deemed too small to bother */
default int sizeHint() { return 0; }
boolean hasNext() throws IOException;
@Nullable

View File

@ -0,0 +1,107 @@
package nu.marginalia.crawling.io.format;
import com.github.luben.zstd.RecyclingBufferPool;
import com.github.luben.zstd.ZstdInputStream;
import com.google.gson.Gson;
import nu.marginalia.crawling.io.SerializableCrawlDataStream;
import nu.marginalia.crawling.model.CrawledDocument;
import nu.marginalia.crawling.model.CrawledDomain;
import nu.marginalia.crawling.model.SerializableCrawlData;
import java.io.*;
import java.nio.file.Path;
import java.util.Objects;
import static java.util.Objects.*;
/** This class is used to read the old format of crawl data, which was zstd-compressed JSON
* with type delimiters between records. It does its best to preserve the semantics of the
* new format. This is slow.
*/
public class CompatibleLegacySerializableCrawlDataStream implements AutoCloseable, SerializableCrawlDataStream {
private final Gson gson;
// Sequential reader over the zstd-compressed, line-delimited JSON records
private final BufferedReader bufferedReader;
// The domain record, pre-scanned from the file so it can be emitted first
// (the new format emits the domain before its documents; the old format
// wrote it last)
private CrawledDomain domain;
// Look-ahead buffer: the next document record, populated by hasNext()
private SerializableCrawlData next;
private final Path path;
/** Open a legacy crawl-data file for reading.
 * Note: the file is scanned once up front (via findDomain) to locate the
 * domain record, then re-opened for the main read pass. */
public CompatibleLegacySerializableCrawlDataStream(Gson gson, File file) throws IOException {
this.gson = gson;
path = file.toPath();
domain = findDomain(file);
bufferedReader = new BufferedReader(new InputStreamReader(new ZstdInputStream(new FileInputStream(file), RecyclingBufferPool.INSTANCE)));
}
/** Scan through the file and find the domain record */
private CrawledDomain findDomain(File file) throws IOException {
try (var br = new BufferedReader(new InputStreamReader(new ZstdInputStream(new FileInputStream(file), RecyclingBufferPool.INSTANCE)))) {
for (;;) {
// Records are stored as pairs of lines: a type identifier followed
// by a JSON payload; a malformed (odd-length) file fails fast here
String identifierLine =
requireNonNull(br.readLine(), "No identifier line found");
String dataLine =
requireNonNull(br.readLine(), "No data line found");
if (identifierLine.equals(CrawledDomain.SERIAL_IDENTIFIER)) {
return gson.fromJson(dataLine, CrawledDomain.class);
}
}
}
}
@Override
public Path path() {
return path;
}
/** Return the next record; the pre-scanned domain record is always emitted
 * first, matching the ordering of the new (parquet) format.
 * @throws IllegalStateException if called when no more data is available */
@Override
public SerializableCrawlData next() throws IOException {
if (hasNext()) {
if (domain != null) {
var ret = domain;
domain = null;
return ret;
}
else {
var ret = next;
next = null;
return ret;
}
}
throw new IllegalStateException("No more data");
}
/** Advance the look-ahead buffer if needed. Returns false at end of file,
 * on a truncated record, or when the trailing domain record is reached
 * (the domain was already emitted up front and must not be re-emitted). */
@Override
public boolean hasNext() throws IOException {
if (domain != null || next != null) {
return true;
}
String identifier = bufferedReader.readLine();
if (identifier == null) {
bufferedReader.close();
return false;
}
String data = bufferedReader.readLine();
if (data == null) {
bufferedReader.close();
return false;
}
if (identifier.equals(CrawledDomain.SERIAL_IDENTIFIER)) {
next = null;
return false; // last record is expected to be the domain, so we're done
} else if (identifier.equals(CrawledDocument.SERIAL_IDENTIFIER)) {
next = gson.fromJson(data, CrawledDocument.class);
} else {
throw new IllegalStateException("Unknown identifier: " + identifier);
}
return true;
}
@Override
public void close() throws Exception {
bufferedReader.close();
}
}

View File

@ -12,15 +12,16 @@ import java.io.*;
import java.nio.file.Path;
/** This class is used to read the old format of crawl data, which was zstd-compressed JSON
* with type delimiters between records.
* with type delimiters between records. It does not preserve the semantics of the new format,
* but it is faster.
*/
public class LegacySerializableCrawlDataStream implements AutoCloseable, SerializableCrawlDataStream {
public class FastLegacySerializableCrawlDataStream implements AutoCloseable, SerializableCrawlDataStream {
private final Gson gson;
private final BufferedReader bufferedReader;
private SerializableCrawlData next = null;
private final Path path;
public LegacySerializableCrawlDataStream(Gson gson, File file) throws IOException {
public FastLegacySerializableCrawlDataStream(Gson gson, File file) throws IOException {
this.gson = gson;
bufferedReader = new BufferedReader(new InputStreamReader(new ZstdInputStream(new FileInputStream(file), RecyclingBufferPool.INSTANCE)));
path = file.toPath();

View File

@ -14,6 +14,7 @@ import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.net.URISyntaxException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.*;
@ -37,6 +38,21 @@ public class ParquetSerializableCrawlDataStream implements AutoCloseable, Serial
return path;
}
/** Best-effort document count for this stream; returns 0 when no hint is available.
 * The hint is only computed for files larger than 10 MB, since its purpose is to
 * distinguish a file that is large because it holds many documents from one that
 * holds just a few very large documents. */
public int sizeHint() {
final long minSizeForHint = 10_000_000;
try {
long fileSize = Files.size(path);
if (fileSize > minSizeForHint) {
return CrawledDocumentParquetRecordFileReader.countGoodStatusCodes(path);
}
}
catch (IOException e) {
// best effort only -- fall through and report no hint
}
return 0;
}
@Override
@SneakyThrows
public boolean hasNext() {
@ -46,7 +62,13 @@ public class ParquetSerializableCrawlDataStream implements AutoCloseable, Serial
createDomainRecord(nextRecord);
wroteDomainRecord = true;
}
createDocumentRecord(nextRecord);
try {
createDocumentRecord(nextRecord);
}
catch (Exception ex) {
logger.error("Failed to create document record", ex);
}
}
return !nextQ.isEmpty();
}
@ -94,7 +116,7 @@ public class ParquetSerializableCrawlDataStream implements AutoCloseable, Serial
else if (nextRecord.contentType.startsWith("x-marginalia/advisory")) { // other advisory stuff we don't want
return;
}
else {
else if (nextRecord.body != null) {
try {
bodyString = DocumentBodyToString.getStringData(
ContentType.parse(nextRecord.contentType),
@ -104,6 +126,9 @@ public class ParquetSerializableCrawlDataStream implements AutoCloseable, Serial
status = CrawlerDocumentStatus.BAD_CHARSET;
}
}
else {
status = CrawlerDocumentStatus.ERROR;
}
nextQ.add(new CrawledDocument("",
nextRecord.url,

View File

@ -1,11 +1,13 @@
package nu.marginalia.crawling.parquet;
import blue.strategic.parquet.Hydrator;
import blue.strategic.parquet.HydratorSupplier;
import blue.strategic.parquet.ParquetReader;
import org.jetbrains.annotations.NotNull;
import java.io.IOException;
import java.nio.file.Path;
import java.util.List;
import java.util.stream.Stream;
public class CrawledDocumentParquetRecordFileReader {
@ -16,4 +18,25 @@ public class CrawledDocumentParquetRecordFileReader {
HydratorSupplier.constantly(CrawledDocumentParquetRecord.newHydrator()));
}
/** Count the number of documents with a 200 status code.
 * <p>
 * Only the statusCode column is projected, so the hydrator receives exactly one
 * value per record and maps it to 1 for a 200 status and 0 otherwise. The
 * per-record values must therefore be summed -- counting the stream's elements
 * would return the total number of records, not the number of good documents.
 */
public static int countGoodStatusCodes(Path path) throws IOException {
return ParquetReader.streamContent(path.toFile(),
HydratorSupplier.constantly(new Hydrator<Integer, Integer>() {
@Override
public Integer start() { return 0; }
@Override
public Integer add(Integer target, String heading, Object value) {
if ("statusCode".equals(heading) && Integer.valueOf(200).equals(value)) {
return 1;
}
return 0;
}
@Override
public Integer finish(Integer target) { return target; }
}),
List.of("statusCode"))
.mapToInt(Integer::valueOf)
.sum();
}
}

View File

@ -95,7 +95,7 @@ public class CrawlPlan {
}
try {
return Optional.of(CrawledDomainReader.createDataStream(path));
return Optional.of(CrawledDomainReader.createDataStream(CrawledDomainReader.CompatibilityLevel.COMPATIBLE, path));
}
catch (IOException ex) {
return Optional.empty();

View File

@ -6,9 +6,9 @@ import com.google.inject.Inject;
import com.google.inject.Injector;
import nu.marginalia.ProcessConfiguration;
import nu.marginalia.ProcessConfigurationModule;
import nu.marginalia.converting.model.ProcessedDomain;
import nu.marginalia.converting.sideload.SideloadSource;
import nu.marginalia.converting.sideload.SideloadSourceFactory;
import nu.marginalia.converting.writer.ConverterBatchWritableIf;
import nu.marginalia.converting.writer.ConverterBatchWriter;
import nu.marginalia.converting.writer.ConverterWriter;
import nu.marginalia.storage.FileStorageService;
@ -46,7 +46,6 @@ public class ConverterMain {
private final MessageQueueFactory messageQueueFactory;
private final FileStorageService fileStorageService;
private final SideloadSourceFactory sideloadSourceFactory;
private final int node;
public static void main(String... args) throws Exception {
@ -109,7 +108,7 @@ public class ConverterMain {
taskHeartbeat.progress(sideloadSource.domainName(), i++, sideloadSources.size());
writer.write(sideloadSource);
writer.writeSideloadSource(sideloadSource);
}
taskHeartbeat.progress("Finished", i, sideloadSources.size());
@ -139,8 +138,8 @@ public class ConverterMain {
{
pool.submit(() -> {
try {
ProcessedDomain processed = processor.process(domain);
converterWriter.accept(processed);
ConverterBatchWritableIf writable = processor.createWritable(domain);
converterWriter.accept(writable);
}
catch (Exception ex) {
logger.info("Error in processing", ex);

View File

@ -1,15 +1,18 @@
package nu.marginalia.converting.model;
import lombok.ToString;
import nu.marginalia.converting.writer.ConverterBatchWritableIf;
import nu.marginalia.converting.writer.ConverterBatchWriter;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.crawl.DomainIndexingState;
import org.jetbrains.annotations.Nullable;
import java.io.IOException;
import java.util.List;
import java.util.Optional;
@ToString
public class ProcessedDomain {
public class ProcessedDomain implements ConverterBatchWritableIf {
public EdgeDomain domain;
public List<ProcessedDocument> documents;
@ -26,4 +29,17 @@ public class ProcessedDomain {
public int size() {
return Optional.ofNullable(documents).map(List::size).orElse(1);
}
@Override
public void write(ConverterBatchWriter writer) throws IOException {
writer.writeProcessedDomain(this);
}
@Override
public String id() {
return domain.toString();
}
@Override
public void close() {}
}

View File

@ -0,0 +1,31 @@
package nu.marginalia.converting.processor;
import nu.marginalia.atags.AnchorTextKeywords;
import nu.marginalia.atags.model.DomainLinks;
import nu.marginalia.converting.model.ProcessedDocument;
import java.util.HashSet;
import java.util.Set;
/** Accumulates domain-level synthetic search terms and applies them,
 * together with anchor-text keywords, to individual processed documents. */
public class DocumentDecorator {
private final Set<String> syntheticTerms = new HashSet<>();
private final AnchorTextKeywords keywords;
public DocumentDecorator(AnchorTextKeywords keywords) {
this.keywords = keywords;
}
/** Register an additional synthetic search term to be added to every decorated document. */
public void addTerm(String term) {
syntheticTerms.add(term);
}
/** Attach the accumulated synthetic terms and any anchor-text keywords to the document.
 * A null document, or one without an extracted keyword set, is left untouched. */
public void apply(ProcessedDocument doc, DomainLinks externalDomainLinks) {
if (doc == null || doc.words == null) {
return;
}
doc.words.addAllSyntheticTerms(syntheticTerms);
doc.words.addAnchorTerms(keywords.getAnchorTextKeywords(externalDomainLinks, doc.url));
}
}

View File

@ -4,6 +4,8 @@ import com.google.inject.Inject;
import nu.marginalia.atags.model.DomainLinks;
import nu.marginalia.crawling.model.CrawledDocument;
import nu.marginalia.crawling.model.CrawlerDocumentStatus;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.crawl.HtmlFeature;
import nu.marginalia.model.crawl.UrlIndexingState;
import nu.marginalia.converting.model.DisqualifiedException;
import nu.marginalia.converting.model.ProcessedDocument;
@ -38,20 +40,29 @@ public class DocumentProcessor {
processorPlugins.add(plainTextDocumentProcessorPlugin);
}
public ProcessedDocument process(CrawledDocument crawledDocument, DomainLinks externalDomainLinks) {
public ProcessedDocument process(CrawledDocument crawledDocument,
EdgeDomain domain,
DomainLinks externalDomainLinks,
DocumentDecorator documentDecorator) {
ProcessedDocument ret = new ProcessedDocument();
try {
// We must always provide the URL, even if we don't process the document
ret.url = getDocumentUrl(crawledDocument);
if (!Objects.equals(ret.url.domain, domain)) {
ret.state = UrlIndexingState.DISQUALIFIED;
ret.stateReason = DisqualifiedException.DisqualificationReason.PROCESSING_EXCEPTION.toString();
return ret;
}
DocumentClass documentClass = switch (externalDomainLinks.countForUrl(ret.url)) {
case 0 -> DocumentClass.NORMAL;
case 1 -> DocumentClass.EXTERNALLY_LINKED_ONCE;
default -> DocumentClass.EXTERNALLY_LINKED_MULTI;
};
processDocument(crawledDocument, documentClass, ret);
processDocument(crawledDocument, documentClass, documentDecorator, externalDomainLinks, ret);
}
catch (DisqualifiedException ex) {
ret.state = UrlIndexingState.DISQUALIFIED;
@ -67,7 +78,7 @@ public class DocumentProcessor {
return ret;
}
private void processDocument(CrawledDocument crawledDocument, DocumentClass documentClass, ProcessedDocument ret) throws URISyntaxException, DisqualifiedException {
private void processDocument(CrawledDocument crawledDocument, DocumentClass documentClass, DocumentDecorator documentDecorator, DomainLinks externalDomainLinks, ProcessedDocument ret) throws URISyntaxException, DisqualifiedException {
var crawlerStatus = CrawlerDocumentStatus.valueOf(crawledDocument.crawlerStatus);
if (crawlerStatus != CrawlerDocumentStatus.OK) {
@ -90,6 +101,16 @@ public class DocumentProcessor {
ret.details = detailsWithWords.details();
ret.words = detailsWithWords.words();
documentDecorator.apply(ret, externalDomainLinks);
if (Boolean.TRUE.equals(crawledDocument.hasCookies)
&& ret.details != null
&& ret.details.features != null)
{
ret.details.features.add(HtmlFeature.COOKIES);
}
}
private AbstractDocumentProcessorPlugin findPlugin(CrawledDocument crawledDocument) throws DisqualifiedException {

View File

@ -8,6 +8,9 @@ import nu.marginalia.atags.source.AnchorTagsSource;
import nu.marginalia.atags.source.AnchorTagsSourceFactory;
import nu.marginalia.converting.model.ProcessedDocument;
import nu.marginalia.converting.processor.logic.links.LinkGraph;
import nu.marginalia.converting.sideload.SideloadSource;
import nu.marginalia.converting.writer.ConverterBatchWritableIf;
import nu.marginalia.converting.writer.ConverterBatchWriter;
import nu.marginalia.crawling.io.SerializableCrawlDataStream;
import nu.marginalia.crawling.model.*;
import nu.marginalia.geoip.GeoIpDictionary;
@ -17,12 +20,13 @@ import nu.marginalia.converting.model.ProcessedDomain;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.converting.processor.logic.links.TopKeywords;
import nu.marginalia.converting.processor.logic.LshDocumentDeduplicator;
import nu.marginalia.model.crawl.HtmlFeature;
import nu.marginalia.util.ProcessingIterator;
import org.apache.commons.lang3.StringUtils;
import org.jetbrains.annotations.Nullable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.sql.SQLException;
import java.util.*;
import java.util.regex.Pattern;
@ -32,7 +36,6 @@ public class DomainProcessor {
private final SiteWords siteWords;
private final AnchorTagsSource anchorTagsSource;
private final AnchorTextKeywords anchorTextKeywords;
private final LshDocumentDeduplicator documentDeduplicator;
private final GeoIpDictionary geoIpDictionary;
private final Logger logger = LoggerFactory.getLogger(getClass());
@ -42,78 +45,161 @@ public class DomainProcessor {
SiteWords siteWords,
AnchorTagsSourceFactory anchorTagsSourceFactory,
AnchorTextKeywords anchorTextKeywords,
LshDocumentDeduplicator documentDeduplicator, GeoIpDictionary geoIpDictionary) throws SQLException
GeoIpDictionary geoIpDictionary) throws SQLException
{
this.documentProcessor = documentProcessor;
this.siteWords = siteWords;
this.anchorTextKeywords = anchorTextKeywords;
this.documentDeduplicator = documentDeduplicator;
this.anchorTagsSource = anchorTagsSourceFactory.create();
this.geoIpDictionary = geoIpDictionary;
geoIpDictionary.waitReady();
}
public ConverterBatchWritableIf createWritable(SerializableCrawlDataStream domain) {
final int sizeHint = domain.sizeHint();
if (sizeHint > 10_000) {
// If the file is too big, we run a processing mode that doesn't
// require loading the entire dataset into RAM
return sideloadProcessing(domain, sizeHint);
}
return fullProcessing(domain);
}
public SideloadProcessing sideloadProcessing(SerializableCrawlDataStream dataStream, int sizeHint) {
try {
return new SideloadProcessing(dataStream, sizeHint);
}
catch (Exception ex) {
logger.warn("Failed to process domain sideload", ex);
return null;
}
}
public class SideloadProcessing implements ConverterBatchWritableIf, SideloadSource {
private final SerializableCrawlDataStream dataStream;
private final ProcessedDomain domain;
private final DocumentDecorator documentDecorator;
private final Set<String> processedUrls = new HashSet<>();
private final DomainLinks externalDomainLinks;
private final LshDocumentDeduplicator deduplicator = new LshDocumentDeduplicator();
private static final ProcessingIterator.Factory iteratorFactory = ProcessingIterator.factory(8,
Integer.getInteger("java.util.concurrent.ForkJoinPool.common.parallelism", Runtime.getRuntime().availableProcessors())
);
SideloadProcessing(SerializableCrawlDataStream dataStream, int sizeHint) throws IOException {
this.dataStream = dataStream;
if (!dataStream.hasNext() || !(dataStream.next() instanceof CrawledDomain crawledDomain))
{
throw new IllegalStateException("First record must be a domain, was " + dataStream.next().getClass().getSimpleName());
}
domain = new ProcessedDomain();
domain.sizeloadSizeAdvice = sizeHint == 0 ? 10_000 : sizeHint;
documentDecorator = new DocumentDecorator(anchorTextKeywords);
processDomain(crawledDomain, domain, documentDecorator);
externalDomainLinks = anchorTagsSource.getAnchorTags(domain.domain);
}
@Override
public ProcessedDomain getDomain() {
return domain;
}
@Override
public Iterator<ProcessedDocument> getDocumentsStream() {
return iteratorFactory.create((taskConsumer) -> {
while (dataStream.hasNext())
{
if (!(dataStream.next() instanceof CrawledDocument doc))
continue;
if (doc.url == null || !processedUrls.add(doc.url))
continue;
taskConsumer.accept(() -> {
var processedDoc = documentProcessor.process(doc, domain.domain, externalDomainLinks, documentDecorator);
synchronized (deduplicator) {
deduplicator.markIfDuplicate(processedDoc);
}
if (processedDoc.isProcessedFully()) {
// This is a bit sketchy, but we need to set the size and topology to something
processedDoc.details.metadata = processedDoc.details.metadata.withSizeAndTopology(
10_000, externalDomainLinks.countForUrl(processedDoc.url));
}
return processedDoc;
});
}
});
}
@Override
public void write(ConverterBatchWriter writer) throws IOException {
writer.writeSideloadSource(this);
}
@Override
public String id() {
return domain.domain.toString();
}
@Override
public void close() throws Exception {
dataStream.close();
deduplicator.close();
}
}
@SneakyThrows
@Nullable
public ProcessedDomain process(SerializableCrawlDataStream dataStream) {
public ProcessedDomain fullProcessing(SerializableCrawlDataStream dataStream) {
if (!dataStream.hasNext()) {
return null;
}
var ret = new ProcessedDomain();
List<ProcessedDocument> docs = new ArrayList<>();
Set<String> processedUrls = new HashSet<>();
boolean cookies = false;
String ip = "";
if (!(dataStream.next() instanceof CrawledDomain crawledDomain)) {
throw new IllegalStateException("First record must be a domain, was " + dataStream.next().getClass().getSimpleName());
}
DomainLinks externalDomainLinks = null;
DomainLinks externalDomainLinks = anchorTagsSource.getAnchorTags(crawledDomain.getDomain());
DocumentDecorator documentDecorator = new DocumentDecorator(anchorTextKeywords);
while (dataStream.hasNext()) {
var data = dataStream.next();
// Process Domain Record
// Do a lazy load of the external domain links since we don't know the domain
// until we see the first document
if (externalDomainLinks == null) {
var domain = data.getDomain();
ProcessedDomain ret = new ProcessedDomain();
processDomain(crawledDomain, ret, documentDecorator);
ret.documents = docs;
if (domain != null) {
externalDomainLinks = anchorTagsSource.getAnchorTags(domain);
}
}
// Process Documents
if (data instanceof CrawledDomain crawledDomain) {
ret.domain = new EdgeDomain(crawledDomain.domain);
ret.ip = crawledDomain.ip;
try (var deduplicator = new LshDocumentDeduplicator()) {
while (dataStream.hasNext()) {
if (!(dataStream.next() instanceof CrawledDocument doc))
continue;
if (doc.url == null)
continue;
if (!processedUrls.add(doc.url))
continue;
cookies = crawledDomain.hasCookies();
ip = crawledDomain.ip;
if (crawledDomain.redirectDomain != null) {
ret.redirect = new EdgeDomain(crawledDomain.redirectDomain);
}
ret.documents = docs;
ret.state = getState(crawledDomain.crawlerStatus);
}
else if (data instanceof CrawledDocument doc) {
try {
if (doc.url == null || !processedUrls.add(doc.url))
continue;
if (Boolean.TRUE.equals(doc.hasCookies)) {
cookies = true;
}
// This case should never be reachable, as we should have initiated
// the externalDomainLinks variable above if we made it past the
// doc.url == null check; but we'll leave it here just in case
// to make debugging easier if we break this.
assert externalDomainLinks != null : "externalDomainLinks has not been initialized";
docs.add(documentProcessor.process(doc, externalDomainLinks));
}
catch (Exception ex) {
var processedDoc = documentProcessor.process(doc, ret.domain, externalDomainLinks, documentDecorator);
deduplicator.markIfDuplicate(processedDoc);
docs.add(processedDoc);
} catch (Exception ex) {
logger.warn("Failed to process " + doc.url, ex);
}
}
@ -121,57 +207,50 @@ public class DomainProcessor {
// Add late keywords and features from domain-level information
List<String> terms = new ArrayList<>();
addIpInfo(terms, ip);
if (cookies) {
terms.add(HtmlFeature.COOKIES.getKeyword());
}
if (isAcademicDomain(ret.domain)) {
terms.add("special:academia");
}
for (var document : ret.documents) {
if (document.details == null)
continue;
if (cookies) {
document.details.features.add(HtmlFeature.COOKIES);
}
document.words.addAllSyntheticTerms(terms);
document.words.addAnchorTerms(
anchorTextKeywords.getAnchorTextKeywords(externalDomainLinks, document.url)
);
}
documentDeduplicator.deduplicate(ret.documents);
calculateStatistics(ret, externalDomainLinks);
return ret;
}
private void addIpInfo(List<String> terms, String ip) {
terms.add("ip:"+ip);
private void processDomain(CrawledDomain crawledDomain,
ProcessedDomain domain,
DocumentDecorator decorator)
{
domain.domain = new EdgeDomain(crawledDomain.domain);
domain.ip = crawledDomain.ip;
addIpInfo(decorator, crawledDomain.ip);
if (isAcademicDomain(domain.domain)) {
decorator.addTerm("special:academia");
}
if (crawledDomain.redirectDomain != null) {
domain.redirect = new EdgeDomain(crawledDomain.redirectDomain);
}
domain.state = getState(crawledDomain.crawlerStatus);
}
private void addIpInfo(DocumentDecorator decorator, String ip) {
decorator.addTerm("ip:"+ip);
// Add IP location country as a term
String country = geoIpDictionary.getCountry(ip);
if (!country.isBlank()) { // use the ip:-prefix as there's no real confusion between e.g. ip:127.0.0.1 and ip:uk
terms.add("ip:"+country.toLowerCase());
decorator.addTerm("ip:"+country.toLowerCase());
}
// Add ASN as a term
geoIpDictionary.getAsnInfo(ip).ifPresent(asnInfo -> {
terms.add("as:"+asnInfo.asn());
decorator.addTerm("as:"+asnInfo.asn());
for (var orgPart : StringUtils.split(asnInfo.org(), '-')) {
terms.add("as:"+orgPart.toLowerCase());
decorator.addTerm("as:"+orgPart.toLowerCase());
}
if (isCloudy(asnInfo)) {
terms.add("special:cloud");
decorator.addTerm("special:cloud");
}
});
@ -251,4 +330,5 @@ public class DomainProcessor {
};
}
}

View File

@ -1,74 +1,43 @@
package nu.marginalia.converting.processor.logic;
import com.google.inject.Singleton;
import gnu.trove.list.array.TLongArrayList;
import nu.marginalia.model.crawl.UrlIndexingState;
import nu.marginalia.converting.model.ProcessedDocument;
import nu.marginalia.lsh.EasyLSH;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.List;
/** Deduplicates documents based on their LSH
*
* @see EasyLSH
*/
@Singleton
public class LshDocumentDeduplicator {
public class LshDocumentDeduplicator implements AutoCloseable {
private final int DISTANCE_THRESHOLD = 2;
private final Logger logger = LoggerFactory.getLogger(getClass());
private final TLongArrayList hashCodes = new TLongArrayList(1000);
private static final int DISTANCE_THRESHOLD = 2;
public void deduplicate(List<ProcessedDocument> documents) {
ProcessedDocument[] goodDocuments = documents.stream()
.filter(ProcessedDocument::isProcessedFully)
.filter(doc -> doc.words.size() > 100)
.toArray(ProcessedDocument[]::new);
long[] hashCodes = new long[goodDocuments.length];
for (int i = 0; i < goodDocuments.length; i++) {
hashCodes[i] = goodDocuments[i].details.hashCode;
public void markIfDuplicate(ProcessedDocument document) {
if (!document.isProcessedFully()) {
return;
}
// These arrays can be fairly large (~10,000) so we need to be
// careful about what we do in this O(n^2) loop
if (document.words.size() < 100) {
return;
}
for (int i = 0; i < hashCodes.length; i++) {
for (int j = 0; j < hashCodes.length; j++) {
// This is basically just a 64 bit XOR and a POPCOUNT so it's pretty fast.
if (EasyLSH.hammingDistance(hashCodes[i], hashCodes[j]) < DISTANCE_THRESHOLD) {
if (i == j)
continue;
long hashCode = document.details.hashCode;
if (flagIfDuplicate(goodDocuments[i], goodDocuments[j])) {
break;
}
}
for (int i = 0; i < hashCodes.size(); i++) {
if (EasyLSH.hammingDistance(hashCode, hashCodes.get(i)) < DISTANCE_THRESHOLD) {
document.state = UrlIndexingState.DISQUALIFIED;
document.stateReason = "Duplicate";
return;
}
}
hashCodes.add(hashCode);
}
private boolean flagIfDuplicate(ProcessedDocument thisDoc, ProcessedDocument otherDoc) {
// This document has already been disqualified as a duplicate
if (thisDoc.state != UrlIndexingState.OK)
return false;
// We might consider using thisDoc.details.metadata.topology() here instead of the
// URL length to determine which document is the "better" one.
if (thisDoc.url.path.length()
< otherDoc.url.path.length())
{
logger.debug("{} duplicates {}", otherDoc.url, thisDoc.url);
otherDoc.state = UrlIndexingState.DISQUALIFIED;
otherDoc.stateReason = "Duplicate";
return true;
}
return false;
@Override
public void close() throws Exception {
hashCodes.clear(1);
}
}

View File

@ -76,7 +76,8 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoC
@SneakyThrows
@Override
public Iterator<ProcessedDocument> getDocumentsStream() {
return new ProcessingIterator<>(24, 16, (taskConsumer) -> {
// This leaks a thread pool, but it doesn't matter since this is a one-off process
return ProcessingIterator.factory(24, 16).create((taskConsumer) -> {
DomainLinks domainLinks = getDomainLinks();
var stmt = connection.prepareStatement("""

View File

@ -0,0 +1,9 @@
package nu.marginalia.converting.writer;
import java.io.IOException;
/** A unit of converted output that knows how to serialize itself through a
 * ConverterBatchWriter (double-dispatch target for the converter's write path).
 * Instances must be closed after writing to release any resources they hold. */
public interface ConverterBatchWritableIf {
/** Serialize this object using the provided writer. */
void write(ConverterBatchWriter writer) throws IOException;
/** A stable identifier for this unit of work -- e.g. the domain name. */
String id();
/** Release any resources held by this writable. */
void close() throws Exception;
}

View File

@ -27,7 +27,7 @@ import java.util.concurrent.ForkJoinPool;
import java.util.concurrent.Future;
/** Writer for a single batch of converter parquet files */
public class ConverterBatchWriter implements AutoCloseable {
public class ConverterBatchWriter implements AutoCloseable, ConverterBatchWriterIf {
private final DomainRecordParquetFileWriter domainWriter;
private final DomainLinkRecordParquetFileWriter domainLinkWriter;
private final DocumentRecordParquetFileWriter documentWriter;
@ -46,7 +46,13 @@ public class ConverterBatchWriter implements AutoCloseable {
);
}
public void write(SideloadSource sideloadSource) throws IOException {
@Override
public void write(ConverterBatchWritableIf writable) throws IOException {
writable.write(this);
}
@Override
public void writeSideloadSource(SideloadSource sideloadSource) throws IOException {
var domain = sideloadSource.getDomain();
writeDomainData(domain);
@ -54,7 +60,8 @@ public class ConverterBatchWriter implements AutoCloseable {
writeDocumentData(domain.domain, sideloadSource.getDocumentsStream());
}
public void write(ProcessedDomain domain) {
@Override
public void writeProcessedDomain(ProcessedDomain domain) {
var results = ForkJoinPool.commonPool().invokeAll(
writeTasks(domain)
);
@ -180,7 +187,7 @@ public class ConverterBatchWriter implements AutoCloseable {
return this;
}
private Object writeDomainData(ProcessedDomain domain) throws IOException {
public Object writeDomainData(ProcessedDomain domain) throws IOException {
DomainMetadata metadata = DomainMetadata.from(domain);
List<String> feeds = getFeedUrls(domain);

View File

@ -0,0 +1,15 @@
package nu.marginalia.converting.writer;
import nu.marginalia.converting.model.ProcessedDomain;
import nu.marginalia.converting.sideload.SideloadSource;
import java.io.IOException;
public interface ConverterBatchWriterIf {
void write(ConverterBatchWritableIf writable) throws IOException;
void writeSideloadSource(SideloadSource sideloadSource) throws IOException;
void writeProcessedDomain(ProcessedDomain domain);
}

View File

@ -24,7 +24,7 @@ public class ConverterWriter implements AutoCloseable {
private final Duration switchInterval
= Duration.of(10, ChronoUnit.MINUTES);
private final ArrayBlockingQueue<ProcessedDomain> domainData
private final ArrayBlockingQueue<ConverterBatchWritableIf> domainData
= new ArrayBlockingQueue<>(1);
private final Thread workerThread;
@ -42,7 +42,7 @@ public class ConverterWriter implements AutoCloseable {
}
@SneakyThrows
public void accept(@Nullable ProcessedDomain domain) {
public void accept(@Nullable ConverterBatchWritableIf domain) {
if (null == domain)
return;
@ -66,10 +66,11 @@ public class ConverterWriter implements AutoCloseable {
if (data == null)
continue;
String id = data.domain.toString();
String id = data.id();
if (workLog.isItemCommitted(id) || workLog.isItemInCurrentBatch(id)) {
logger.warn("Skipping already logged item {}", id);
data.close();
continue;
}

View File

@ -43,7 +43,7 @@ public class ConvertingIntegrationTest {
var domain = new CrawledDomain("memex.marginalia.nu", null, "OK", "-", "127.0.0.1",
docs, Collections.emptyList());
var ret = domainProcessor.process(asSerializableCrawlData(domain));
var ret = domainProcessor.fullProcessing(asSerializableCrawlData(domain));
assertEquals(ret.state, DomainIndexingState.ACTIVE);
assertEquals(ret.domain, new EdgeDomain("memex.marginalia.nu"));
@ -51,7 +51,7 @@ public class ConvertingIntegrationTest {
}
@Test
public void testMemexMarginaliaNuDateInternalConsistency() throws IOException {
var ret = domainProcessor.process(asSerializableCrawlData(readMarginaliaWorkingSet()));
var ret = domainProcessor.fullProcessing(asSerializableCrawlData(readMarginaliaWorkingSet()));
ret.documents.stream().filter(ProcessedDocument::isProcessedFully).forEach(doc -> {
int year = PubDate.fromYearByte(doc.details.metadata.year());
Integer yearMeta = doc.details.pubYear;
@ -63,8 +63,8 @@ public class ConvertingIntegrationTest {
}
@Test
public void testMemexMarginaliaNu() throws IOException {
var ret = domainProcessor.process(asSerializableCrawlData(readMarginaliaWorkingSet()));
public void testMemexMarginaliaNuFullProcessing() throws IOException {
var ret = domainProcessor.fullProcessing(asSerializableCrawlData(readMarginaliaWorkingSet()));
assertNotNull(ret);
assertEquals(ret.state, DomainIndexingState.ACTIVE);
assertEquals(ret.domain, new EdgeDomain("memex.marginalia.nu"));
@ -94,6 +94,39 @@ public class ConvertingIntegrationTest {
}
}
@Test
public void testMemexMarginaliaNuSideloadProcessing() throws IOException {
var ret = domainProcessor.sideloadProcessing(asSerializableCrawlData(readMarginaliaWorkingSet()), 100);
assertNotNull(ret);
assertEquals("memex.marginalia.nu", ret.id());
var domain = ret.getDomain();
assertEquals(domain.domain, new EdgeDomain("memex.marginalia.nu"));
List<ProcessedDocument> docsAll = new ArrayList<>();
Map<UrlIndexingState, Integer> resultsByStatusCount = new HashMap<>();
ret.getDocumentsStream().forEachRemaining(docsAll::add);
assertTrue(docsAll.size() > 25);
docsAll.forEach(doc -> resultsByStatusCount.merge(doc.state, 1, Integer::sum));
assertTrue(resultsByStatusCount.get(UrlIndexingState.OK) > 25);
for (var doc : docsAll) {
if (!doc.isProcessedFully()) {
continue;
}
var details = doc.details;
assertTrue(details.metadata.size() > 0);
assertTrue(details.title.length() > 4);
assertTrue(details.description.length() > 4);
assertEquals(HtmlStandard.HTML5, details.standard);
}
}
private CrawledDomain readMarginaliaWorkingSet() throws IOException {
String index = readClassPathFile("memex-marginalia/index");
String[] files = index.split("\n");
@ -139,10 +172,13 @@ public class ConvertingIntegrationTest {
private SerializableCrawlDataStream asSerializableCrawlData(CrawledDomain domain) {
List<SerializableCrawlData> data = new ArrayList<>();
data.add(domain);
if (domain.doc != null) {
data.addAll(domain.doc);
}
data.add(domain);
return SerializableCrawlDataStream.fromIterator(data.iterator());
}

View File

@ -251,7 +251,7 @@ public class CrawlingThenConvertingIntegrationTest {
private ProcessedDomain process() {
try (var stream = new ParquetSerializableCrawlDataStream(fileName2)) {
return domainProcessor.process(stream);
return domainProcessor.fullProcessing(stream);
}
catch (Exception e) {
Assertions.fail(e);

View File

@ -41,6 +41,7 @@ import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardCopyOption;
import java.security.Security;
import java.sql.SQLException;
import java.util.*;
import java.util.concurrent.*;
@ -100,11 +101,15 @@ public class CrawlerMain {
}
public static void main(String... args) throws Exception {
if (!AbortMonitor.getInstance().isAlive()) {
System.err.println("Remove abort file first");
return;
}
// Prevent Java from caching DNS lookups forever (filling up the system RAM as a result)
Security.setProperty("networkaddress.cache.ttl" , "3600");
// This must run *early*
System.setProperty("http.agent", WmsaHome.getUserAgent().uaString());
@ -267,7 +272,7 @@ public class CrawlerMain {
private CrawlDataReference getReference() {
try {
return new CrawlDataReference(CrawledDomainReader.createDataStream(outputDir, domain, id));
return new CrawlDataReference(CrawledDomainReader.createDataStream(CrawledDomainReader.CompatibilityLevel.ANY, outputDir, domain, id));
} catch (IOException e) {
logger.debug("Failed to read previous crawl data for {}", specification.domain);
return new CrawlDataReference();

View File

@ -26,7 +26,7 @@ public class WarcProtocolReconstructor {
requestStringBuilder.append(request.method()).append(" ").append(encodedURL);
if (uri.getQuery() != null) {
requestStringBuilder.append("?").append(uri.getQuery());
requestStringBuilder.append("?").append(URLEncoder.encode(uri.getQuery(), StandardCharsets.UTF_8));
}
requestStringBuilder.append(" HTTP/1.1\r\n");
requestStringBuilder.append("Host: ").append(uri.getHost()).append("\r\n");

View File

@ -182,7 +182,7 @@ class CrawlerRetreiverTest {
convertToParquet(tempFileWarc1, tempFileParquet1);
try (var stream = CrawledDomainReader.createDataStream(tempFileParquet1)) {
try (var stream = CrawledDomainReader.createDataStream(CrawledDomainReader.CompatibilityLevel.ANY, tempFileParquet1)) {
while (stream.hasNext()) {
if (stream.next() instanceof CrawledDocument doc) {
data.add(doc);
@ -227,7 +227,7 @@ class CrawlerRetreiverTest {
doCrawl(tempFileWarc1, specs);
convertToParquet(tempFileWarc1, tempFileParquet1);
try (var stream = CrawledDomainReader.createDataStream(tempFileParquet1)) {
try (var stream = CrawledDomainReader.createDataStream(CrawledDomainReader.CompatibilityLevel.ANY, tempFileParquet1)) {
while (stream.hasNext()) {
if (stream.next() instanceof CrawledDocument doc) {
data.add(doc);
@ -272,8 +272,9 @@ class CrawlerRetreiverTest {
tempFileWarc2 = Files.createTempFile("crawling-process", ".warc.gz");
doCrawl(tempFileWarc1, specs);
convertToParquet(tempFileWarc1, tempFileParquet1);
doCrawlWithReferenceStream(specs,
CrawledDomainReader.createDataStream(tempFileParquet1)
CrawledDomainReader.createDataStream(CrawledDomainReader.CompatibilityLevel.ANY, tempFileParquet1)
);
convertToParquet(tempFileWarc2, tempFileParquet2);
@ -294,7 +295,7 @@ class CrawlerRetreiverTest {
});
}
try (var ds = CrawledDomainReader.createDataStream(tempFileParquet2)) {
try (var ds = CrawledDomainReader.createDataStream(CrawledDomainReader.CompatibilityLevel.ANY, tempFileParquet2)) {
while (ds.hasNext()) {
var doc = ds.next();
if (doc instanceof CrawledDomain dr) {
@ -337,7 +338,7 @@ class CrawlerRetreiverTest {
convertToParquet(tempFileWarc1, tempFileParquet1);
try (var stream = CrawledDomainReader.createDataStream(tempFileParquet1)) {
try (var stream = CrawledDomainReader.createDataStream(CrawledDomainReader.CompatibilityLevel.ANY, tempFileParquet1)) {
while (stream.hasNext()) {
var doc = stream.next();
data.computeIfAbsent(doc.getClass(), c -> new ArrayList<>()).add(doc);
@ -346,7 +347,7 @@ class CrawlerRetreiverTest {
throw new RuntimeException(e);
}
var stream = CrawledDomainReader.createDataStream(tempFileParquet1);
var stream = CrawledDomainReader.createDataStream(CrawledDomainReader.CompatibilityLevel.ANY, tempFileParquet1);
System.out.println("---");
@ -386,7 +387,7 @@ class CrawlerRetreiverTest {
});
}
try (var ds = CrawledDomainReader.createDataStream(tempFileParquet2)) {
try (var ds = CrawledDomainReader.createDataStream(CrawledDomainReader.CompatibilityLevel.ANY, tempFileParquet2)) {
while (ds.hasNext()) {
var doc = ds.next();
if (doc instanceof CrawledDomain dr) {

View File

@ -9,7 +9,7 @@ import lombok.SneakyThrows;
import nu.marginalia.ProcessConfiguration;
import nu.marginalia.ProcessConfigurationModule;
import nu.marginalia.storage.FileStorageService;
import nu.marginalia.linkdb.LinkdbWriter;
import nu.marginalia.linkdb.docs.DocumentDbWriter;
import nu.marginalia.loading.documents.DocumentLoaderService;
import nu.marginalia.loading.documents.KeywordLoaderService;
import nu.marginalia.loading.domains.DomainIdRegistry;
@ -43,7 +43,7 @@ public class LoaderMain {
private final ProcessHeartbeatImpl heartbeat;
private final MessageQueueFactory messageQueueFactory;
private final FileStorageService fileStorageService;
private final LinkdbWriter linkdbWriter;
private final DocumentDbWriter documentDbWriter;
private final LoaderIndexJournalWriter journalWriter;
private final DomainLoaderService domainService;
private final DomainLinksLoaderService linksService;
@ -77,7 +77,7 @@ public class LoaderMain {
public LoaderMain(ProcessHeartbeatImpl heartbeat,
MessageQueueFactory messageQueueFactory,
FileStorageService fileStorageService,
LinkdbWriter linkdbWriter,
DocumentDbWriter documentDbWriter,
LoaderIndexJournalWriter journalWriter,
DomainLoaderService domainService,
DomainLinksLoaderService linksService,
@ -90,7 +90,7 @@ public class LoaderMain {
this.heartbeat = heartbeat;
this.messageQueueFactory = messageQueueFactory;
this.fileStorageService = fileStorageService;
this.linkdbWriter = linkdbWriter;
this.documentDbWriter = documentDbWriter;
this.journalWriter = journalWriter;
this.domainService = domainService;
this.linksService = linksService;
@ -132,7 +132,7 @@ public class LoaderMain {
}
finally {
journalWriter.close();
linkdbWriter.close();
documentDbWriter.close();
heartbeat.shutDown();
}

View File

@ -9,8 +9,9 @@ import com.google.inject.name.Names;
import nu.marginalia.LanguageModels;
import nu.marginalia.WmsaHome;
import nu.marginalia.IndexLocations;
import nu.marginalia.linkdb.dlinks.DomainLinkDbWriter;
import nu.marginalia.storage.FileStorageService;
import nu.marginalia.linkdb.LinkdbWriter;
import nu.marginalia.linkdb.docs.DocumentDbWriter;
import nu.marginalia.model.gson.GsonFactory;
import nu.marginalia.service.SearchServiceDescriptors;
import nu.marginalia.service.descriptor.ServiceDescriptors;
@ -20,6 +21,9 @@ import java.nio.file.Files;
import java.nio.file.Path;
import java.sql.SQLException;
import static nu.marginalia.linkdb.LinkdbFileNames.DOCDB_FILE_NAME;
import static nu.marginalia.linkdb.LinkdbFileNames.DOMAIN_LINKS_FILE_NAME;
public class LoaderModule extends AbstractModule {
public LoaderModule() {
@ -34,14 +38,26 @@ public class LoaderModule extends AbstractModule {
}
@Inject @Provides @Singleton
private LinkdbWriter createLinkdbWriter(FileStorageService service) throws SQLException, IOException {
Path dbPath = IndexLocations.getLinkdbWritePath(service).resolve("links.db");
private DocumentDbWriter createLinkdbWriter(FileStorageService service) throws SQLException, IOException {
// Migrate
Path dbPath = IndexLocations.getLinkdbWritePath(service).resolve(DOCDB_FILE_NAME);
if (Files.exists(dbPath)) {
Files.delete(dbPath);
}
return new LinkdbWriter(dbPath);
return new DocumentDbWriter(dbPath);
}
@Inject @Provides @Singleton
private DomainLinkDbWriter createDomainLinkdbWriter(FileStorageService service) throws SQLException, IOException {
Path dbPath = IndexLocations.getLinkdbWritePath(service).resolve(DOMAIN_LINKS_FILE_NAME);
if (Files.exists(dbPath)) {
Files.delete(dbPath);
}
return new DomainLinkDbWriter(dbPath);
}
private Gson createGson() {

View File

@ -4,9 +4,8 @@ import com.google.inject.Inject;
import com.google.inject.Singleton;
import lombok.SneakyThrows;
import nu.marginalia.io.processed.DocumentRecordParquetFileReader;
import nu.marginalia.io.processed.ProcessedDataFileNames;
import nu.marginalia.linkdb.LinkdbWriter;
import nu.marginalia.linkdb.model.LdbUrlDetail;
import nu.marginalia.linkdb.docs.DocumentDbWriter;
import nu.marginalia.linkdb.model.DocdbUrlDetail;
import nu.marginalia.loading.LoaderInputData;
import nu.marginalia.loading.domains.DomainIdRegistry;
import nu.marginalia.model.EdgeUrl;
@ -26,11 +25,11 @@ import java.util.List;
public class DocumentLoaderService {
private static final Logger logger = LoggerFactory.getLogger(DocumentLoaderService.class);
private final LinkdbWriter linkdbWriter;
private final DocumentDbWriter documentDbWriter;
@Inject
public DocumentLoaderService(LinkdbWriter linkdbWriter) {
this.linkdbWriter = linkdbWriter;
public DocumentLoaderService(DocumentDbWriter documentDbWriter) {
this.documentDbWriter = documentDbWriter;
}
public boolean loadDocuments(
@ -73,7 +72,7 @@ public class DocumentLoaderService {
class LinkdbLoader implements AutoCloseable {
private final DomainIdRegistry domainIdRegistry;
private final List<LdbUrlDetail> details = new ArrayList<>(1000);
private final List<DocdbUrlDetail> details = new ArrayList<>(1000);
LinkdbLoader(DomainIdRegistry domainIdRegistry) {
this.domainIdRegistry = domainIdRegistry;
@ -88,7 +87,7 @@ public class DocumentLoaderService {
projection.ordinal
);
details.add(new LdbUrlDetail(
details.add(new DocdbUrlDetail(
urlId,
new EdgeUrl(projection.url),
projection.title,
@ -102,7 +101,7 @@ public class DocumentLoaderService {
));
if (details.size() > 100) {
linkdbWriter.add(details);
documentDbWriter.add(details);
details.clear();
}
@ -111,7 +110,7 @@ public class DocumentLoaderService {
@Override
public void close() throws SQLException {
if (!details.isEmpty()) {
linkdbWriter.add(details);
documentDbWriter.add(details);
}
}
}

View File

@ -2,10 +2,9 @@ package nu.marginalia.loading.links;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import com.zaxxer.hikari.HikariDataSource;
import nu.marginalia.ProcessConfiguration;
import lombok.SneakyThrows;
import nu.marginalia.io.processed.DomainLinkRecordParquetFileReader;
import nu.marginalia.io.processed.ProcessedDataFileNames;
import nu.marginalia.linkdb.dlinks.DomainLinkDbWriter;
import nu.marginalia.loading.LoaderInputData;
import nu.marginalia.loading.domains.DomainIdRegistry;
import nu.marginalia.model.processed.DomainLinkRecord;
@ -15,28 +14,22 @@ import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.nio.file.Path;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.SQLException;
@Singleton
public class DomainLinksLoaderService {
private final HikariDataSource dataSource;
private static final Logger logger = LoggerFactory.getLogger(DomainLinksLoaderService.class);
private final int nodeId;
private final DomainLinkDbWriter domainLinkDbWriter;
@Inject
public DomainLinksLoaderService(HikariDataSource dataSource,
ProcessConfiguration processConfiguration) {
this.dataSource = dataSource;
this.nodeId = processConfiguration.node();
public DomainLinksLoaderService(DomainLinkDbWriter domainLinkDbWriter) {
this.domainLinkDbWriter = domainLinkDbWriter;
}
public boolean loadLinks(DomainIdRegistry domainIdRegistry,
ProcessHeartbeat heartbeat,
LoaderInputData inputData) throws IOException, SQLException {
dropLinkData();
LoaderInputData inputData) throws IOException {
try (var task = heartbeat.createAdHocTaskHeartbeat("LINKS")) {
var linkFiles = inputData.listDomainLinkFiles();
@ -56,17 +49,7 @@ public class DomainLinksLoaderService {
return true;
}
private void dropLinkData() throws SQLException {
logger.info("Clearing EC_DOMAIN_LINK");
try (var conn = dataSource.getConnection();
var call = conn.prepareCall("CALL PURGE_LINKS_TABLE(?)")) {
call.setInt(1, nodeId);
call.executeUpdate();
}
}
private void loadLinksFromFile(DomainIdRegistry domainIdRegistry, Path file) throws IOException, SQLException {
private void loadLinksFromFile(DomainIdRegistry domainIdRegistry, Path file) throws IOException {
try (var domainStream = DomainLinkRecordParquetFileReader.stream(file);
var linkLoader = new LinkLoader(domainIdRegistry))
{
@ -76,49 +59,21 @@ public class DomainLinksLoaderService {
}
class LinkLoader implements AutoCloseable {
private final Connection connection;
private final PreparedStatement insertStatement;
private final DomainIdRegistry domainIdRegistry;
private int batchSize = 0;
private int total = 0;
public LinkLoader(DomainIdRegistry domainIdRegistry) throws SQLException {
public LinkLoader(DomainIdRegistry domainIdRegistry) {
this.domainIdRegistry = domainIdRegistry;
connection = dataSource.getConnection();
insertStatement = connection.prepareStatement("""
INSERT IGNORE INTO EC_DOMAIN_LINK(SOURCE_DOMAIN_ID, DEST_DOMAIN_ID)
VALUES (?, ?)
""");
}
@SneakyThrows
void accept(DomainLinkRecord record) {
try {
insertStatement.setInt(1, domainIdRegistry.getDomainId(record.source));
insertStatement.setInt(2, domainIdRegistry.getDomainId(record.dest));
insertStatement.addBatch();
if (++batchSize > 1000) {
batchSize = 0;
insertStatement.executeBatch();
}
total++;
}
catch (SQLException ex) {
throw new RuntimeException(ex);
}
domainLinkDbWriter.write(
domainIdRegistry.getDomainId(record.source),
domainIdRegistry.getDomainId(record.dest)
);
}
@Override
public void close() throws SQLException {
if (batchSize > 0) {
insertStatement.executeBatch();
}
logger.info("Inserted {} links", total);
insertStatement.close();
connection.close();
}
public void close() {}
}
}

View File

@ -1,176 +0,0 @@
package nu.marginalia.loading.links;
import com.google.common.collect.Lists;
import com.zaxxer.hikari.HikariConfig;
import com.zaxxer.hikari.HikariDataSource;
import nu.marginalia.ProcessConfiguration;
import nu.marginalia.io.processed.DomainLinkRecordParquetFileWriter;
import nu.marginalia.io.processed.DomainRecordParquetFileWriter;
import nu.marginalia.io.processed.ProcessedDataFileNames;
import nu.marginalia.loader.DbTestUtil;
import nu.marginalia.loading.LoaderInputData;
import nu.marginalia.loading.domains.DomainLoaderService;
import nu.marginalia.model.processed.DomainLinkRecord;
import nu.marginalia.model.processed.DomainRecord;
import nu.marginalia.process.control.ProcessAdHocTaskHeartbeat;
import nu.marginalia.process.control.ProcessHeartbeat;
import org.junit.jupiter.api.*;
import org.mockito.Mockito;
import org.testcontainers.containers.MariaDBContainer;
import org.testcontainers.junit.jupiter.Container;
import org.testcontainers.junit.jupiter.Testcontainers;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.sql.SQLException;
import java.util.*;
import static org.junit.jupiter.api.Assertions.assertEquals;
@Tag("slow")
@Testcontainers
@Disabled // Error in the SQL loading mechanism, we don't deal with DELIMITER correctly
// which means we can't get around flyway's bugs necessitating DELIMITER.
class DomainLinksLoaderServiceTest {
List<Path> toDelete = new ArrayList<>();
ProcessHeartbeat heartbeat;
@Container
static MariaDBContainer<?> mariaDBContainer = new MariaDBContainer<>("mariadb")
.withDatabaseName("WMSA_prod")
.withUsername("wmsa")
.withPassword("wmsa")
.withInitScript("db/migration/V23_06_0_000__base.sql")
.withNetworkAliases("mariadb");
HikariDataSource dataSource;
@BeforeEach
public void setUp() {
HikariConfig config = new HikariConfig();
config.setJdbcUrl(mariaDBContainer.getJdbcUrl());
config.setUsername("wmsa");
config.setPassword("wmsa");
dataSource = new HikariDataSource(config);
List<String> migrations = List.of(
"db/migration/V23_11_0_007__domain_node_affinity.sql",
"db/migration/V23_11_0_008__purge_procedure.sql"
);
for (String migration : migrations) {
try (var resource = Objects.requireNonNull(ClassLoader.getSystemResourceAsStream(migration),
"Could not load migration script " + migration);
var conn = dataSource.getConnection();
var stmt = conn.createStatement()
) {
String script = new String(resource.readAllBytes());
String[] cmds = script.split("\\s*;\\s*");
for (String cmd : cmds) {
if (cmd.isBlank())
continue;
System.out.println(cmd);
stmt.executeUpdate(cmd);
}
} catch (IOException | SQLException ex) {
}
}
heartbeat = Mockito.mock(ProcessHeartbeat.class);
Mockito.when(heartbeat.createAdHocTaskHeartbeat(Mockito.anyString())).thenReturn(
Mockito.mock(ProcessAdHocTaskHeartbeat.class)
);
}
@AfterEach
public void tearDown() throws IOException {
for (var path : Lists.reverse(toDelete)) {
Files.deleteIfExists(path);
}
toDelete.clear();
dataSource.close();
}
@Test
public void test() throws IOException, SQLException {
Path workDir = Files.createTempDirectory(getClass().getSimpleName());
Path parquetFile1 = ProcessedDataFileNames.domainFileName(workDir, 0);
Path parquetFile2 = ProcessedDataFileNames.domainLinkFileName(workDir, 0);
Path parquetFile3 = ProcessedDataFileNames.domainLinkFileName(workDir, 1);
toDelete.add(workDir);
toDelete.add(parquetFile1);
toDelete.add(parquetFile2);
toDelete.add(parquetFile3);
List<String> domains1 = List.of("www.marginalia.nu", "search.marginalia.nu");
List<String> linkDomains1 = List.of("wiby.me", "www.mojeek.com", "www.altavista.com");
List<String> linkDomains2 = List.of("maya.land", "xkcd.com", "aaronsw.com");
try (var pw = new DomainRecordParquetFileWriter(parquetFile1)) {
for (var domain : domains1) {
pw.write(dr(domain));
}
}
try (var pw = new DomainLinkRecordParquetFileWriter(parquetFile2)) {
for (var domain : linkDomains1) {
pw.write(dl("www.marginalia.nu", domain));
}
}
try (var pw = new DomainLinkRecordParquetFileWriter(parquetFile3)) {
for (var domain : linkDomains2) {
pw.write(dl("search.marginalia.nu", domain));
}
}
try (var dataSource = DbTestUtil.getConnection(mariaDBContainer.getJdbcUrl());
var conn = dataSource.getConnection();
var query = conn.prepareStatement("""
SELECT SOURCE_DOMAIN_ID, DEST_DOMAIN_ID FROM EC_DOMAIN_LINK
""")
) {
var domainService = new DomainLoaderService(dataSource, new ProcessConfiguration("test", 1, UUID.randomUUID()));
var input = new LoaderInputData(workDir, 2);
var domainRegistry = domainService.getOrCreateDomainIds(input);
var dls = new DomainLinksLoaderService(dataSource, new ProcessConfiguration("test", 1, UUID.randomUUID()));
dls.loadLinks(domainRegistry, heartbeat, input);
Map<Integer, Set<Integer>> expected = new HashMap<>();
Map<Integer, Set<Integer>> actual = new HashMap<>();
expected.put(domainRegistry.getDomainId("www.marginalia.nu"), new HashSet<>());
expected.put(domainRegistry.getDomainId("search.marginalia.nu"), new HashSet<>());
for (var domain : linkDomains1) {
expected.get(domainRegistry.getDomainId("www.marginalia.nu")).add(domainRegistry.getDomainId(domain));
}
for (var domain : linkDomains2) {
expected.get(domainRegistry.getDomainId("search.marginalia.nu")).add(domainRegistry.getDomainId(domain));
}
var rs = query.executeQuery();
while (rs.next()) {
actual.computeIfAbsent(rs.getInt(1), k -> new HashSet<>())
.add(rs.getInt(2));
}
assertEquals(expected, actual);
}
}
private DomainRecord dr(String domainName) {
return new DomainRecord(domainName, 0, 0, 0, null, null, null, null);
}
private DomainLinkRecord dl(String sourceDomainName, String destDomainName) {
return new DomainLinkRecord(sourceDomainName, destDomainName);
}
}

View File

@ -21,7 +21,9 @@ dependencies {
implementation project(':code:common:model')
implementation project(':code:common:db')
implementation project(':code:common:process')
implementation project(':code:common:service-client')
implementation project(':code:common:service')
implementation project(':code:api:query-api')
implementation libs.bundles.slf4j

View File

@ -1,26 +1,25 @@
package nu.marginalia.adjacencies;
import com.zaxxer.hikari.HikariDataSource;
import gnu.trove.list.TIntList;
import gnu.trove.list.array.TIntArrayList;
import gnu.trove.map.hash.TIntObjectHashMap;
import gnu.trove.set.hash.TIntHashSet;
import nu.marginalia.query.client.QueryClient;
import org.roaringbitmap.RoaringBitmap;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;
public class AdjacenciesData {
TIntList idsList = new TIntArrayList(100_000);
ArrayList<SparseBitVector> itemsList = new ArrayList<>(100_000);
private static final Logger logger = LoggerFactory.getLogger(AdjacenciesData.class);
private final TIntList idsList = new TIntArrayList(100_000);
private final ArrayList<SparseBitVector> itemsList = new ArrayList<>(100_000);
TIntObjectHashMap<SparseBitVector> dToSMap = new TIntObjectHashMap<>(100_000);
TIntObjectHashMap<RoaringBitmap> sToDMap = new TIntObjectHashMap<>(100_000);
RoaringBitmap indexed = new RoaringBitmap();
private final TIntObjectHashMap<SparseBitVector> dToSMap = new TIntObjectHashMap<>(100_000);
private final TIntObjectHashMap<RoaringBitmap> sToDMap = new TIntObjectHashMap<>(100_000);
public TIntHashSet getCandidates(SparseBitVector vec) {
TIntHashSet ret = new TIntHashSet();
@ -36,39 +35,31 @@ public class AdjacenciesData {
return ret;
}
public AdjacenciesData(HikariDataSource dataSource, DomainAliases aliases) throws SQLException {
public AdjacenciesData(QueryClient queryClient,
DomainAliases aliases) {
logger.info("Loading adjacency data");
Map<Integer, RoaringBitmap> tmpMapDtoS = new HashMap<>(100_000);
try (
var conn = dataSource.getConnection();
var indexedStmt = conn.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE INDEXED>0");
var linksStmt = conn.prepareStatement("SELECT SOURCE_DOMAIN_ID, DEST_DOMAIN_ID FROM EC_DOMAIN_LINK")) {
ResultSet rsp;
indexedStmt.setFetchSize(10_000);
rsp = indexedStmt.executeQuery();
while (rsp.next()) {
indexed.add(rsp.getInt(1));
int count = 0;
var allLinks = queryClient.getAllDomainLinks();
for (var iter = allLinks.iterator();;count++) {
if (!iter.advance()) {
break;
}
int source = aliases.deAlias(iter.source());
int dest = aliases.deAlias(iter.dest());
linksStmt.setFetchSize(10_000);
rsp = linksStmt.executeQuery();
while (rsp.next()) {
int source = aliases.deAlias(rsp.getInt(1));
int dest = aliases.deAlias(rsp.getInt(2));
tmpMapDtoS.computeIfAbsent(dest, this::createBitmapWithSelf).add(source);
RoaringBitmap sToDEntry = sToDMap.get(source);
if (sToDEntry == null) {
sToDEntry = new RoaringBitmap();
sToDMap.put(source, sToDEntry);
sToDEntry.add(source);
}
sToDEntry.add(dest);
tmpMapDtoS.computeIfAbsent(dest, this::createBitmapWithSelf).add(source);
RoaringBitmap sToDEntry = sToDMap.get(source);
if (sToDEntry == null) {
sToDEntry = new RoaringBitmap();
sToDMap.put(source, sToDEntry);
sToDEntry.add(source);
}
sToDEntry.add(dest);
}
logger.info("Links loaded: {}", count);
tmpMapDtoS.entrySet().stream()
.filter(e -> isEligible(e.getValue()))
@ -79,10 +70,10 @@ public class AdjacenciesData {
dToSMap.put(e.getKey(), val);
});
logger.info("All adjacency dat loaded");
}
private boolean isEligible(RoaringBitmap value) {
// return true;
int cardinality = value.getCardinality();
return cardinality < 10000;
@ -95,10 +86,6 @@ public class AdjacenciesData {
return bm;
}
public boolean isIndexedDomain(int domainId) {
return indexed.contains(domainId);
}
public TIntList getIdsList() {
return idsList;
}

View File

@ -7,7 +7,10 @@ import nu.marginalia.db.DbDomainQueries;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.process.control.ProcessHeartbeat;
import nu.marginalia.process.control.ProcessHeartbeatImpl;
import nu.marginalia.query.client.QueryClient;
import nu.marginalia.service.module.DatabaseModule;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.sql.SQLException;
import java.util.*;
@ -23,13 +26,14 @@ public class WebsiteAdjacenciesCalculator {
private final HikariDataSource dataSource;
public AdjacenciesData adjacenciesData;
public DomainAliases domainAliases;
private static final Logger logger = LoggerFactory.getLogger(WebsiteAdjacenciesCalculator.class);
float[] weights;
public WebsiteAdjacenciesCalculator(HikariDataSource dataSource) throws SQLException {
public WebsiteAdjacenciesCalculator(QueryClient queryClient, HikariDataSource dataSource) throws SQLException {
this.dataSource = dataSource;
domainAliases = new DomainAliases(dataSource);
adjacenciesData = new AdjacenciesData(dataSource, domainAliases);
adjacenciesData = new AdjacenciesData(queryClient, domainAliases);
weights = adjacenciesData.getWeights();
}
@ -47,7 +51,6 @@ public class WebsiteAdjacenciesCalculator {
for (int domainId : domainIds) {
findAdjacentDtoS(domainId, similarities -> {
for (var similarity : similarities.similarities()) {
if (adjacenciesData.isIndexedDomain(similarity.domainId)) System.out.print("*");
System.out.println(dataStoreDao.getDomain(similarity.domainId).map(Object::toString).orElse("") + " " + prettyPercent(similarity.value));
}
});
@ -186,8 +189,9 @@ public class WebsiteAdjacenciesCalculator {
DatabaseModule dm = new DatabaseModule();
var dataSource = dm.provideConnection();
var qc = new QueryClient();
var main = new WebsiteAdjacenciesCalculator(dataSource);
var main = new WebsiteAdjacenciesCalculator(qc, dataSource);
if (args.length == 1 && "load".equals(args[0])) {
var processHeartbeat = new ProcessHeartbeatImpl(
@ -195,9 +199,16 @@ public class WebsiteAdjacenciesCalculator {
dataSource
);
processHeartbeat.start();
main.loadAll(processHeartbeat);
processHeartbeat.shutDown();
try {
processHeartbeat.start();
main.loadAll(processHeartbeat);
}
catch (Exception ex) {
logger.error("Failed to load", ex);
}
finally {
processHeartbeat.shutDown();
}
return;
}

View File

@ -66,7 +66,7 @@ public class BrowseCommand implements SearchCommandInterface {
return browseService.getRandomEntries(set);
}
else {
return browseService.getRelatedEntries(word);
return browseService.getRelatedEntries(ctx, word);
}
}
catch (Exception ex) {

View File

@ -2,6 +2,7 @@ package nu.marginalia.search.results;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.assistant.client.model.SimilarDomain;
import nu.marginalia.browse.model.BrowseResult;
import nu.marginalia.screenshot.ScreenshotService;
@ -18,7 +19,7 @@ public class BrowseResultCleaner {
this.screenshotService = screenshotService;
}
public Predicate<BrowseResult> shouldRemoveResultPredicate() {
public Predicate<BrowseResult> shouldRemoveResultPredicateBr() {
Set<String> domainHashes = new HashSet<>(100);
return (res) -> !screenshotService.hasScreenshot(res.domainId())

View File

@ -1,16 +1,18 @@
package nu.marginalia.search.svc;
import com.google.inject.Inject;
import nu.marginalia.assistant.client.AssistantClient;
import nu.marginalia.assistant.client.model.SimilarDomain;
import nu.marginalia.browse.DbBrowseDomainsRandom;
import nu.marginalia.browse.DbBrowseDomainsSimilarCosine;
import nu.marginalia.browse.DbBrowseDomainsSimilarOldAlgo;
import nu.marginalia.browse.model.BrowseResult;
import nu.marginalia.browse.model.BrowseResultSet;
import nu.marginalia.client.Context;
import nu.marginalia.db.DbDomainQueries;
import nu.marginalia.db.DomainBlacklist;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.search.results.BrowseResultCleaner;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
@ -19,55 +21,60 @@ import static java.util.Collections.shuffle;
public class SearchBrowseService {
private final DbBrowseDomainsRandom randomDomains;
private final DbBrowseDomainsSimilarCosine similarDomains;
private final DbBrowseDomainsSimilarOldAlgo similarDomainsOld;
private final DbDomainQueries domainQueries;
private final DomainBlacklist blacklist;
private final AssistantClient assistantClient;
private final BrowseResultCleaner browseResultCleaner;
@Inject
public SearchBrowseService(DbBrowseDomainsRandom randomDomains,
DbBrowseDomainsSimilarCosine similarDomains,
DbBrowseDomainsSimilarOldAlgo similarDomainsOld,
DbDomainQueries domainQueries,
DomainBlacklist blacklist,
AssistantClient assistantClient,
BrowseResultCleaner browseResultCleaner)
{
this.randomDomains = randomDomains;
this.similarDomains = similarDomains;
this.similarDomainsOld = similarDomainsOld;
this.domainQueries = domainQueries;
this.blacklist = blacklist;
this.assistantClient = assistantClient;
this.browseResultCleaner = browseResultCleaner;
}
public BrowseResultSet getRandomEntries(int set) {
List<BrowseResult> results = randomDomains.getRandomDomains(25, blacklist, set);
results.removeIf(browseResultCleaner.shouldRemoveResultPredicate());
results.removeIf(browseResultCleaner.shouldRemoveResultPredicateBr());
return new BrowseResultSet(results);
}
public BrowseResultSet getRelatedEntries(String word) {
var domain = domainQueries.getDomainId(new EdgeDomain(word));
public BrowseResultSet getRelatedEntries(Context ctx, String domainName) {
var domain = domainQueries.getDomainId(new EdgeDomain(domainName));
var neighbors = similarDomains.getDomainNeighborsAdjacentCosineRequireScreenshot(domain, blacklist, 256);
neighbors.removeIf(browseResultCleaner.shouldRemoveResultPredicate());
var neighbors = assistantClient.similarDomains(ctx, domain, 50).blockingFirst();
neighbors.removeIf(sd -> !sd.screenshot());
// If the results are very few, supplement with the alternative shitty algorithm
if (neighbors.size() < 25) {
Set<BrowseResult> allNeighbors = new HashSet<>(neighbors);
allNeighbors.addAll(similarDomainsOld.getDomainNeighborsAdjacent(domain, blacklist, 50));
Set<SimilarDomain> allNeighbors = new HashSet<>(neighbors);
allNeighbors.addAll(assistantClient.linkedDomains(ctx, domain, 50).blockingFirst());
neighbors.clear();
neighbors.addAll(allNeighbors);
neighbors.removeIf(browseResultCleaner.shouldRemoveResultPredicate());
neighbors.removeIf(sd -> !sd.screenshot());
}
List<BrowseResult> results = new ArrayList<>(neighbors.size());
for (SimilarDomain sd : neighbors) {
var resultDomain = domainQueries.getDomain(sd.domainId());
if (resultDomain.isEmpty())
continue;
results.add(new BrowseResult(resultDomain.get().toRootUrl(), sd.domainId(), 0, sd.screenshot()));
}
// shuffle the items for a less repetitive experience
shuffle(neighbors);
return new BrowseResultSet(neighbors, word);
return new BrowseResultSet(results, domainName);
}
}

View File

@ -24,6 +24,7 @@ java {
dependencies {
implementation project(':third-party:symspell')
implementation project(':code:api:assistant-api')
implementation project(':code:api:query-api')
implementation project(':code:common:config')
implementation project(':code:common:service')
implementation project(':code:common:model')

View File

@ -5,6 +5,7 @@ import nu.marginalia.geoip.GeoIpDictionary;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.db.DbDomainQueries;
import nu.marginalia.assistant.client.model.DomainInformation;
import nu.marginalia.query.client.QueryClient;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -20,6 +21,7 @@ public class DomainInformationService {
private final GeoIpDictionary geoIpDictionary;
private DbDomainQueries dbDomainQueries;
private final QueryClient queryClient;
private HikariDataSource dataSource;
private final Logger logger = LoggerFactory.getLogger(getClass());
@ -27,9 +29,11 @@ public class DomainInformationService {
public DomainInformationService(
DbDomainQueries dbDomainQueries,
GeoIpDictionary geoIpDictionary,
QueryClient queryClient,
HikariDataSource dataSource) {
this.dbDomainQueries = dbDomainQueries;
this.geoIpDictionary = geoIpDictionary;
this.queryClient = queryClient;
this.dataSource = dataSource;
}
@ -80,21 +84,8 @@ public class DomainInformationService {
inCrawlQueue = rs.next();
builder.inCrawlQueue(inCrawlQueue);
rs = stmt.executeQuery(STR."""
SELECT COUNT(ID) FROM EC_DOMAIN_LINK WHERE DEST_DOMAIN_ID=\{domainId}
""");
if (rs.next()) {
builder.incomingLinks(rs.getInt(1));
}
rs = stmt.executeQuery(STR."""
SELECT COUNT(ID) FROM EC_DOMAIN_LINK WHERE SOURCE_DOMAIN_ID=\{domainId}
""");
if (rs.next()) {
builder.outboundLinks(rs.getInt(1));
outboundLinks = rs.getInt(1);
}
builder.incomingLinks(queryClient.countLinksToDomain(domainId));
builder.outboundLinks(queryClient.countLinksFromDomain(domainId));
rs = stmt.executeQuery(STR."""
SELECT KNOWN_URLS, GOOD_URLS, VISITED_URLS FROM DOMAIN_METADATA WHERE ID=\{domainId}

View File

@ -10,6 +10,7 @@ import gnu.trove.set.TIntSet;
import gnu.trove.set.hash.TIntHashSet;
import nu.marginalia.assistant.client.model.SimilarDomain;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.query.client.QueryClient;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -25,14 +26,13 @@ public class SimilarDomainsService {
private static final Logger logger = LoggerFactory.getLogger(SimilarDomainsService.class);
private final HikariDataSource dataSource;
private final QueryClient queryClient;
private volatile TIntIntHashMap domainIdToIdx = new TIntIntHashMap(100_000);
private volatile int[] domainIdxToId;
public volatile TIntDoubleHashMap[] relatedDomains;
public volatile TIntList[] domainNeighbors = null;
public volatile TIntList[] linkStoD = null;
public volatile TIntList[] linkDtoS = null;
public volatile BitSet screenshotDomains = null;
public volatile BitSet activeDomains = null;
public volatile BitSet indexedDomains = null;
@ -42,8 +42,9 @@ public class SimilarDomainsService {
volatile boolean isReady = false;
@Inject
public SimilarDomainsService(HikariDataSource dataSource) {
public SimilarDomainsService(HikariDataSource dataSource, QueryClient queryClient) {
this.dataSource = dataSource;
this.queryClient = queryClient;
Executors.newSingleThreadExecutor().submit(this::init);
}
@ -70,8 +71,6 @@ public class SimilarDomainsService {
domainRanks = new double[domainIdToIdx.size()];
domainNames = new String[domainIdToIdx.size()];
domainNeighbors = new TIntList[domainIdToIdx.size()];
linkStoD = new TIntList[domainIdToIdx.size()];
linkDtoS = new TIntList[domainIdToIdx.size()];
screenshotDomains = new BitSet(domainIdToIdx.size());
activeDomains = new BitSet(domainIdToIdx.size());
indexedDomains = new BitSet(domainIdToIdx.size());
@ -108,27 +107,6 @@ public class SimilarDomainsService {
logger.info("Loaded {} related domains", relatedDomains.length);
rs = stmt.executeQuery("""
SELECT SOURCE_DOMAIN_ID, DEST_DOMAIN_ID FROM EC_DOMAIN_LINK
""");
while (rs.next()) {
int source = rs.getInt(1);
int dest = rs.getInt(2);
int sourceIdx = domainIdToIdx.get(source);
int destIdx = domainIdToIdx.get(dest);
if (linkStoD[sourceIdx] == null)
linkStoD[sourceIdx] = new TIntArrayList(32);
if (linkDtoS[destIdx] == null)
linkDtoS[destIdx] = new TIntArrayList(32);
linkStoD[sourceIdx].add(destIdx);
linkDtoS[destIdx].add(sourceIdx);
}
logger.info("Loaded links...");
rs = stmt.executeQuery("""
SELECT EC_DOMAIN.ID,
@ -167,7 +145,6 @@ public class SimilarDomainsService {
}
logger.info("Loaded {} domains", domainRanks.length);
logger.info("All done!");
isReady = true;
}
}
@ -272,17 +249,23 @@ public class SimilarDomainsService {
}
private TIntSet getLinkingIdsDToS(int domainIdx) {
var items = linkDtoS[domainIdx];
if (items == null)
return new TIntHashSet();
return new TIntHashSet(items);
var items = new TIntHashSet();
for (int id : queryClient.getLinksFromDomain(domainIdxToId[domainIdx])) {
items.add(domainIdToIdx.get(id));
}
return items;
}
private TIntSet getLinkingIdsSToD(int domainIdx) {
var items = linkStoD[domainIdx];
if (items == null)
return new TIntHashSet();
return new TIntHashSet(items);
var items = new TIntHashSet();
for (int id : queryClient.getLinksToDomain(domainIdxToId[domainIdx])) {
items.add(domainIdToIdx.get(id));
}
return items;
}
public List<SimilarDomain> getLinkingDomains(int domainId, int count) {

View File

@ -26,6 +26,7 @@ dependencies {
implementation project(':code:common:model')
implementation project(':code:common:process')
implementation project(':code:common:db')
implementation project(':code:common:linkdb')
implementation project(':code:common:service')
implementation project(':code:common:service-client')

View File

@ -76,7 +76,7 @@ public class ExportAtagsActor extends RecordActorPrototype {
}
Path crawlDataPath = inputDir.resolve(item.relPath());
try (var stream = CrawledDomainReader.createDataStream(crawlDataPath)) {
try (var stream = CrawledDomainReader.createDataStream(CrawledDomainReader.CompatibilityLevel.FAST, crawlDataPath)) {
exportLinks(tagWriter, stream);
}
catch (Exception ex) {

View File

@ -4,9 +4,6 @@ import com.google.gson.Gson;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import com.zaxxer.hikari.HikariDataSource;
import lombok.AllArgsConstructor;
import lombok.NoArgsConstructor;
import lombok.With;
import nu.marginalia.actor.prototype.RecordActorPrototype;
import nu.marginalia.actor.state.ActorStep;
import nu.marginalia.storage.FileStorageService;

View File

@ -3,6 +3,7 @@ package nu.marginalia.svc;
import com.github.luben.zstd.ZstdInputStream;
import com.github.luben.zstd.ZstdOutputStream;
import nu.marginalia.IndexLocations;
import nu.marginalia.service.control.ServiceHeartbeat;
import nu.marginalia.storage.FileStorageService;
import nu.marginalia.storage.model.FileStorageBaseType;
import nu.marginalia.storage.model.FileStorageId;
@ -18,13 +19,26 @@ import java.sql.SQLException;
import java.time.LocalDateTime;
import java.util.List;
import static nu.marginalia.linkdb.LinkdbFileNames.DOCDB_FILE_NAME;
import static nu.marginalia.linkdb.LinkdbFileNames.DOMAIN_LINKS_FILE_NAME;
public class BackupService {
private final FileStorageService storageService;
private final ServiceHeartbeat serviceHeartbeat;
public enum BackupHeartbeatSteps {
LINKS,
DOCS,
JOURNAL,
DONE
}
@Inject
public BackupService(FileStorageService storageService) {
public BackupService(FileStorageService storageService,
ServiceHeartbeat serviceHeartbeat) {
this.storageService = storageService;
this.serviceHeartbeat = serviceHeartbeat;
}
/** Create a new backup of the contents in the _STAGING storage areas.
@ -42,13 +56,25 @@ public class BackupService {
storageService.relateFileStorages(associatedId, backupStorage.id());
}
var indexStagingStorage = IndexLocations.getIndexConstructionArea(storageService);
var linkdbStagingStorage = IndexLocations.getLinkdbWritePath(storageService);
backupFileCompressed("links.db", linkdbStagingStorage, backupStorage.asPath());
// This file format is already compressed
backupJournal(indexStagingStorage, backupStorage.asPath());
try (var heartbeat = serviceHeartbeat.createServiceTaskHeartbeat(BackupHeartbeatSteps.class, "Backup")) {
heartbeat.progress(BackupHeartbeatSteps.DOCS);
backupFileCompressed(DOCDB_FILE_NAME, linkdbStagingStorage, backupStorage.asPath());
heartbeat.progress(BackupHeartbeatSteps.LINKS);
backupFileCompressed(DOMAIN_LINKS_FILE_NAME, linkdbStagingStorage, backupStorage.asPath());
heartbeat.progress(BackupHeartbeatSteps.JOURNAL);
// This file format is already compressed
backupJournal(indexStagingStorage, backupStorage.asPath());
heartbeat.progress(BackupHeartbeatSteps.DONE);
}
}
@ -59,8 +85,18 @@ public class BackupService {
var indexStagingStorage = IndexLocations.getIndexConstructionArea(storageService);
var linkdbStagingStorage = IndexLocations.getLinkdbWritePath(storageService);
restoreBackupCompressed("links.db", linkdbStagingStorage, backupStorage);
restoreJournal(indexStagingStorage, backupStorage);
try (var heartbeat = serviceHeartbeat.createServiceTaskHeartbeat(BackupHeartbeatSteps.class, "Restore Backup")) {
heartbeat.progress(BackupHeartbeatSteps.DOCS);
restoreBackupCompressed(DOCDB_FILE_NAME, linkdbStagingStorage, backupStorage);
heartbeat.progress(BackupHeartbeatSteps.LINKS);
restoreBackupCompressed(DOMAIN_LINKS_FILE_NAME, linkdbStagingStorage, backupStorage);
heartbeat.progress(BackupHeartbeatSteps.JOURNAL);
restoreJournal(indexStagingStorage, backupStorage);
heartbeat.progress(BackupHeartbeatSteps.DONE);
}
}

View File

@ -4,17 +4,28 @@ import com.google.inject.AbstractModule;
import com.google.inject.Provides;
import com.google.inject.Singleton;
import com.google.inject.name.Named;
import com.zaxxer.hikari.HikariDataSource;
import nu.marginalia.linkdb.dlinks.DomainLinkDb;
import nu.marginalia.linkdb.dlinks.FileDomainLinkDb;
import nu.marginalia.linkdb.dlinks.SelectingDomainLinkDb;
import nu.marginalia.linkdb.dlinks.SqlDomainLinkDb;
import nu.marginalia.service.module.ServiceConfiguration;
import nu.marginalia.storage.FileStorageService;
import nu.marginalia.IndexLocations;
import nu.marginalia.index.config.RankingSettings;
import nu.marginalia.WmsaHome;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.sql.SQLException;
import static nu.marginalia.linkdb.LinkdbFileNames.*;
public class IndexModule extends AbstractModule {
private static final Logger logger = LoggerFactory.getLogger(IndexModule.class);
public void configure() {
}
@ -25,11 +36,42 @@ public class IndexModule extends AbstractModule {
return RankingSettings.from(dir);
}
@Provides
@Singleton
public DomainLinkDb domainLinkDb (
FileStorageService storageService,
HikariDataSource dataSource,
ServiceConfiguration serviceConfiguration
)
{
Path path = IndexLocations.getLinkdbLivePath(storageService).resolve(DOMAIN_LINKS_FILE_NAME);
return new SelectingDomainLinkDb(path, serviceConfiguration, dataSource);
}
@Provides
@Singleton
@Named("linkdb-file")
public Path linkdbPath(FileStorageService storageService) throws SQLException {
return IndexLocations.getLinkdbLivePath(storageService).resolve("links.db");
@Named("docdb-file")
public Path linkdbPath(FileStorageService storageService) throws IOException {
// Migrate from old location
Path migrationMarker = IndexLocations.getLinkdbLivePath(storageService).resolve("migrated-links.db-to-documents.db");
Path oldPath = IndexLocations.getLinkdbLivePath(storageService).resolve(DEPRECATED_LINKDB_FILE_NAME);
Path newPath = IndexLocations.getLinkdbLivePath(storageService).resolve(DOCDB_FILE_NAME);
if (Files.exists(oldPath) && !Files.exists(newPath) && !Files.exists(migrationMarker)) {
logger.info("Migrating {} to {}", oldPath, newPath);
Files.move(oldPath, newPath);
Files.createFile(migrationMarker);
}
return newPath;
}
@Provides
@Singleton
@Named("domain-linkdb-file")
public Path domainLinkDbFile(FileStorageService storageService) throws SQLException {
return IndexLocations.getLinkdbLivePath(storageService).resolve(DOMAIN_LINKS_FILE_NAME);
}
}

View File

@ -6,12 +6,14 @@ import io.grpc.ServerBuilder;
import io.reactivex.rxjava3.schedulers.Schedulers;
import lombok.SneakyThrows;
import nu.marginalia.IndexLocations;
import nu.marginalia.index.svc.IndexDomainLinksService;
import nu.marginalia.linkdb.dlinks.DomainLinkDb;
import nu.marginalia.storage.FileStorageService;
import nu.marginalia.index.client.IndexMqEndpoints;
import nu.marginalia.index.index.SearchIndex;
import nu.marginalia.index.svc.IndexOpsService;
import nu.marginalia.index.svc.IndexQueryService;
import nu.marginalia.linkdb.LinkdbReader;
import nu.marginalia.linkdb.docs.DocumentDbReader;
import nu.marginalia.model.gson.GsonFactory;
import nu.marginalia.service.control.ServiceEventLog;
import nu.marginalia.service.server.*;
@ -28,6 +30,8 @@ import java.nio.file.Files;
import java.nio.file.Path;
import java.util.concurrent.TimeUnit;
import static nu.marginalia.linkdb.LinkdbFileNames.DOCDB_FILE_NAME;
import static nu.marginalia.linkdb.LinkdbFileNames.DOMAIN_LINKS_FILE_NAME;
import static spark.Spark.get;
public class IndexService extends Service {
@ -38,8 +42,9 @@ public class IndexService extends Service {
private final IndexOpsService opsService;
private final SearchIndex searchIndex;
private final FileStorageService fileStorageService;
private final LinkdbReader linkdbReader;
private final DocumentDbReader documentDbReader;
private final DomainLinkDb domainLinkDb;
private final ServiceEventLog eventLog;
@ -49,14 +54,17 @@ public class IndexService extends Service {
IndexQueryService indexQueryService,
SearchIndex searchIndex,
FileStorageService fileStorageService,
LinkdbReader linkdbReader,
DocumentDbReader documentDbReader,
DomainLinkDb domainLinkDb,
IndexDomainLinksService indexDomainLinksService,
ServiceEventLog eventLog) throws IOException {
super(params);
this.opsService = opsService;
this.searchIndex = searchIndex;
this.fileStorageService = fileStorageService;
this.linkdbReader = linkdbReader;
this.documentDbReader = documentDbReader;
this.domainLinkDb = domainLinkDb;
this.eventLog = eventLog;
final Gson gson = GsonFactory.get();
@ -65,6 +73,7 @@ public class IndexService extends Service {
var grpcServer = ServerBuilder.forPort(params.configuration.port() + 1)
.addService(indexQueryService)
.addService(indexDomainLinksService)
.build();
grpcServer.start();
@ -99,15 +108,24 @@ public class IndexService extends Service {
@SneakyThrows
@MqRequest(endpoint = IndexMqEndpoints.SWITCH_LINKDB)
public void switchLinkdb(String unusedArg) {
logger.info("Switching link database");
logger.info("Switching link databases");
Path newPath = IndexLocations
Path newPathDocs = IndexLocations
.getLinkdbWritePath(fileStorageService)
.resolve("links.db");
.resolve(DOCDB_FILE_NAME);
if (Files.exists(newPath)) {
eventLog.logEvent("INDEX-SWITCH-LINKDB", "");
linkdbReader.switchInput(newPath);
if (Files.exists(newPathDocs)) {
eventLog.logEvent("INDEX-SWITCH-DOCKDB", "");
documentDbReader.switchInput(newPathDocs);
}
Path newPathDomains = IndexLocations
.getLinkdbWritePath(fileStorageService)
.resolve(DOMAIN_LINKS_FILE_NAME);
if (Files.exists(newPathDomains)) {
eventLog.logEvent("INDEX-SWITCH-DOMAIN-LINKDB", "");
domainLinkDb.switchInput(newPathDomains);
}
}

View File

@ -7,8 +7,8 @@ import gnu.trove.list.array.TLongArrayList;
import nu.marginalia.index.client.model.results.DecoratedSearchResultItem;
import nu.marginalia.index.client.model.results.ResultRankingContext;
import nu.marginalia.index.client.model.results.SearchResultItem;
import nu.marginalia.linkdb.LinkdbReader;
import nu.marginalia.linkdb.model.LdbUrlDetail;
import nu.marginalia.linkdb.docs.DocumentDbReader;
import nu.marginalia.linkdb.model.DocdbUrlDetail;
import nu.marginalia.ranking.ResultValuator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -25,13 +25,13 @@ public class IndexResultDecorator {
private static final Logger logger = LoggerFactory.getLogger(IndexResultDecorator.class);
private final LinkdbReader linkdbReader;
private final DocumentDbReader documentDbReader;
private final ResultValuator valuator;
@Inject
public IndexResultDecorator(LinkdbReader linkdbReader,
public IndexResultDecorator(DocumentDbReader documentDbReader,
ResultValuator valuator) {
this.linkdbReader = linkdbReader;
this.documentDbReader = documentDbReader;
this.valuator = valuator;
}
@ -46,9 +46,9 @@ public class IndexResultDecorator {
for (var result : rawResults)
idsList.add(result.getDocumentId());
Map<Long, LdbUrlDetail> urlDetailsById = new HashMap<>(rawResults.size());
Map<Long, DocdbUrlDetail> urlDetailsById = new HashMap<>(rawResults.size());
for (var item : linkdbReader.getUrlDetails(idsList))
for (var item : documentDbReader.getUrlDetails(idsList))
urlDetailsById.put(item.urlId(), item);
List<DecoratedSearchResultItem> decoratedItems = new ArrayList<>();
@ -63,7 +63,7 @@ public class IndexResultDecorator {
}
private DecoratedSearchResultItem createCombinedItem(SearchResultItem result,
LdbUrlDetail linkData,
DocdbUrlDetail linkData,
ResultRankingContext rankingContext) {
return new DecoratedSearchResultItem(
result,

View File

@ -0,0 +1,102 @@
package nu.marginalia.index.svc;

import com.google.inject.Inject;
import io.grpc.stub.StreamObserver;
import nu.marginalia.index.api.*;
import nu.marginalia.linkdb.dlinks.DomainLinkDb;

/** GRPC service for interrogating domain links.
 * <p>
 * Thin adapter between the generated {@code IndexDomainLinksApi} service stubs
 * and the backing {@link DomainLinkDb}.  All answers are produced synchronously
 * from the link database; only {@code getAllLinks} streams its response.
 */
public class IndexDomainLinksService extends IndexDomainLinksApiGrpc.IndexDomainLinksApiImplBase {
    private final DomainLinkDb domainLinkDb;

    @Inject
    public IndexDomainLinksService(DomainLinkDb domainLinkDb) {
        this.domainLinkDb = domainLinkDb;
    }

    /** Streams every (source, dest) domain-id link pair, batched into
     *  fixed-size {@link RpcDomainIdPairs} messages to bound message size. */
    @Override
    public void getAllLinks(Empty request,
                            StreamObserver<RpcDomainIdPairs> responseObserver) {
        // try-with-resources ensures the final partial batch is flushed
        try (var idsConverter = new AllIdsResponseConverter(responseObserver)) {
            domainLinkDb.forEach(idsConverter::accept);
        }

        responseObserver.onCompleted();
    }

    /** Accumulates link pairs and emits them to the observer in batches.
     *  Closing flushes any remaining partial batch (but does not complete
     *  the stream; the caller owns stream lifecycle). */
    private static class AllIdsResponseConverter implements AutoCloseable {
        /** Number of pairs per streamed message. */
        private static final int BATCH_SIZE = 1000;

        private RpcDomainIdPairs.Builder builder;
        private final StreamObserver<RpcDomainIdPairs> responseObserver;
        private int n = 0;

        private AllIdsResponseConverter(StreamObserver<RpcDomainIdPairs> responseObserver) {
            this.responseObserver = responseObserver;
            this.builder = RpcDomainIdPairs.newBuilder();
        }

        public void accept(int source, int dest) {
            builder.addSourceIds(source);
            builder.addDestIds(dest);

            // Flush once a full batch has been accumulated
            if (++n >= BATCH_SIZE) {
                responseObserver.onNext(builder.build());
                builder = RpcDomainIdPairs.newBuilder();
                n = 0;
            }
        }

        @Override
        public void close() {
            if (n > 0) {
                responseObserver.onNext(builder.build());
            }
        }
    }

    /** Returns the ids of all domains the requested domain links to. */
    @Override
    public void getLinksFromDomain(RpcDomainId request,
                                   StreamObserver<RpcDomainIdList> responseObserver) {
        var links = domainLinkDb.findDestinations(request.getDomainId());

        var rspBuilder = RpcDomainIdList.newBuilder();
        for (int i = 0; i < links.size(); i++) {
            rspBuilder.addDomainId(links.get(i));
        }
        responseObserver.onNext(rspBuilder.build());

        responseObserver.onCompleted();
    }

    /** Returns the ids of all domains that link to the requested domain. */
    @Override
    public void getLinksToDomain(RpcDomainId request,
                                 StreamObserver<RpcDomainIdList> responseObserver) {
        var links = domainLinkDb.findSources(request.getDomainId());

        var rspBuilder = RpcDomainIdList.newBuilder();
        for (int i = 0; i < links.size(); i++) {
            rspBuilder.addDomainId(links.get(i));
        }
        responseObserver.onNext(rspBuilder.build());

        responseObserver.onCompleted();
    }

    /** Returns the number of outgoing links from the requested domain. */
    @Override
    public void countLinksFromDomain(RpcDomainId request,
                                     StreamObserver<RpcDomainIdCount> responseObserver) {
        responseObserver.onNext(RpcDomainIdCount.newBuilder()
                .setIdCount(domainLinkDb.countDestinations(request.getDomainId()))
                .build());
        responseObserver.onCompleted();
    }

    /** Returns the number of incoming links to the requested domain. */
    @Override
    public void countLinksToDomain(RpcDomainId request,
                                   StreamObserver<RpcDomainIdCount> responseObserver) {
        responseObserver.onNext(RpcDomainIdCount.newBuilder()
                .setIdCount(domainLinkDb.countSources(request.getDomainId()))
                .build());
        responseObserver.onCompleted();
    }
}

View File

@ -59,6 +59,7 @@ public class IndexOpsService {
public <T> Optional<T> run(Callable<T> c) throws Exception {
if (!opsLock.tryLock())
return Optional.empty();
try {
return Optional.of(c.call());
}

View File

@ -24,9 +24,9 @@ import nu.marginalia.index.journal.writer.IndexJournalWriter;
import nu.marginalia.index.query.limit.QueryLimits;
import nu.marginalia.index.query.limit.QueryStrategy;
import nu.marginalia.index.query.limit.SpecificationLimit;
import nu.marginalia.linkdb.LinkdbReader;
import nu.marginalia.linkdb.LinkdbWriter;
import nu.marginalia.linkdb.model.LdbUrlDetail;
import nu.marginalia.linkdb.docs.DocumentDbReader;
import nu.marginalia.linkdb.docs.DocumentDbWriter;
import nu.marginalia.linkdb.model.DocdbUrlDetail;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.id.UrlIdCodec;
import nu.marginalia.model.idx.WordFlags;
@ -51,6 +51,7 @@ import java.sql.SQLException;
import java.util.*;
import java.util.stream.IntStream;
import static nu.marginalia.linkdb.LinkdbFileNames.DOCDB_FILE_NAME;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.parallel.ExecutionMode.SAME_THREAD;
@ -80,7 +81,7 @@ public class IndexQueryServiceIntegrationSmokeTest {
DomainRankings domainRankings;
@Inject
LinkdbReader linkdbReader;
DocumentDbReader documentDbReader;
@Inject
ProcessHeartbeat processHeartbeat;
@ -103,15 +104,15 @@ public class IndexQueryServiceIntegrationSmokeTest {
@Test
public void willItBlend() throws Exception {
var linkdbWriter = new LinkdbWriter(
var linkdbWriter = new DocumentDbWriter(
IndexLocations.getLinkdbLivePath(fileStorageService)
.resolve("links.db")
.resolve(DOCDB_FILE_NAME)
);
for (int i = 1; i < 512; i++) {
loadData(linkdbWriter, i);
}
linkdbWriter.close();
linkdbReader.reconnect();
documentDbReader.reconnect();
indexJournalWriter.close();
constructIndex();
@ -146,15 +147,15 @@ public class IndexQueryServiceIntegrationSmokeTest {
@Test
public void testDomainQuery() throws Exception {
var linkdbWriter = new LinkdbWriter(
var linkdbWriter = new DocumentDbWriter(
IndexLocations.getLinkdbLivePath(fileStorageService)
.resolve("links.db")
.resolve(DOCDB_FILE_NAME)
);
for (int i = 1; i < 512; i++) {
loadDataWithDomain(linkdbWriter, i/100, i);
}
linkdbWriter.close();
linkdbReader.reconnect();
documentDbReader.reconnect();
indexJournalWriter.close();
constructIndex();
@ -183,15 +184,15 @@ public class IndexQueryServiceIntegrationSmokeTest {
@Test
public void testYearQuery() throws Exception {
var linkdbWriter = new LinkdbWriter(
var linkdbWriter = new DocumentDbWriter(
IndexLocations.getLinkdbLivePath(fileStorageService)
.resolve("links.db")
.resolve(DOCDB_FILE_NAME)
);
for (int i = 1; i < 512; i++) {
loadData(linkdbWriter, i);
}
linkdbWriter.close();
linkdbReader.reconnect();
documentDbReader.reconnect();
indexJournalWriter.close();
constructIndex();
@ -283,7 +284,7 @@ public class IndexQueryServiceIntegrationSmokeTest {
MurmurHash3_128 hasher = new MurmurHash3_128();
@SneakyThrows
public void loadData(LinkdbWriter ldbw, int id) {
public void loadData(DocumentDbWriter ldbw, int id) {
int[] factors = IntStream
.rangeClosed(1, id)
.filter(v -> (id % v) == 0)
@ -299,7 +300,7 @@ public class IndexQueryServiceIntegrationSmokeTest {
data[2 * i + 1] = new WordMetadata(i, EnumSet.of(WordFlags.Title)).encode();
}
ldbw.add(new LdbUrlDetail(
ldbw.add(new DocdbUrlDetail(
fullId, new EdgeUrl("https://www.example.com/"+id),
"test", "test", 0., "HTML5", 0, null, 0, 10
));
@ -308,7 +309,7 @@ public class IndexQueryServiceIntegrationSmokeTest {
}
@SneakyThrows
public void loadDataWithDomain(LinkdbWriter ldbw, int domain, int id) {
public void loadDataWithDomain(DocumentDbWriter ldbw, int domain, int id) {
int[] factors = IntStream.rangeClosed(1, id).filter(v -> (id % v) == 0).toArray();
long fullId = UrlIdCodec.encodeId(domain, id);
var header = new IndexJournalEntryHeader(factors.length, 0, fullId, DocumentMetadata.defaultValue());
@ -319,7 +320,7 @@ public class IndexQueryServiceIntegrationSmokeTest {
data[2*i + 1] = new WordMetadata(i, EnumSet.of(WordFlags.Title)).encode();
}
ldbw.add(new LdbUrlDetail(
ldbw.add(new DocdbUrlDetail(
fullId, new EdgeUrl("https://www.example.com/"+id),
"test", "test", 0., "HTML5", 0, null, 0, 10
));

View File

@ -23,9 +23,9 @@ import nu.marginalia.index.journal.writer.IndexJournalWriter;
import nu.marginalia.index.query.limit.QueryLimits;
import nu.marginalia.index.query.limit.QueryStrategy;
import nu.marginalia.index.query.limit.SpecificationLimit;
import nu.marginalia.linkdb.LinkdbReader;
import nu.marginalia.linkdb.LinkdbWriter;
import nu.marginalia.linkdb.model.LdbUrlDetail;
import nu.marginalia.linkdb.docs.DocumentDbReader;
import nu.marginalia.linkdb.docs.DocumentDbWriter;
import nu.marginalia.linkdb.model.DocdbUrlDetail;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.model.id.UrlIdCodec;
@ -53,6 +53,7 @@ import java.sql.SQLException;
import java.util.*;
import java.util.function.Function;
import static nu.marginalia.linkdb.LinkdbFileNames.DOCDB_FILE_NAME;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.parallel.ExecutionMode.SAME_THREAD;
@ -84,7 +85,7 @@ public class IndexQueryServiceIntegrationTest {
@Inject
ProcessHeartbeat processHeartbeat;
@Inject
LinkdbReader linkdbReader;
DocumentDbReader documentDbReader;
@BeforeEach
public void setUp() throws IOException {
@ -566,11 +567,11 @@ public class IndexQueryServiceIntegrationTest {
indexJournalWriter.put(header, entry);
});
var linkdbWriter = new LinkdbWriter(
IndexLocations.getLinkdbLivePath(fileStorageService).resolve("links.db")
var linkdbWriter = new DocumentDbWriter(
IndexLocations.getLinkdbLivePath(fileStorageService).resolve(DOCDB_FILE_NAME)
);
for (Long key : allData.keySet()) {
linkdbWriter.add(new LdbUrlDetail(
linkdbWriter.add(new DocdbUrlDetail(
key,
new EdgeUrl("https://www.example.com"),
"test",
@ -587,7 +588,7 @@ public class IndexQueryServiceIntegrationTest {
indexJournalWriter.close();
constructIndex();
linkdbReader.reconnect();
documentDbReader.reconnect();
searchIndex.switchIndex();
}
}

View File

@ -7,7 +7,7 @@ import nu.marginalia.storage.model.FileStorageBase;
import nu.marginalia.storage.model.FileStorageBaseType;
import nu.marginalia.index.journal.writer.IndexJournalWriter;
import nu.marginalia.index.journal.writer.IndexJournalWriterPagingImpl;
import nu.marginalia.linkdb.LinkdbReader;
import nu.marginalia.linkdb.docs.DocumentDbReader;
import nu.marginalia.process.control.FakeProcessHeartbeat;
import nu.marginalia.process.control.ProcessHeartbeat;
import nu.marginalia.ranking.DomainRankings;
@ -26,6 +26,7 @@ import java.sql.SQLException;
import java.util.Random;
import java.util.UUID;
import static nu.marginalia.linkdb.LinkdbFileNames.DOCDB_FILE_NAME;
import static org.mockito.Mockito.when;
public class IndexQueryServiceIntegrationTestModule extends AbstractModule {
@ -57,9 +58,9 @@ public class IndexQueryServiceIntegrationTestModule extends AbstractModule {
Mockito.when(fileStorageServiceMock.getStorageBase(FileStorageBaseType.CURRENT)).thenReturn(new FileStorageBase(null, null, 0,null, fastDir.toString()));
Mockito.when(fileStorageServiceMock.getStorageBase(FileStorageBaseType.STORAGE)).thenReturn(new FileStorageBase(null, null, 0, null, fastDir.toString()));
bind(LinkdbReader.class).toInstance(new LinkdbReader(
bind(DocumentDbReader.class).toInstance(new DocumentDbReader(
IndexLocations.getLinkdbLivePath(fileStorageServiceMock)
.resolve("links.db")
.resolve(DOCDB_FILE_NAME)
));
bind(FileStorageService.class).toInstance(fileStorageServiceMock);

View File

@ -0,0 +1,96 @@
package nu.marginalia.query;
import com.google.inject.Inject;
import io.grpc.ManagedChannel;
import io.grpc.stub.StreamObserver;
import nu.marginalia.index.api.IndexDomainLinksApiGrpc;
import nu.marginalia.index.api.RpcDomainIdCount;
import nu.marginalia.index.api.RpcDomainIdList;
import nu.marginalia.index.api.RpcDomainIdPairs;
import nu.marginalia.query.svc.NodeConfigurationWatcher;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Query-service-side gRPC facade for the domain links API.
 * <p>
 * Fans each request out to every configured index node via a {@link QueryGrpcStubPool}
 * and aggregates the per-node results into a single response: streamed pairs are
 * forwarded as-is, id lists are concatenated, and counts are summed.
 */
public class QueryGRPCDomainLinksService extends IndexDomainLinksApiGrpc.IndexDomainLinksApiImplBase {
    private static final Logger logger = LoggerFactory.getLogger(QueryGRPCDomainLinksService.class);

    /** Pool of per-node blocking stubs for the downstream index services. */
    private final QueryGrpcStubPool<IndexDomainLinksApiGrpc.IndexDomainLinksApiBlockingStub> stubPool;

    @Inject
    public QueryGRPCDomainLinksService(NodeConfigurationWatcher nodeConfigurationWatcher) {
        // The watcher is only needed by the stub pool, which tracks node
        // configuration changes on our behalf; no field is kept here.
        stubPool = new QueryGrpcStubPool<>(nodeConfigurationWatcher) {
            @Override
            public IndexDomainLinksApiGrpc.IndexDomainLinksApiBlockingStub createStub(ManagedChannel channel) {
                return IndexDomainLinksApiGrpc.newBlockingStub(channel);
            }
        };
    }

    /** Streams all (source, dest) domain id pairs from every node to the client. */
    @Override
    public void getAllLinks(nu.marginalia.index.api.Empty request,
                            StreamObserver<RpcDomainIdPairs> responseObserver) {
        // Each node returns a stream of batches; forward every batch directly
        // to the caller rather than materializing them in memory.
        stubPool.callEachSequential(stub -> stub.getAllLinks(request))
                .forEach(
                        iter -> iter.forEachRemaining(responseObserver::onNext)
                );
        responseObserver.onCompleted();
    }

    /** Returns the union of outbound link destination ids across all nodes. */
    @Override
    public void getLinksFromDomain(nu.marginalia.index.api.RpcDomainId request,
                                   StreamObserver<RpcDomainIdList> responseObserver) {
        var rspBuilder = RpcDomainIdList.newBuilder();

        stubPool.callEachSequential(stub -> stub.getLinksFromDomain(request))
                .map(RpcDomainIdList::getDomainIdList)
                .forEach(rspBuilder::addAllDomainId);

        responseObserver.onNext(rspBuilder.build());
        responseObserver.onCompleted();
    }

    /** Returns the union of inbound link source ids across all nodes. */
    @Override
    public void getLinksToDomain(nu.marginalia.index.api.RpcDomainId request,
                                 StreamObserver<RpcDomainIdList> responseObserver) {
        var rspBuilder = RpcDomainIdList.newBuilder();

        stubPool.callEachSequential(stub -> stub.getLinksToDomain(request))
                .map(RpcDomainIdList::getDomainIdList)
                .forEach(rspBuilder::addAllDomainId);

        responseObserver.onNext(rspBuilder.build());
        responseObserver.onCompleted();
    }

    /** Returns the total outbound link count, summed over all nodes. */
    @Override
    public void countLinksFromDomain(nu.marginalia.index.api.RpcDomainId request,
                                     StreamObserver<RpcDomainIdCount> responseObserver) {
        int sum = stubPool.callEachSequential(stub -> stub.countLinksFromDomain(request))
                .mapToInt(RpcDomainIdCount::getIdCount)
                .sum();

        var rspBuilder = RpcDomainIdCount.newBuilder();
        rspBuilder.setIdCount(sum);
        responseObserver.onNext(rspBuilder.build());
        responseObserver.onCompleted();
    }

    /** Returns the total inbound link count, summed over all nodes. */
    @Override
    public void countLinksToDomain(nu.marginalia.index.api.RpcDomainId request,
                                   io.grpc.stub.StreamObserver<nu.marginalia.index.api.RpcDomainIdCount> responseObserver) {
        int sum = stubPool.callEachSequential(stub -> stub.countLinksToDomain(request))
                .mapToInt(RpcDomainIdCount::getIdCount)
                .sum();

        var rspBuilder = RpcDomainIdCount.newBuilder();
        rspBuilder.setIdCount(sum);
        responseObserver.onNext(rspBuilder.build());
        responseObserver.onCompleted();
    }
}

View File

@ -2,7 +2,6 @@ package nu.marginalia.query;
import com.google.inject.Inject;
import io.grpc.ManagedChannel;
import io.grpc.ManagedChannelBuilder;
import io.prometheus.client.Histogram;
import lombok.SneakyThrows;
import nu.marginalia.db.DomainBlacklist;
@ -10,7 +9,6 @@ import nu.marginalia.index.api.*;
import nu.marginalia.model.id.UrlIdCodec;
import nu.marginalia.query.svc.NodeConfigurationWatcher;
import nu.marginalia.query.svc.QueryFactory;
import org.jetbrains.annotations.NotNull;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -28,32 +26,7 @@ public class QueryGRPCService extends QueryApiGrpc.QueryApiImplBase {
.help("QS-side query time (GRPC endpoint)")
.register();
private final Map<ServiceAndNode, ManagedChannel> channels
= new ConcurrentHashMap<>();
private final Map<ServiceAndNode, IndexApiGrpc.IndexApiBlockingStub> actorRpcApis
= new ConcurrentHashMap<>();
private ManagedChannel getChannel(ServiceAndNode serviceAndNode) {
return channels.computeIfAbsent(serviceAndNode,
san -> ManagedChannelBuilder
.forAddress(serviceAndNode.getHostName(), 81)
.usePlaintext()
.build());
}
public IndexApiGrpc.IndexApiBlockingStub indexApi(int node) {
return actorRpcApis.computeIfAbsent(new ServiceAndNode("index-service", node), n ->
IndexApiGrpc.newBlockingStub(
getChannel(n)
)
);
}
record ServiceAndNode(String service, int node) {
public String getHostName() {
return service+"-"+node;
}
}
private final QueryGrpcStubPool<IndexApiGrpc.IndexApiBlockingStub> stubPool;
private final QueryFactory queryFactory;
private final DomainBlacklist blacklist;
@ -64,6 +37,13 @@ public class QueryGRPCService extends QueryApiGrpc.QueryApiImplBase {
this.queryFactory = queryFactory;
this.blacklist = blacklist;
this.nodeConfigurationWatcher = nodeConfigurationWatcher;
stubPool = new QueryGrpcStubPool<>(nodeConfigurationWatcher) {
@Override
public IndexApiGrpc.IndexApiBlockingStub createStub(ManagedChannel channel) {
return IndexApiGrpc.newBlockingStub(channel);
}
};
}
public void query(nu.marginalia.index.api.RpcQsQuery request,
@ -89,7 +69,6 @@ public class QueryGRPCService extends QueryApiGrpc.QueryApiImplBase {
responseBuilder.setDomain(query.domain);
responseObserver.onNext(responseBuilder.build());
responseObserver.onCompleted();
});
} catch (Exception e) {
@ -98,16 +77,13 @@ public class QueryGRPCService extends QueryApiGrpc.QueryApiImplBase {
}
}
private final ExecutorService es = Executors.newVirtualThreadPerTaskExecutor();
private static final Comparator<RpcDecoratedResultItem> comparator =
Comparator.comparing(RpcDecoratedResultItem::getRankingScore);
@SneakyThrows
private List<RpcDecoratedResultItem> executeQueries(RpcIndexQuery indexRequest, int totalSize) {
List<Callable<List<RpcDecoratedResultItem>>> tasks = createTasks(indexRequest);
return es.invokeAll(tasks).stream()
return stubPool.invokeAll(stub -> new QueryTask(stub, indexRequest))
.stream()
.filter(f -> f.state() == Future.State.SUCCESS)
.map(Future::resultNow)
.flatMap(List::stream)
@ -116,26 +92,30 @@ public class QueryGRPCService extends QueryApiGrpc.QueryApiImplBase {
.toList();
}
@NotNull
private List<Callable<List<RpcDecoratedResultItem>>> createTasks(RpcIndexQuery indexRequest) {
List<Callable<List<RpcDecoratedResultItem>>> tasks = new ArrayList<>();
private class QueryTask implements Callable<List<RpcDecoratedResultItem>> {
private final IndexApiGrpc.IndexApiBlockingStub stub;
private final RpcIndexQuery indexRequest;
for (var node : nodeConfigurationWatcher.getQueryNodes()) {
tasks.add(() -> {
var responseIter = indexApi(node).query(indexRequest);
var ret = new ArrayList<RpcDecoratedResultItem>();
while (responseIter.hasNext()) {
RpcDecoratedResultItem next = responseIter.next();
if (isBlacklisted(next))
continue;
ret.add(next);
}
return ret;
});
public QueryTask(IndexApiGrpc.IndexApiBlockingStub stub, RpcIndexQuery indexRequest) {
this.stub = stub;
this.indexRequest = indexRequest;
}
return tasks;
}
@Override
public List<RpcDecoratedResultItem> call() {
var rsp = stub.query(indexRequest);
List<RpcDecoratedResultItem> ret = new ArrayList<>();
while (rsp.hasNext()) {
RpcDecoratedResultItem next = rsp.next();
if (isBlacklisted(next))
continue;
ret.add(next);
}
return ret;
}
}
private boolean isBlacklisted(RpcDecoratedResultItem item) {
return blacklist.isBlacklisted(UrlIdCodec.getDomainId(item.getRawItem().getCombinedId()));

View File

@ -0,0 +1,64 @@
package nu.marginalia.query;
import io.grpc.ManagedChannel;
import io.grpc.ManagedChannelBuilder;
import nu.marginalia.query.svc.NodeConfigurationWatcher;
import java.util.List;
import java.util.Map;
import java.util.concurrent.*;
import java.util.function.Function;
import java.util.stream.Stream;
/**
 * Lazily-populated pool of gRPC channels and stubs, one per index-service node.
 * <p>
 * Subclasses supply the stub construction via {@link #createStub(ManagedChannel)};
 * the pool handles channel creation, caching, and fan-out over the set of query
 * nodes reported by the {@link NodeConfigurationWatcher}.
 */
public abstract class QueryGrpcStubPool<STUB> {
    protected record ServiceAndNode(String service, int node) {
        public String getHostName() {
            return service + "-" + node;
        }
    }

    private final NodeConfigurationWatcher nodeWatcher;

    // Channels and stubs are cached per node; entries are never evicted.
    private final Map<ServiceAndNode, ManagedChannel> channelCache = new ConcurrentHashMap<>();
    private final Map<ServiceAndNode, STUB> stubCache = new ConcurrentHashMap<>();

    private final ExecutorService taskRunner = Executors.newVirtualThreadPerTaskExecutor();

    QueryGrpcStubPool(NodeConfigurationWatcher nodeWatcher) {
        this.nodeWatcher = nodeWatcher;
    }

    /** Get an API stub for the given node */
    public STUB indexApi(int node) {
        var key = new ServiceAndNode("index-service", node);
        var channel = channelCache.computeIfAbsent(key, this::createChannel);
        return stubCache.computeIfAbsent(key, ignored -> createStub(channel));
    }

    protected ManagedChannel createChannel(ServiceAndNode serviceAndNode) {
        return ManagedChannelBuilder
                .forAddress(serviceAndNode.getHostName(), 81)
                .usePlaintext()
                .build();
    }

    /** Invoke a function on each node, returning a list of futures in a terminal state, as per
     * ExecutorService$invokeAll */
    public <T> List<Future<T>> invokeAll(Function<STUB, Callable<T>> callF) throws InterruptedException {
        var tasks = nodeWatcher.getQueryNodes()
                .stream()
                .map(node -> callF.apply(indexApi(node)))
                .toList();

        return taskRunner.invokeAll(tasks);
    }

    /** Invoke a function on each node, returning a stream of results */
    public <T> Stream<T> callEachSequential(Function<STUB, T> call) {
        return nodeWatcher.getQueryNodes()
                .stream()
                .map(node -> call.apply(indexApi(node)));
    }

    /** Create a stub for the given channel, this is an operation
     * that needs to be implemented for the particular API this
     * pool is intended for
     */
    public abstract STUB createStub(ManagedChannel channel);
}

View File

@ -42,6 +42,7 @@ public class QueryService extends Service {
public QueryService(BaseServiceParams params,
IndexClient indexClient,
NodeConfigurationWatcher nodeWatcher,
QueryGRPCDomainLinksService domainLinksService,
QueryGRPCService queryGRPCService,
Gson gson,
DomainBlacklist blacklist,
@ -55,6 +56,7 @@ public class QueryService extends Service {
var grpcServer = ServerBuilder.forPort(params.configuration.port() + 1)
.addService(queryGRPCService)
.addService(domainLinksService)
.build();
grpcServer.start();

View File

@ -60,7 +60,7 @@ public class CrawlDataUnfcker {
return Optional.empty();
}
try (var stream = CrawledDomainReader.createDataStream(file)) {
try (var stream = CrawledDomainReader.createDataStream(CrawledDomainReader.CompatibilityLevel.FAST, file)) {
while (stream.hasNext()) {
if (stream.next() instanceof CrawledDomain domain) {
return Optional.of(domain);

View File

@ -4,13 +4,9 @@ import com.google.inject.Guice;
import com.google.inject.Injector;
import nu.marginalia.converting.ConverterModule;
import nu.marginalia.crawling.io.CrawledDomainReader;
import nu.marginalia.crawling.model.CrawledDocument;
import nu.marginalia.crawling.model.CrawledDomain;
import nu.marginalia.crawling.model.SerializableCrawlData;
import nu.marginalia.process.log.WorkLog;
import nu.marginalia.service.module.DatabaseModule;
import nu.marginalia.tools.experiments.*;
import plan.CrawlPlanLoader;
import java.io.IOException;
import java.nio.file.Path;
@ -52,7 +48,7 @@ public class ExperimentRunnerMain {
Path basePath = Path.of(args[0]);
for (var item : WorkLog.iterable(basePath.resolve("crawler.log"))) {
Path crawlDataPath = basePath.resolve(item.relPath());
try (var stream = CrawledDomainReader.createDataStream(crawlDataPath)) {
try (var stream = CrawledDomainReader.createDataStream(CrawledDomainReader.CompatibilityLevel.FAST, crawlDataPath)) {
experiment.process(stream);
}
catch (Exception ex) {

View File

@ -22,7 +22,7 @@ public class SiteStatisticsExperiment extends Experiment {
@Override
public boolean process(SerializableCrawlDataStream stream) {
var ret = domainProcessor.process(stream);
var ret = domainProcessor.fullProcessing(stream);
ret.documents.stream()
.filter(ProcessedDocument::isProcessedFully)