2022-11 release (#133)

Co-authored-by: vlofgren <vlofgren@gmail.com>
Co-authored-by: vlofgren <vlofgren@marginalia.nu>
Co-authored-by: Viktor Lofgren <vlofgren@marginalia.nu>
Reviewed-on: https://git.marginalia.nu/marginalia/marginalia.nu/pulls/133
Viktor Lofgren 2023-01-08 11:13:39 +01:00
parent 06299cd554
commit 6b44786649
302 changed files with 10365 additions and 7577 deletions

View File

@@ -12,10 +12,16 @@ The canonical git server for this project is [https://git.marginalia.nu](https://git.marginalia.nu)
It is fine to mirror it on other hosts, but if you have issues or questions
git.marginalia.nu is where you want to go.
As it stands now, the project is still being set up and is a bit of a mess, as
it wasn't developed with the intention of going open source: a lot of tests
make assumptions about the directory structure, much of the configuration is
hard-coded, and so on. Please stand by; a lot of the mess is fairly superficial.
## Important note about wmsa.local
This project has a [sister repository called wmsa.local](https://git.marginalia.nu/marginalia/wmsa.local)
that contains scripts and configuration files for running and developing the code.
Without it, development is very unpleasant.
While developing the code, you will want an environment variable WMSA_HOME pointing to
the directory in which wmsa.local is checked out; otherwise the code will not run and
several tests will fail.
## Documentation

View File

@@ -56,19 +56,7 @@ test {
forkEvery = 1
maxHeapSize = "8G"
useJUnitPlatform {
excludeTags "db"
excludeTags "nobuild"
}
}
task dbTest(type: Test) {
maxParallelForks = 1
forkEvery = 1
maxHeapSize = "8G"
useJUnitPlatform {
includeTags "db"
}
}

View File

@@ -1,5 +1,5 @@
distributionBase=GRADLE_USER_HOME
distributionPath=wrapper/dists
distributionUrl=https\://services.gradle.org/distributions/gradle-7.4-bin.zip
distributionUrl=https\://services.gradle.org/distributions/gradle-7.5-bin.zip
zipStoreBase=GRADLE_USER_HOME
zipStorePath=wrapper/dists

View File

@@ -4,6 +4,8 @@ plugins {
id "me.champeau.jmh" version "0.6.6"
id "de.undercouch.download" version "5.1.0"
id 'jvm-test-suite'
}
repositories {
@@ -63,22 +65,20 @@ dependencies {
implementation 'org.projectlombok:lombok:1.18.24'
annotationProcessor 'org.projectlombok:lombok:1.18.24'
implementation 'com.github.jknack:handlebars:4.3.0'
implementation 'com.github.jknack:handlebars:4.3.1'
implementation 'com.github.jknack:handlebars-markdown:4.2.1'
implementation group: 'com.google.code.gson', name: 'gson', version: '2.9.0'
implementation 'io.reactivex.rxjava3:rxjava:3.1.4'
implementation 'io.reactivex.rxjava3:rxjava:3.1.5'
implementation "com.sparkjava:spark-core:2.9.3"
implementation 'com.opencsv:opencsv:5.6'
implementation group: 'org.apache.logging.log4j', name: 'log4j-api', version: '2.17.2'
implementation group: 'org.apache.logging.log4j', name: 'log4j-core', version: '2.17.2'
implementation group: 'org.apache.logging.log4j', name: 'log4j-slf4j-impl', version: '2.17.2'
implementation 'org.slf4j:slf4j-api:1.7.36'
testImplementation 'org.slf4j:slf4j-jdk14:2.0.3'
implementation 'com.google.guava:guava:31.1-jre'
implementation 'com.google.inject:guice:5.1.0'
@@ -89,19 +89,19 @@ dependencies {
implementation group: 'com.h2database', name: 'h2', version: '2.1.210'
implementation 'org.jsoup:jsoup:1.14.3'
implementation 'org.jsoup:jsoup:1.15.3'
implementation group: 'com.github.crawler-commons', name: 'crawler-commons', version: '1.2'
implementation 'org.mariadb.jdbc:mariadb-java-client:3.0.4'
implementation 'org.mariadb.jdbc:mariadb-java-client:3.0.6'
implementation group: 'net.sf.trove4j', name: 'trove4j', version: '3.0.3'
implementation 'com.zaxxer:HikariCP:5.0.1'
implementation 'org.apache.opennlp:opennlp-tools:1.9.4'
implementation 'io.prometheus:simpleclient:0.15.0'
implementation 'io.prometheus:simpleclient_servlet:0.15.0'
implementation 'io.prometheus:simpleclient_httpserver:0.15.0'
implementation 'io.prometheus:simpleclient_hotspot:0.15.0'
implementation 'io.prometheus:simpleclient:0.16.0'
implementation 'io.prometheus:simpleclient_servlet:0.16.0'
implementation 'io.prometheus:simpleclient_httpserver:0.16.0'
implementation 'io.prometheus:simpleclient_hotspot:0.16.0'
implementation 'com.fasterxml.jackson.core:jackson-databind:2.13.3'
implementation group: 'org.yaml', name: 'snakeyaml', version: '1.30'
@@ -114,7 +114,7 @@ dependencies {
implementation 'org.imgscalr:imgscalr-lib:4.2'
implementation 'org.jclarion:image4j:0.7'
implementation 'commons-net:commons-net:3.6'
implementation 'commons-net:commons-net:3.8.0'
implementation 'org.eclipse.jgit:org.eclipse.jgit:5.12.0.202106070339-r'
implementation 'org.eclipse.jgit:org.eclipse.jgit.ssh.jsch:5.12.0.202106070339-r'
implementation 'com.jcraft:jsch:0.1.55'
@@ -123,12 +123,14 @@ dependencies {
implementation 'edu.stanford.nlp:stanford-corenlp:4.4.0'
implementation group: 'it.unimi.dsi', name: 'fastutil', version: '8.5.8'
implementation 'org.roaringbitmap:RoaringBitmap:0.9.27'
implementation 'org.roaringbitmap:RoaringBitmap:0.9.32'
implementation group: 'mysql', name: 'mysql-connector-java', version: '8.0.29'
implementation 'com.github.Marcono1234:gson-record-type-adapter-factory:0.2.0'
testImplementation 'org.junit.jupiter:junit-jupiter-api:5.8.2'
testImplementation 'org.mockito:mockito-junit-jupiter:4.5.1'
testRuntimeOnly 'org.junit.jupiter:junit-jupiter-engine'
testCompileOnly 'org.projectlombok:lombok:1.18.24'
testImplementation 'org.projectlombok:lombok:1.18.24'
@@ -136,23 +138,23 @@ dependencies {
testImplementation group: 'org.mockito', name: 'mockito-core', version: '4.5.1'
testImplementation platform('org.testcontainers:testcontainers-bom:1.17.2')
testImplementation 'org.testcontainers:mariadb:1.17.2'
testImplementation "org.testcontainers:junit-jupiter:1.17.2"
testImplementation platform('org.testcontainers:testcontainers-bom:1.17.4')
testImplementation 'org.testcontainers:mariadb:1.17.4'
testImplementation 'org.testcontainers:junit-jupiter:1.17.4'
e2eTestImplementation 'org.junit.jupiter:junit-jupiter-api:5.8.2'
e2eTestImplementation 'org.junit.jupiter:junit-jupiter-api:5.9.0'
e2eTestRuntimeOnly 'org.junit.jupiter:junit-jupiter-engine'
e2eTestImplementation 'org.projectlombok:lombok:1.18.24'
e2eTestAnnotationProcessor 'org.projectlombok:lombok:1.18.24'
e2eTestImplementation 'org.testcontainers:nginx:1.17.3'
e2eTestImplementation 'org.testcontainers:nginx:1.17.4'
e2eTestImplementation "org.testcontainers:junit-jupiter:1.17.2"
e2eTestImplementation 'org.testcontainers:selenium:1.17.3'
e2eTestImplementation 'org.seleniumhq.selenium:selenium-remote-driver:4.2.1'
e2eTestImplementation 'org.seleniumhq.selenium:selenium-chrome-driver:4.2.1'
e2eTestImplementation 'org.testcontainers:selenium:1.17.4'
e2eTestImplementation 'org.seleniumhq.selenium:selenium-remote-driver:4.5.3'
e2eTestImplementation 'org.seleniumhq.selenium:selenium-chrome-driver:4.5.3'
implementation 'org.seleniumhq.selenium:selenium-chrome-driver:4.1.4'
implementation 'org.seleniumhq.selenium:selenium-java:4.3.0'
implementation 'org.seleniumhq.selenium:selenium-chrome-driver:4.5.3'
implementation 'org.seleniumhq.selenium:selenium-java:4.5.3'
implementation 'org.sejda.imageio:webp-imageio:0.1.6'
jmh 'org.openjdk.jmh:jmh-core:1.35'
@@ -167,23 +169,17 @@ configurations {
}
test {
maxParallelForks = 16
forkEvery = 1
maxParallelForks = Runtime.runtime.availableProcessors().intdiv(2) ?: 1
maxHeapSize = "8G"
useJUnitPlatform {
excludeTags "db"
}
useJUnitPlatform()
}
task dbTest(type: Test) {
maxParallelForks = 1
forkEvery = 1
task fastTests(type: Test) {
maxParallelForks = Runtime.runtime.availableProcessors().intdiv(2) ?: 1
maxHeapSize = "8G"
useJUnitPlatform {
includeTags "db"
excludeTags "slow"
}
}
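With the tag-based split above, slow tests opt out of the fastTests task by tagging themselves rather than by living in a separate task. A minimal sketch of the opt-out, assuming JUnit 5 and the "slow" tag from the config above (the test class itself is hypothetical):

import org.junit.jupiter.api.Tag;
import org.junit.jupiter.api.Test;

class CrawlerIntegrationTest {   // hypothetical example class
    @Tag("slow")                 // skipped by `gradlew fastTests`, still run by `gradlew test`
    @Test
    void fullCrawlRoundTrip() {
        // long-running assertions go here
    }
}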
@@ -243,9 +239,9 @@ task IP2LocationFile(type: Copy) {
into outputDir
}
task downloadTermFreqData(type: Copy) {
// TODO: Need hosting for this file
from '/var/lib/wmsa/model/tfreq-new-algo3.bin'
into 'data/models/'
task downloadTermFreqData(type: Download) {
src 'https://downloads.marginalia.nu/model/tfreq-new-algo3.bin'
dest file('data/models/tfreq-new-algo3.bin')
overwrite false
}

View File

@@ -70,4 +70,4 @@ dating dating
EOF
echo "*** Starting $1"
WMSA_HOME=${HOME} java -server -Xmx2G -Dsmall-ram=TRUE -Dservice-host=0.0.0.0 -jar /WMSA.jar start $1
WMSA_HOME=${HOME} java -server -ea -Xmx2G -Dsmall-ram=TRUE -Dservice-host=0.0.0.0 -jar /WMSA.jar start $1

View File

@@ -27,6 +27,7 @@ public class AndCardIntSet {
public static AndCardIntSet of(RoaringBitmap bmap) {
TIntArrayList lst = new TIntArrayList(bmap.getCardinality());
lst.addAll(bmap.toArray());
return new AndCardIntSet(lst);
@@ -37,7 +38,7 @@ public class AndCardIntSet {
backingList = list;
hash = 0;
if (list.size() < 128) {
if (list.size() < 32) {
for (int v : list.toArray()) {
int bit = hasher.hashInt(v).asInt() % 64;
hash |= (1L << bit);
@@ -56,7 +57,7 @@ public class AndCardIntSet {
return false;
}
if (backingList.size() < 128) {
if (backingList.size() < 32) {
int bit = hasher.hashInt(val).asInt() % 64;
hash |= (1L << bit);
}
@@ -81,10 +82,10 @@ public class AndCardIntSet {
if (!testHash(a,b)) {
return 0;
}
if (a.getCardinality() + b.getCardinality() < 10) {
return andLinearSmall(a, b);
}
//
// if (a.getCardinality() + b.getCardinality() < 10) {
// return andLinearSmall(a, b);
// }
return andLinear(a,b);
}
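Lowering the threshold from 128 to 32 narrows which sets carry the 64-bit hash fingerprint built in the constructor above. The fingerprint acts as a tiny Bloom filter for the intersection test; a hedged sketch of what testHash presumably checks (its body is not part of this diff):

// Each small set ORs one hash bit per element into a 64-bit fingerprint.
// If two fingerprints share no set bits, the sets cannot intersect, so
// the cardinality of the AND is zero without scanning either list.
private static boolean testHash(AndCardIntSet a, AndCardIntSet b) {
    return (a.hash & b.hash) != 0;
}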

View File

@@ -1,80 +0,0 @@
package nu.marginalia.util;
public class ByteFolder {
public byte[] foldBytes(int p, int q) {
int pw = bitWidth(p);
int qw = bitWidth(q);
int qpw = qw + pw;
long qp = Integer.toUnsignedLong(q) << pw | Integer.toUnsignedLong(p);
int qpwBytes = ((qpw - 1) / Byte.SIZE) + 1;
byte[] bytes = new byte[qpwBytes + 1];
bytes[0] = (byte) pw;
for (int i = 1; i < bytes.length; i++) {
bytes[i] = (byte) (qp >>> (qpwBytes - i) * Byte.SIZE & 0xff);
}
return bytes;
}
// Function such that (decodeBytes o foldBytes) = identity
public static int[] decodeBytes(byte[] data) {
int[] dest = new int[2];
decodeBytes(data, data.length, dest);
return dest;
}
public static void decodeBytes(byte[] data, int length, int[] dest) {
long val = 0;
for (int i = 1; i < length; i++) {
val = (val << 8) | ((0xFF)&data[i]);
}
dest[1] = (int)(val >>> data[0]);
dest[0] = (int)(val & ~(dest[1]<<data[0]));
}
private static int bitWidth(int q) {
int v = Integer.numberOfLeadingZeros(q);
if (v == 32) return 1;
return 32-v;
}
public static String byteBits(byte[] b) {
return byteBits(b, b.length);
}
public static String byteBits(byte[] b, int n) {
StringBuilder s = new StringBuilder();
for (int j = 0; j < n;j++) {
if (!s.toString().isBlank()) {
s.append(":");
}
for (int i = 7; i >= 0; i--) {
s.append((b[j] & (1L << i)) > 0 ? 1 : 0);
}
}
return s.toString();
}
public static String intBits(int v) {
StringBuilder s = new StringBuilder();
for (int i = 32; i >=0; i--) {
s.append((v & (1L << i)) > 0 ? 1 : 0);
}
return s.toString();
}
public static String longBits(long v) {
StringBuilder s = new StringBuilder();
for (int i = 64; i >=0; i--) {
s.append((v & (1L << i)) > 0 ? 1 : 0);
}
return s.toString();
}
}
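ByteFolder is deleted outright in this release. For reference, foldBytes packs the pair (p, q) into a variable-length big-endian blob whose first byte records p's bit width, and decodeBytes inverts it. A worked round trip with p = 5, q = 1000:

ByteFolder folder = new ByteFolder();
byte[] packed = folder.foldBytes(5, 1000);
// bitWidth(5) = 3, so qp = 1000 << 3 | 5 = 8005 = 0x1F45,
// stored as [3, 0x1F, 0x45]
int[] pq = ByteFolder.decodeBytes(packed);
// pq[0] == 5, pq[1] == 1000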

View File

@@ -6,37 +6,32 @@ import org.slf4j.LoggerFactory;
import java.io.File;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.nio.ByteBuffer;
import java.nio.channels.ByteChannel;
import java.nio.channels.FileChannel;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
import java.util.ArrayList;
/** For managing random writes on SSDs
*
* See https://en.wikipedia.org/wiki/Write_amplification
/** For managing random writes on SSDs.
* Because SSDs do not deal well with random small writes,
* see https://en.wikipedia.org/wiki/Write_amplification,
* it is beneficial to pigeonhole the writes first
* within the same general region
* */
public class RandomWriteFunnel implements AutoCloseable {
private static final Logger logger = LoggerFactory.getLogger(RandomWriteFunnel.class);
private final DataBin[] bins;
private final ArrayList<DataBin> bins;
private final Path tempDir;
private final int binSize;
public RandomWriteFunnel(Path tempDir, long size, int binSize) throws IOException {
public RandomWriteFunnel(Path tempDir, int binSize) throws IOException {
this.binSize = binSize;
this.tempDir = tempDir;
if (size > 0) {
int binCount = (int) (size / binSize + ((size % binSize) != 0L ? 1 : 0));
bins = new DataBin[binCount];
for (int i = 0; i < binCount; i++) {
bins[i] = new DataBin(tempDir, (int)
Math.min((size - (long)binSize * i), binSize));
}
}
else {
bins = new DataBin[0];
}
bins = new ArrayList<>();
}
@SneakyThrows
@@ -44,10 +39,21 @@ public class RandomWriteFunnel implements AutoCloseable {
int bin = (int)(address / binSize);
int offset = (int)(address%binSize);
if (bin >= bins.size()) {
grow(bin);
}
bins[bin].put(offset, data);
bins.get(bin).put(offset, data);
}
@SneakyThrows
private void grow(int bin) {
while (bins.size() <= bin) {
bins.add(new DataBin(tempDir, binSize));
}
}
public void write(FileChannel o) throws IOException {
public void write(ByteChannel o) throws IOException {
ByteBuffer buffer = ByteBuffer.allocateDirect(binSize*8);
for (var bin : bins) {
@@ -67,7 +73,7 @@
}
}
static class DataBin implements AutoCloseable {
static class DataBin {
private final ByteBuffer buffer;
private final int size;
private final FileChannel channel;
@@ -77,7 +83,7 @@
buffer = ByteBuffer.allocateDirect(360_000);
this.size = size;
file = Files.createTempFile(tempDir, "scatter-writer", ".dat").toFile();
channel = new RandomAccessFile(file, "rw").getChannel();
channel = (FileChannel) Files.newByteChannel(file.toPath(), StandardOpenOption.WRITE, StandardOpenOption.READ);
}
void put(int address, long data) throws IOException {
@@ -133,7 +139,6 @@
}
}
@Override
public void close() throws IOException {
channel.close();
file.delete();
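A hedged usage sketch of the reworked funnel (the put signature is inferred from the bin arithmetic above): writes arrive in arbitrary address order, get pigeonholed into per-bin temp files, and are drained to the output in a single sequential pass.

try (var funnel = new RandomWriteFunnel(tmpDir, /* binSize */ 10_000_000);
     var out = Files.newByteChannel(outFile,
             StandardOpenOption.CREATE, StandardOpenOption.WRITE)) {
    funnel.put(address, value);  // arbitrary address order, buffered per bin
    funnel.write(out);           // drained bin by bin, sequentially
}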

View File

@@ -0,0 +1,28 @@
package nu.marginalia.util;
import java.util.HashMap;
public class StringPool {
private final HashMap<String, String> words;
public StringPool() {
this.words = new HashMap<>(1000);
}
public StringPool(int capacity) {
words = new HashMap<>(capacity);
}
public String internalize(String str) {
final String ret = words.putIfAbsent(str, str);
if (null == ret)
return str;
return ret;
}
public void flush() {
words.clear();
}
}
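A minimal usage sketch: internalize collapses equal strings to a single instance while parsing, and flush drops the pool between batches so nothing is retained forever (unlike String.intern).

StringPool pool = new StringPool(10_000);
String a = pool.internalize(new String("keyword"));
String b = pool.internalize(new String("keyword"));
assert a == b;  // same instance; repeated tokens stop costing memory
pool.flush();   // release everything between documents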

View File

@@ -0,0 +1,111 @@
package nu.marginalia.util;
import java.util.List;
import java.util.function.BiConsumer;
import java.util.function.Consumer;
import java.util.function.Predicate;
public class TransformList<T> {
private final List<T> backingList;
public TransformList(List<T> backingList) {
this.backingList = backingList;
}
public void transformEach(Consumer<Entity> consumer) {
for (var iter = backingList.listIterator(); iter.hasNext(); ) {
var entity = new Entity(iter.next());
consumer.accept(entity);
if (entity.action == Action.REPLACE) {
iter.set(entity.value);
}
else if (entity.action == Action.REMOVE) {
iter.remove();
}
}
}
public void transformEachPair(BiConsumer<Entity, Entity> consumer) {
for (var iter = backingList.listIterator(); iter.hasNext(); ) {
var firstEntity = new Entity(iter.next());
if (!iter.hasNext()) break;
var secondEntry = new Entity(backingList.get(iter.nextIndex()));
consumer.accept(firstEntity, secondEntry);
if (firstEntity.action == Action.REPLACE) {
iter.set(firstEntity.value);
if (secondEntry.action == Action.REPLACE) {
backingList.set(iter.nextIndex(), secondEntry.value);
}
else if (secondEntry.action == Action.REMOVE) {
iter.next();
iter.remove();
}
}
else if (firstEntity.action == Action.REMOVE) {
if (secondEntry.action == Action.REPLACE) {
backingList.set(iter.nextIndex(), secondEntry.value);
}
iter.remove();
if (secondEntry.action == Action.REMOVE) {
iter.next();
iter.remove();
}
}
}
}
public void scan(Predicate<T> start, Predicate<T> end, Consumer<TransformList<T>> inbetween) {
for (int i = 0; i < backingList.size(); i++) {
if (start.test(backingList.get(i))) {
for (int j = i + 1; j < backingList.size(); j++) {
if (end.test(backingList.get(j))) {
inbetween.accept(new TransformList<>(backingList.subList(i, j+1)));
break;
}
}
}
}
}
public void scanAndTransform(Predicate<T> start, Predicate<T> end, Consumer<Entity> inbetweenConsumer) {
scan(start, end, range -> range.transformEach(inbetweenConsumer));
}
public int size() {
return backingList.size();
}
public List<T> getBackingList() {
return backingList;
}
public class Entity {
public T value;
private Action action;
Entity(T value) {
this.value = value;
}
public void replace(T newValue) {
action = Action.REPLACE;
value = newValue;
}
public void remove() {
action = Action.REMOVE;
}
}
enum Action {
NO_OP,
REPLACE,
REMOVE
}
}
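A hedged usage sketch of scanAndTransform; the tag strings are illustrative, not from this commit. Everything from a start marker through the matching end marker, inclusive, is removed in place:

var tokens = new ArrayList<>(List.of("<p>", "text", "<script>", "junk", "</script>", "</p>"));
var list = new TransformList<>(tokens);
list.scanAndTransform(
        "<script>"::equals,    // start predicate
        "</script>"::equals,   // end predicate
        entity -> entity.remove());
// tokens is now ["<p>", "text", "</p>"]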

View File

@@ -0,0 +1,64 @@
package nu.marginalia.util.array;
import com.upserve.uppend.blobs.NativeIO;
import nu.marginalia.util.array.algo.IntArrayBase;
import nu.marginalia.util.array.algo.IntArraySearch;
import nu.marginalia.util.array.algo.IntArraySort;
import nu.marginalia.util.array.algo.IntArrayTransformations;
import nu.marginalia.util.array.delegate.ShiftedIntArray;
import nu.marginalia.util.array.page.IntArrayPage;
import nu.marginalia.util.array.page.PagingIntArray;
import nu.marginalia.util.array.scheme.ArrayPartitioningScheme;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
public interface IntArray extends IntArrayBase, IntArrayTransformations, IntArraySearch, IntArraySort {
int WORD_SIZE = 4;
ArrayPartitioningScheme DEFAULT_PARTITIONING_SCHEME
= ArrayPartitioningScheme.forPartitionSize(Integer.getInteger("wmsa.page-size",1<<30) / WORD_SIZE);
int MAX_CONTINUOUS_SIZE = Integer.MAX_VALUE/WORD_SIZE - 16;
static IntArray allocate(long size) {
if (size < MAX_CONTINUOUS_SIZE) {
return IntArrayPage.onHeap((int) size);
}
return PagingIntArray.newOnHeap(DEFAULT_PARTITIONING_SCHEME, size);
}
static IntArray mmapRead(Path path) throws IOException {
long sizeBytes = Files.size(path);
if (sizeBytes < MAX_CONTINUOUS_SIZE) {
return IntArrayPage.fromMmapReadOnly(path, 0, (int) sizeBytes / 4);
}
return PagingIntArray.mapFileReadOnly(DEFAULT_PARTITIONING_SCHEME, path);
}
static IntArray mmapForWriting(Path path) throws IOException {
return PagingIntArray.mapFileReadWrite(DEFAULT_PARTITIONING_SCHEME, path);
}
static IntArray mmapForWriting(Path path, long size) throws IOException {
return PagingIntArray.mapFileReadWrite(DEFAULT_PARTITIONING_SCHEME, path, size);
}
default ShiftedIntArray shifted(long offset) {
return new ShiftedIntArray(offset, this);
}
default ShiftedIntArray range(long start, long end) {
return new ShiftedIntArray(start, end, this);
}
void force();
void advice(NativeIO.Advice advice) throws IOException;
void advice(NativeIO.Advice advice, long start, long end) throws IOException;
}
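A usage sketch of the facade (LongArray below mirrors it with 8-byte words): allocate transparently switches from a single on-heap page to the paged implementation once the array would exceed the single-mapping size limit.

IntArray counts = IntArray.allocate(3_000_000_000L);  // paged automatically
counts.increment(2_999_999_999L);                     // long-indexed access

IntArray onDisk = IntArray.mmapForWriting(Path.of("counts.dat"), 1 << 20);
onDisk.fill(0, onDisk.size(), 0);
onDisk.force();                                       // flush the mapping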

View File

@@ -0,0 +1,63 @@
package nu.marginalia.util.array;
import com.upserve.uppend.blobs.NativeIO;
import nu.marginalia.util.array.algo.LongArrayBase;
import nu.marginalia.util.array.algo.LongArraySearch;
import nu.marginalia.util.array.algo.LongArraySort;
import nu.marginalia.util.array.algo.LongArrayTransformations;
import nu.marginalia.util.array.delegate.ShiftedLongArray;
import nu.marginalia.util.array.page.LongArrayPage;
import nu.marginalia.util.array.page.PagingLongArray;
import nu.marginalia.util.array.scheme.ArrayPartitioningScheme;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
public interface LongArray extends LongArrayBase, LongArrayTransformations, LongArraySearch, LongArraySort {
int WORD_SIZE = 8;
ArrayPartitioningScheme DEFAULT_PARTITIONING_SCHEME
= ArrayPartitioningScheme.forPartitionSize(Integer.getInteger("wmsa.page-size",1<<30) / WORD_SIZE);
int MAX_CONTINUOUS_SIZE = Integer.MAX_VALUE/WORD_SIZE - 8;
static LongArray allocate(long size) {
if (size < MAX_CONTINUOUS_SIZE) {
return LongArrayPage.onHeap((int) size);
}
return PagingLongArray.newOnHeap(DEFAULT_PARTITIONING_SCHEME, size);
}
static LongArray mmapRead(Path path) throws IOException {
long sizeBytes = Files.size(path);
if (sizeBytes < MAX_CONTINUOUS_SIZE) {
return LongArrayPage.fromMmapReadOnly(path, 0, (int) sizeBytes / 8);
}
return PagingLongArray.mapFileReadOnly(DEFAULT_PARTITIONING_SCHEME, path);
}
static LongArray mmapForWriting(Path path) throws IOException {
return PagingLongArray.mapFileReadWrite(DEFAULT_PARTITIONING_SCHEME, path);
}
static LongArray mmapForWriting(Path path, long size) throws IOException {
return PagingLongArray.mapFileReadWrite(DEFAULT_PARTITIONING_SCHEME, path, size);
}
default ShiftedLongArray shifted(long offset) {
return new ShiftedLongArray(offset, this);
}
default ShiftedLongArray range(long start, long end) {
return new ShiftedLongArray(start, end, this);
}
void force();
void advice(NativeIO.Advice advice) throws IOException;
void advice(NativeIO.Advice advice, long start, long end) throws IOException;
}

View File

@@ -0,0 +1,6 @@
package nu.marginalia.util.array.algo;
public interface BulkTransferArray<BufferType> {
void set(long start, long end, BufferType buffer, int bufferStart);
}

View File

@@ -0,0 +1,69 @@
package nu.marginalia.util.array.algo;
import java.io.IOException;
import java.nio.IntBuffer;
import java.nio.channels.FileChannel;
import java.nio.file.Path;
public interface IntArrayBase extends BulkTransferArray<IntBuffer> {
int get(long pos);
void set(long pos, int value);
long size();
default void fill(long start, long end, int val) {
for (long v = start; v < end; v++) {
set(v, val);
}
}
default void increment(long pos) {
set(pos, get(pos) + 1);
}
default void swap(long pos1, long pos2) {
int tmp = get(pos1);
set(pos1, get(pos2));
set(pos2, tmp);
}
default void swapn(int n, long pos1, long pos2) {
for (int i = 0; i < n; i++) {
int tmp = get(pos1+i);
set(pos1+i, get(pos2+i));
set(pos2+i, tmp);
}
}
default int getAndIncrement(long pos) {
int val = get(pos);
set(pos, val + 1);
return val;
}
default void set(long start, long end, IntBuffer buffer, int bufferStart) {
for (int i = 0; i < (end-start); i++) {
set(start+i, buffer.get(i + bufferStart));
}
}
default void get(long start, long end, IntBuffer buffer, int bufferStart) {
for (int i = 0; i < (end-start); i++) {
buffer.put(i + bufferStart, get(start + i));
}
}
default void get(long start, IntBuffer buffer) {
get(start, start + buffer.remaining(), buffer, buffer.position());
}
default void get(long start, long end, int[] buffer) {
for (int i = 0; i < (end-start); i++) {
buffer[i] = get(start + i);
}
}
void write(Path file) throws IOException;
void transferFrom(FileChannel source, long sourceStart, long arrayStart, long arrayEnd) throws IOException;
}

View File

@@ -0,0 +1,126 @@
package nu.marginalia.util.array.algo;
import nu.marginalia.util.array.buffer.IntQueryBuffer;
import static nu.marginalia.util.array.algo.LongArraySearch.encodeSearchMiss;
public interface IntArraySearch extends IntArrayBase {
int LINEAR_SEARCH_CUTOFF = 64;
default long linearSearch(int key, long fromIndex, long toIndex) {
long pos;
for (pos = fromIndex; pos < toIndex; pos++) {
int val = get(pos);
if (val == key) return pos;
if (val > key) break;
}
return encodeSearchMiss(pos - 1);
}
default long binarySearch(int key, long fromIndex, long toIndex) {
long low = 0;
long high = (toIndex - fromIndex) - 1;
while (high - low >= LINEAR_SEARCH_CUTOFF) {
long mid = (low + high) >>> 1;
long midVal = get(fromIndex + mid);
if (midVal < key)
low = mid + 1;
else if (midVal > key)
high = mid - 1;
else
return fromIndex + mid;
}
return linearSearch(key, fromIndex + low, fromIndex + high + 1);
}
default long binarySearchUpperBound(int key, long fromIndex, long toIndex) {
long low = 0;
long high = (toIndex - fromIndex) - 1;
while (high - low >= LINEAR_SEARCH_CUTOFF) {
long mid = (low + high) >>> 1;
long midVal = get(fromIndex + mid);
if (midVal < key)
low = mid + 1;
else if (midVal > key)
high = mid - 1;
else
return fromIndex + mid;
}
for (fromIndex += low; fromIndex < toIndex; fromIndex++) {
if (get(fromIndex) >= key) return fromIndex;
}
return toIndex;
}
default void retain(IntQueryBuffer buffer, long boundary, long searchStart, long searchEnd) {
if (searchStart >= searchEnd) return;
int bv = buffer.currentValue();
int av = get(searchStart);
long pos = searchStart;
while (bv <= boundary && buffer.hasMore()) {
if (bv < av) {
if (!buffer.rejectAndAdvance()) break;
bv = buffer.currentValue();
continue;
}
else if (bv == av) {
if (!buffer.retainAndAdvance()) break;
bv = buffer.currentValue();
continue;
}
if (++pos < searchEnd) {
av = get(pos);
}
else {
break;
}
}
}
default void reject(IntQueryBuffer buffer, long boundary, long searchStart, long searchEnd) {
if (searchStart >= searchEnd) return;
int bv = buffer.currentValue();
int av = get(searchStart);
long pos = searchStart;
while (bv <= boundary && buffer.hasMore()) {
if (bv < av) {
if (!buffer.retainAndAdvance()) break;
bv = buffer.currentValue();
continue;
}
else if (bv == av) {
if (!buffer.rejectAndAdvance()) break;
bv = buffer.currentValue();
continue;
}
if (++pos < searchEnd) {
av = get(pos);
}
else {
break;
}
}
}
}
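The searches above follow the Arrays.binarySearch convention via encodeSearchMiss: a miss encodes the position of the last element smaller than the key as -1 - pos. A worked example of decoding it (decodeSearchMiss is defined in LongArraySearch, further down):

long rv = array.binarySearch(42, 0, array.size());
if (rv < 0) {
    long predecessor = LongArraySearch.decodeSearchMiss(rv);
    long insertionPoint = predecessor + 1;  // where 42 would sort in
}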

View File

@@ -0,0 +1,174 @@
package nu.marginalia.util.array.algo;
import java.io.IOException;
import java.nio.IntBuffer;
import java.nio.channels.FileChannel;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
public interface IntArraySort extends IntArrayBase {
default boolean isSorted(long start, long end) {
if (start == end) return true;
int val = get(start);
for (long i = start + 1; i < end; i++) {
int next = get(i);
if (next < val)
return false;
val = next;
}
return true;
}
default void sortLargeSpan(SortingContext ctx, long start, long end) throws IOException {
long size = end - start;
if (size < ctx.memorySortLimit()) {
quickSort(start, end);
}
else {
mergeSort(start, end, ctx.tempDir());
}
}
default boolean isSortedN(int wordSize, long start, long end) {
if (start == end) return true;
int val = get(start);
for (long i = start + wordSize; i < end; i+=wordSize) {
int next = get(i);
if (next < val)
return false;
val = next;
}
return true;
}
default void insertionSort(long start, long end) {
assert end - start < Integer.MAX_VALUE;
int n = (int) (end - start);
if (n <= 1) {
return;
}
for (int i = 1; i < n; i++) {
int key = get(start + i);
int j = i - 1;
while (j >= 0 && get(start + j) > key) {
swap( start + j, start + (long)(j+1));
j--;
}
set(start + j+1, key);
}
}
default void quickSort(long start, long end) {
if (end - start < 64) {
insertionSort(start, end);
}
else {
_quickSortLH(start, end - 1);
}
}
default void _quickSortLH(long low, long highInclusive) {
if (low < 0 || highInclusive < 0 || low >= highInclusive)
return;
if (highInclusive - low < 32) {
insertionSort(low, highInclusive + 1);
return;
}
long p = _quickSortPartition(low, highInclusive);
_quickSortLH(low, p);
_quickSortLH(p + 1, highInclusive);
}
default long _quickSortPartition(long low, long high) {
long pivotPoint = ((low + high) / (2L));
int pivot = get(pivotPoint);
long i = low - 1;
long j = high + 1;
for (;;) {
do {
i+=1;
} while (get(i) < pivot);
do {
j-=1;
}
while (get(j) > pivot);
if (i >= j) return j;
else swap(i, j);
}
}
default void mergeSort(long start, long end, Path tmpDir) throws IOException {
int length = (int) (end - start);
Path tmpFile = Files.createTempFile(tmpDir,"sort-"+start+"-"+(start+length), ".dat");
try (var channel = (FileChannel) Files.newByteChannel(tmpFile, StandardOpenOption.WRITE, StandardOpenOption.READ)) {
var workBuffer = channel.map(FileChannel.MapMode.READ_WRITE, 0, 4L * length).asIntBuffer();
_mergeSort(start, length, workBuffer);
}
finally {
Files.delete(tmpFile);
}
}
default void _mergeSort(long start, int length, IntBuffer workBuffer) {
int width = Math.min(Integer.highestOneBit(length), 1 << 16);
// Do in-memory sorting up until internalSortLimit first
for (int i = 0; i < length; i += width) {
quickSort(start + i, start + i + Math.min(width, length-i));
}
// Then finish with merge sort
for (width = 1; width < length; width*=2) {
for (int i = 0; i < length; i += 2*width) {
_merge(start, i, Math.min(i+width, length), Math.min(i+2*width, length), workBuffer);
}
workBuffer.clear();
set(start, start + length, workBuffer, 0);
}
}
default void _merge(long offset, int left, int right, int end, IntBuffer workBuffer) {
long idxL = left;
long idxR = right;
for (int putPos = left; putPos < end; putPos++) {
if (idxL < right && (idxR >= end || get(offset+idxL) < get(offset+idxR))) {
workBuffer.put(putPos, get(offset+idxL));
idxL++;
}
else {
workBuffer.put(putPos, get(offset+idxR));
idxR++;
}
}
}
}
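A hedged sketch of how the two strategies combine through sortLargeSpan (SortingContext is the small record added later in this commit): spans under the memory limit are quicksorted in place, larger spans fall back to the file-backed merge sort above.

var ctx = new SortingContext(Path.of("/tmp"), 1 << 26);
array.sortLargeSpan(ctx, 0, array.size());  // quickSort or mergeSort by size
assert array.isSorted(0, array.size());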

View File

@@ -0,0 +1,40 @@
package nu.marginalia.util.array.algo;
import nu.marginalia.util.array.functional.IntBinaryIOOperation;
import nu.marginalia.util.array.functional.IntIOTransformer;
import nu.marginalia.util.array.functional.IntTransformer;
import nu.marginalia.util.array.functional.LongIntConsumer;
import java.io.IOException;
public interface IntArrayTransformations extends IntArrayBase {
default void forEach(long start, long end, LongIntConsumer consumer) {
for (long i = start; i < end; i++) {
consumer.accept(i, get(i));
}
}
default void transformEach(long start, long end, IntTransformer transformer) {
for (long i = start; i < end; i++) {
set(i, transformer.transform(i, get(i)));
}
}
default void transformEachIO(long start, long end, IntIOTransformer transformer) throws IOException {
for (long i = start; i < end; i++) {
set(i, transformer.transform(i, get(i)));
}
}
default int foldIO(int zero, long start, long end, IntBinaryIOOperation operator) throws IOException {
int accumulator = zero;
for (long i = start; i < end; i++) {
accumulator = operator.apply(accumulator, get(i));
}
return accumulator;
}
}
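A brief sketch of the helpers above; each functional interface receives the position alongside the current value:

int total = counts.foldIO(0, 0, counts.size(), Integer::sum);   // reduce
counts.transformEach(0, counts.size(), (pos, old) -> old + 1);  // map in place
counts.forEach(0, counts.size(),
        (pos, val) -> System.out.println(pos + " = " + val));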

View File

@ -0,0 +1,69 @@
package nu.marginalia.util.array.algo;
import java.io.IOException;
import java.nio.LongBuffer;
import java.nio.channels.FileChannel;
import java.nio.file.Path;
public interface LongArrayBase extends BulkTransferArray<LongBuffer> {
long get(long pos);
void set(long pos, long value);
long size();
default void fill(long start, long end, long val) {
for (long v = start; v < end; v++) {
set(v, val);
}
}
default void increment(long pos) {
set(pos, get(pos) + 1);
}
default void swap(long pos1, long pos2) {
long tmp = get(pos1);
set(pos1, get(pos2));
set(pos2, tmp);
}
default void swapn(int n, long pos1, long pos2) {
for (int i = 0; i < n; i++) {
long tmp = get(pos1+i);
set(pos1+i, get(pos2+i));
set(pos2+i, tmp);
}
}
default long getAndIncrement(long pos) {
long val = get(pos);
set(pos, val + 1);
return val;
}
default void set(long start, long end, LongBuffer buffer, int bufferStart) {
for (int i = 0; i < (end-start); i++) {
set(start+i, buffer.get(i + bufferStart));
}
}
default void get(long start, long end, LongBuffer buffer, int bufferStart) {
for (int i = 0; i < (end-start); i++) {
buffer.put(i + bufferStart, get(start + i));
}
}
default void get(long start, LongBuffer buffer) {
get(start, start + buffer.remaining(), buffer, buffer.position());
}
default void get(long start, long end, long[] buffer) {
for (long i = 0; i < (end-start); i++) {
buffer[(int) i] = get(start + i);
}
}
void write(Path file) throws IOException;
void transferFrom(FileChannel source, long sourceStart, long arrayStart, long arrayEnd) throws IOException;
}

View File

@@ -0,0 +1,263 @@
package nu.marginalia.util.array.algo;
import nu.marginalia.util.array.buffer.LongQueryBuffer;
public interface LongArraySearch extends LongArrayBase {
int LINEAR_SEARCH_CUTOFF = 32;
default long linearSearch(long key, long fromIndex, long toIndex) {
long pos;
for (pos = fromIndex; pos < toIndex; pos++) {
long val = get(pos);
if (val == key) return pos;
if (val > key) break;
}
return encodeSearchMiss(pos - 1);
}
default long linearSearchUpperBound(long key, long fromIndex, long toIndex) {
for (long pos = fromIndex; pos < toIndex; pos++) {
if (get(pos) >= key) return pos;
}
return toIndex;
}
default long linearSearchN(int sz, long key, long fromIndex, long toIndex) {
long pos;
for (pos = fromIndex; pos < toIndex; pos+=sz) {
long val = get(pos);
if (val == key) return pos;
if (val > key) return encodeSearchMiss(pos);
}
return encodeSearchMiss(toIndex - sz);
}
default long binarySearch(long key, long fromIndex, long toIndex) {
long low = 0;
long high = (toIndex - fromIndex) - 1;
while (high - low >= LINEAR_SEARCH_CUTOFF) {
long mid = (low + high) >>> 1;
long midVal = get(fromIndex + mid);
if (midVal < key)
low = mid + 1;
else if (midVal > key)
high = mid - 1;
else
return fromIndex + mid;
}
return linearSearch(key, fromIndex + low, fromIndex + high + 1);
}
default long binarySearchN(int sz, long key, long fromIndex, long toIndex) {
long low = 0;
long high = (toIndex - fromIndex)/sz - 1;
while (high - low >= LINEAR_SEARCH_CUTOFF) {
long mid = (low + high) >>> 1;
long midVal = get(fromIndex + sz*mid);
if (midVal < key)
low = mid + 1;
else if (midVal > key)
high = mid - 1;
else
return fromIndex + sz*mid;
}
for (fromIndex += low*sz; fromIndex < toIndex; fromIndex+=sz) {
long val = get(fromIndex);
if (val == key) return fromIndex;
if (val > key) return encodeSearchMiss(fromIndex);
}
return encodeSearchMiss(toIndex - sz);
}
default long binarySearchUpperBound(long key, long fromIndex, long toIndex) {
long low = 0;
long high = (toIndex - fromIndex) - 1;
while (high - low >= LINEAR_SEARCH_CUTOFF) {
long mid = (low + high) >>> 1;
long midVal = get(fromIndex + mid);
if (midVal < key)
low = mid + 1;
else if (midVal > key)
high = mid - 1;
else
return fromIndex + mid;
}
for (fromIndex += low; fromIndex < toIndex; fromIndex++) {
if (get(fromIndex) >= key) return fromIndex;
}
return toIndex;
}
default long binarySearchUpperBoundN(int sz, long key, long fromIndex, long toIndex) {
long low = 0;
long high = (toIndex - fromIndex)/sz - 1;
while (high - low >= LINEAR_SEARCH_CUTOFF) {
long mid = (low + high) >>> 1;
long midVal = get(fromIndex + sz*mid);
if (midVal < key)
low = mid + 1;
else if (midVal > key)
high = mid - 1;
else
return fromIndex + sz*mid;
}
for (fromIndex += low; fromIndex < toIndex; fromIndex+=sz) {
if (get(fromIndex) >= key) return fromIndex;
}
return toIndex;
}
default void retain(LongQueryBuffer buffer, long boundary, long searchStart, long searchEnd) {
if (searchStart >= searchEnd) return;
long bv = buffer.currentValue();
long av = get(searchStart);
long pos = searchStart;
while (bv <= boundary && buffer.hasMore()) {
if (bv < av) {
if (!buffer.rejectAndAdvance()) break;
bv = buffer.currentValue();
continue;
}
else if (bv == av) {
if (!buffer.retainAndAdvance()) break;
bv = buffer.currentValue();
continue;
}
if (++pos < searchEnd) {
av = get(pos);
}
else {
break;
}
}
}
default void retainN(LongQueryBuffer buffer, int sz, long boundary, long searchStart, long searchEnd) {
if (searchStart >= searchEnd) return;
long bv = buffer.currentValue();
long av = get(searchStart);
long pos = searchStart;
while (bv <= boundary && buffer.hasMore()) {
if (bv < av) {
if (!buffer.rejectAndAdvance()) break;
bv = buffer.currentValue();
continue;
}
else if (bv == av) {
if (!buffer.retainAndAdvance()) break;
bv = buffer.currentValue();
continue;
}
pos += sz;
if (pos < searchEnd) {
av = get(pos);
}
else {
break;
}
}
}
default void reject(LongQueryBuffer buffer, long boundary, long searchStart, long searchEnd) {
if (searchStart >= searchEnd) return;
long bv = buffer.currentValue();
long av = get(searchStart);
long pos = searchStart;
while (bv <= boundary && buffer.hasMore()) {
if (bv < av) {
if (!buffer.retainAndAdvance()) break;
bv = buffer.currentValue();
continue;
}
else if (bv == av) {
if (!buffer.rejectAndAdvance()) break;
bv = buffer.currentValue();
continue;
}
if (++pos < searchEnd) {
av = get(pos);
}
else {
break;
}
}
}
default void rejectN(LongQueryBuffer buffer, int sz, long boundary, long searchStart, long searchEnd) {
if (searchStart >= searchEnd) return;
long bv = buffer.currentValue();
long av = get(searchStart);
long pos = searchStart;
while (bv <= boundary && buffer.hasMore()) {
if (bv < av) {
if (!buffer.retainAndAdvance()) break;
bv = buffer.currentValue();
continue;
}
else if (bv == av) {
if (!buffer.rejectAndAdvance()) break;
bv = buffer.currentValue();
continue;
}
pos += sz;
if (pos < searchEnd) {
av = get(pos);
}
else {
break;
}
}
}
static long encodeSearchMiss(long value) {
return -1 - value;
}
static long decodeSearchMiss(long value) {
return -value - 1;
}
}
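retain and reject implement a merge-style set operation between a sorted query buffer and a sorted array range: retain keeps the buffer values found in the array, reject keeps those absent. A hedged usage sketch (LongQueryBuffer's filtering protocol mirrors IntQueryBuffer, shown later):

var buffer = new LongQueryBuffer(new long[] {2, 3, 5, 8}, 4);
sorted.retain(buffer, sorted.get(sorted.size() - 1), 0, sorted.size());
buffer.finalizeFiltering();  // buffer now holds only values present in sorted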

View File

@@ -0,0 +1,325 @@
package nu.marginalia.util.array.algo;
import java.io.IOException;
import java.nio.LongBuffer;
import java.nio.channels.FileChannel;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
public interface LongArraySort extends LongArrayBase {
default boolean isSorted(long start, long end) {
if (start == end) return true;
long val = get(start);
for (long i = start + 1; i < end; i++) {
long next = get(i);
if (next < val)
return false;
val = next;
}
return true;
}
default void sortLargeSpan(SortingContext ctx, long start, long end) throws IOException {
long size = end - start;
if (size < ctx.memorySortLimit()) {
quickSort(start, end);
}
else {
mergeSort(start, end, ctx.tempDir());
}
}
default void sortLargeSpanN(SortingContext ctx, int sz, long start, long end) throws IOException {
if (sz == 1) {
sortLargeSpan(ctx, start, end);
return;
}
long size = end - start;
if (size < ctx.memorySortLimit()) {
quickSortN(sz, start, end);
}
else {
mergeSortN(sz, start, end, ctx.tempDir());
}
}
default boolean isSortedN(int wordSize, long start, long end) {
if (start == end) return true;
long val = get(start);
for (long i = start + wordSize; i < end; i+=wordSize) {
long next = get(i);
if (next < val)
return false;
val = next;
}
return true;
}
default void insertionSort(long start, long end) {
assert end - start < Integer.MAX_VALUE;
int n = (int) (end - start);
if (n <= 1) {
return;
}
for (int i = 1; i < n; i++) {
long key = get(start + i);
int j = i - 1;
while (j >= 0 && get(start + j) > key) {
swap( start + j, start + (long)(j+1));
j--;
}
set(start + j+1, key);
}
}
default void insertionSortN(int sz, long start, long end) {
assert end - start < Integer.MAX_VALUE;
int span = (int) (end - start);
assert (span % sz) == 0;
if (span <= sz) {
return;
}
for (int i = 1; i < span / sz; i++) {
long key = get(start + (long) i * sz);
int j = i - 1;
while (j >= 0 && get(start + (long)sz*j) > key) {
swapn(sz, start + (long)sz*j, start + (long)sz*(j+1));
j--;
}
set(start + (long) (j+1) * sz, key);
}
}
default void quickSort(long start, long end) {
if (end - start < 64) {
insertionSort(start, end);
}
else {
_quickSortLH(start, end - 1);
}
}
default void quickSortN(int wordSize, long start, long end) {
assert ((end - start) % wordSize) == 0;
if (end == start)
return;
_quickSortLHN(wordSize, start, end - wordSize);
}
default void _quickSortLHN(int wordSize, long low, long highInclusive) {
if (low < 0 || highInclusive < 0 || low >= highInclusive)
return;
if (highInclusive - low < 32L*wordSize) {
insertionSortN(wordSize, low, highInclusive + wordSize);
return;
}
long p = _quickSortPartitionN(wordSize, low, highInclusive);
_quickSortLHN(wordSize, low, p);
_quickSortLHN(wordSize, p + wordSize, highInclusive);
}
default void _quickSortLH(long low, long highInclusive) {
if (low < 0 || highInclusive < 0 || low >= highInclusive)
return;
if (highInclusive - low < 32) {
insertionSort(low, highInclusive + 1);
return;
}
long p = _quickSortPartition(low, highInclusive);
_quickSortLH(low, p);
_quickSortLH(p + 1, highInclusive);
}
default long _quickSortPartition(long low, long high) {
long pivotPoint = ((low + high) / (2L));
long pivot = get(pivotPoint);
long i = low - 1;
long j = high + 1;
for (;;) {
do {
i+=1;
} while (get(i) < pivot);
do {
j-=1;
}
while (get(j) > pivot);
if (i >= j) return j;
else swap(i, j);
}
}
default long _quickSortPartitionN(int wordSize, long low, long high) {
long pivotPoint = ((low + high) / (2L*wordSize)) * wordSize;
long pivot = get(pivotPoint);
long i = low - wordSize;
long j = high + wordSize;
for (;;) {
do {
i+=wordSize;
}
while (get(i) < pivot);
do {
j-=wordSize;
}
while (get(j) > pivot);
if (i >= j) return j;
else swapn(wordSize, i, j);
}
}
default void _mergeSortN(int wordSize, long start, int length, LongBuffer workBuffer) throws IOException {
int width = Math.min(Integer.highestOneBit(length), Integer.highestOneBit(workBuffer.capacity()));
// Do in-memory sorting up until internalSortLimit first
for (int i = 0; i < length; i += width) {
quickSortN(wordSize, start + i, start + i + Math.min(width, length-i));
}
// Then finish with merge sort
for (; width < length; width*=2) {
for (int i = 0; i < length; i += 2*width) {
_mergeN(wordSize, start, i, Math.min(i+width, length), Math.min(i+2*width, length), workBuffer);
}
workBuffer.clear();
set(start, start + length, workBuffer, 0);
}
}
default void mergeSortN(int wordSize, long start, long end, Path tmpDir) throws IOException {
int length = (int) (end - start);
assert (length % wordSize) == 0;
Path tmpFile = Files.createTempFile(tmpDir,"sort-"+start+"-"+(start+length), ".dat");
try (var channel = (FileChannel) Files.newByteChannel(tmpFile, StandardOpenOption.WRITE, StandardOpenOption.READ)) {
var workBuffer = channel.map(FileChannel.MapMode.READ_WRITE, 0, 8L * length).asLongBuffer();
_mergeSortN(wordSize, start, length, workBuffer);
}
finally {
Files.delete(tmpFile);
}
}
default void mergeSort(long start, long end, Path tmpDir) throws IOException {
int length = (int) (end - start);
Path tmpFile = Files.createTempFile(tmpDir,"sort-"+start+"-"+(start+length), ".dat");
try (var channel = (FileChannel) Files.newByteChannel(tmpFile, StandardOpenOption.WRITE, StandardOpenOption.READ)) {
var workBuffer = channel.map(FileChannel.MapMode.READ_WRITE, 0, 8L * length).asLongBuffer();
_mergeSort(start, length, workBuffer);
}
finally {
Files.delete(tmpFile);
}
}
default void _mergeSort(long start, int length, LongBuffer workBuffer) {
int width = Math.min(Integer.highestOneBit(length), 1 << 16);
// Do in-memory sorting up until internalSortLimit first
for (int i = 0; i < length; i += width) {
quickSort(start + i, start + i + Math.min(width, length-i));
}
// Then finish with merge sort
for (width = 1; width < length; width*=2) {
for (int i = 0; i < length; i += 2*width) {
_merge(start, i, Math.min(i+width, length), Math.min(i+2*width, length), workBuffer);
}
workBuffer.clear();
set(start, start + length, workBuffer, 0);
}
}
default void _mergeN(int wordSize, long offset, int left, int right, int end, LongBuffer workBuffer) {
long idxL = left;
long idxR = right;
for (int putPos = left; putPos < end; putPos+= wordSize) {
if (idxL < right && (idxR >= end || get(offset+idxL) < get(offset+idxR))) {
workBuffer.put(putPos, get(offset+idxL));
for (int s = 1; s < wordSize; s++) {
workBuffer.put(putPos + s, get(offset + idxL + s));
}
idxL+= wordSize;
}
else {
workBuffer.put(putPos, get(offset+idxR));
for (int s = 1; s < wordSize; s++) {
workBuffer.put(putPos + s, get(offset + idxR + s));
}
idxR+= wordSize;
}
}
}
default void _merge(long offset, int left, int right, int end, LongBuffer workBuffer) {
long idxL = left;
long idxR = right;
for (int putPos = left; putPos < end; putPos++) {
if (idxL < right && (idxR >= end || get(offset+idxL) < get(offset+idxR))) {
workBuffer.put(putPos, get(offset+idxL));
idxL++;
}
else {
workBuffer.put(putPos, get(offset+idxR));
idxR++;
}
}
}
}

View File

@@ -0,0 +1,40 @@
package nu.marginalia.util.array.algo;
import nu.marginalia.util.array.functional.LongBinaryIOOperation;
import nu.marginalia.util.array.functional.LongIOTransformer;
import nu.marginalia.util.array.functional.LongLongConsumer;
import nu.marginalia.util.array.functional.LongTransformer;
import java.io.IOException;
public interface LongArrayTransformations extends LongArrayBase {
default void forEach(long start, long end, LongLongConsumer consumer) {
for (long i = start; i < end; i++) {
consumer.accept(i, get(i));
}
}
default void transformEach(long start, long end, LongTransformer transformer) {
for (long i = start; i < end; i++) {
set(i, transformer.transform(i, get(i)));
}
}
default void transformEachIO(long start, long end, LongIOTransformer transformer) throws IOException {
for (long i = start; i < end; i++) {
set(i, transformer.transform(i, get(i)));
}
}
default long foldIO(long zero, long start, long end, LongBinaryIOOperation operator) throws IOException {
long accumulator = zero;
for (long i = start; i < end; i++) {
accumulator = operator.apply(accumulator, get(i));
}
return accumulator;
}
}

View File

@@ -0,0 +1,6 @@
package nu.marginalia.util.array.algo;
import java.nio.file.Path;
public record SortingContext(Path tempDir, int memorySortLimit) {
}

View File

@@ -0,0 +1,112 @@
package nu.marginalia.util.array.buffer;
import java.util.Arrays;
public class IntQueryBuffer {
public final int[] data;
public int end;
private int read = 0;
private int write = 0;
public IntQueryBuffer(int size) {
this.data = new int[size];
this.end = size;
}
public IntQueryBuffer(int [] data, int size) {
this.data = data;
this.end = size;
}
public int[] copyData() {
return Arrays.copyOf(data, end);
}
public boolean isEmpty() {
return end == 0;
}
public int size() {
return end;
}
public int currentValue() {
return data[read];
}
public boolean rejectAndAdvance() {
return ++read < end;
}
public boolean retainAndAdvance() {
if (read != write) {
int tmp = data[write];
data[write] = data[read];
data[read] = tmp;
}
write++;
return ++read < end;
}
public boolean hasMore() {
return read < end;
}
public void finalizeFiltering() {
end = write;
read = 0;
write = 0;
}
public void startFilterForRange(int pos, int end) {
read = write = pos;
this.end = end;
}
public void reset() {
end = data.length;
read = 0;
write = 0;
}
public void zero() {
end = 0;
read = 0;
write = 0;
Arrays.fill(data, 0);
}
public void uniq() {
if (end <= 1) return;
int prev = currentValue();
retainAndAdvance();
while (hasMore()) {
int val = currentValue();
if (prev == val) {
rejectAndAdvance();
} else {
retainAndAdvance();
prev = val;
}
}
finalizeFiltering();
}
public String toString() {
return getClass().getSimpleName() + "[" +
"read = " + read +
",write = " + write +
",end = " + end +
",data = [" + Arrays.toString(Arrays.copyOf(data, end)) + "]]";
}
}
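A minimal sketch of the two-pointer protocol: retainAndAdvance compacts kept values toward the front, rejectAndAdvance skips over the rest, and finalizeFiltering truncates to what was kept. uniq() above composes all three:

var buf = new IntQueryBuffer(new int[] {1, 1, 2, 3, 3}, 5);
buf.uniq();                     // dedup; assumes the buffer is sorted
int[] unique = buf.copyData();  // {1, 2, 3}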

View File

@@ -1,62 +1,32 @@
package nu.marginalia.util.btree;
package nu.marginalia.util.array.buffer;
import java.util.Arrays;
public class BTreeQueryBuffer {
public class LongQueryBuffer {
public final long[] data;
public int end;
private int read = 0;
private int write = 0;
public BTreeQueryBuffer(int size) {
public LongQueryBuffer(int size) {
this.data = new long[size];
this.end = size;
}
public BTreeQueryBuffer(long [] data, int size) {
public LongQueryBuffer(long [] data, int size) {
this.data = data;
this.end = size;
}
private BTreeQueryBuffer(long [] data) {
this.data = data;
this.end = data.length;
}
public BTreeQueryBuffer[] split(int... splitPoints) {
BTreeQueryBuffer[] ret = new BTreeQueryBuffer[splitPoints.length+1];
ret[0] = new BTreeQueryBuffer(Arrays.copyOfRange(data, 0, splitPoints[0]));
for (int i = 1; i < splitPoints.length; i++) {
ret[i] = new BTreeQueryBuffer(Arrays.copyOfRange(data, splitPoints[i-1], splitPoints[i]));
}
ret[ret.length-1] = new BTreeQueryBuffer(Arrays.copyOfRange(data, splitPoints[splitPoints.length-1], end));
return ret;
}
public void gather(BTreeQueryBuffer... buffers) {
int start = 0;
for (var buffer : buffers) {
System.arraycopy(buffer.data, 0, data, start, buffer.end);
start += buffer.end;
}
this.read = 0;
this.write = 0;
this.end = start;
}
public boolean hasRetainedData() {
return write > 0;
}
public long[] copyData() {
return Arrays.copyOf(data, end);
}
public void retainAll() {
read = write = end;
}
public boolean isEmpty() {
return end == 0;
}

View File

@@ -0,0 +1,58 @@
package nu.marginalia.util.array.delegate;
import com.upserve.uppend.blobs.NativeIO;
import nu.marginalia.util.array.IntArray;
import java.io.IOException;
import java.nio.channels.FileChannel;
import java.nio.file.Path;
public class ReferenceImplIntArrayDelegate implements IntArray {
private final IntArray delegate;
public ReferenceImplIntArrayDelegate(IntArray delegate) {
this.delegate = delegate;
}
@Override
public int get(long pos) {
return delegate.get(pos);
}
@Override
public void set(long pos, int value) {
delegate.set(pos, value);
}
@Override
public long size() {
return delegate.size();
}
@Override
public void write(Path file) throws IOException {
delegate.write(file);
}
@Override
public void transferFrom(FileChannel source, long sourceStart, long arrayStart, long arrayEnd) throws IOException {
delegate.transferFrom(source, sourceStart, arrayStart, arrayEnd);
}
@Override
public void force() {
delegate.force();
}
@Override
public void advice(NativeIO.Advice advice) throws IOException {
delegate.advice(advice);
}
@Override
public void advice(NativeIO.Advice advice, long start, long end) throws IOException {
delegate.advice(advice, start, end);
}
}

View File

@@ -0,0 +1,58 @@
package nu.marginalia.util.array.delegate;
import com.upserve.uppend.blobs.NativeIO;
import nu.marginalia.util.array.LongArray;
import java.io.IOException;
import java.nio.channels.FileChannel;
import java.nio.file.Path;
public class ReferenceImplLongArrayDelegate implements LongArray {
private final LongArray delegate;
public ReferenceImplLongArrayDelegate(LongArray delegate) {
this.delegate = delegate;
}
@Override
public long get(long pos) {
return delegate.get(pos);
}
@Override
public void set(long pos, long value) {
delegate.set(pos, value);
}
@Override
public long size() {
return delegate.size();
}
@Override
public void write(Path file) throws IOException {
delegate.write(file);
}
@Override
public void transferFrom(FileChannel source, long sourceStart, long arrayStart, long arrayEnd) throws IOException {
delegate.transferFrom(source, sourceStart, arrayStart, arrayEnd);
}
@Override
public void force() {
delegate.force();
}
@Override
public void advice(NativeIO.Advice advice) throws IOException {
delegate.advice(advice);
}
@Override
public void advice(NativeIO.Advice advice, long start, long end) throws IOException {
delegate.advice(advice, start, end);
}
}

View File

@@ -0,0 +1,199 @@
package nu.marginalia.util.array.delegate;
import com.upserve.uppend.blobs.NativeIO;
import nu.marginalia.util.array.IntArray;
import nu.marginalia.util.array.buffer.IntQueryBuffer;
import nu.marginalia.util.array.functional.IntBinaryIOOperation;
import nu.marginalia.util.array.functional.IntIOTransformer;
import nu.marginalia.util.array.functional.IntTransformer;
import nu.marginalia.util.array.functional.LongIntConsumer;
import java.io.IOException;
import java.nio.IntBuffer;
import java.nio.channels.FileChannel;
import java.nio.file.Path;
public class ShiftedIntArray implements IntArray {
public final long shift;
public final long size;
private final IntArray delegate;
public ShiftedIntArray(long shift, IntArray delegate) {
this.shift = shift;
this.size = delegate.size() - shift;
this.delegate = delegate;
}
public ShiftedIntArray(long start, long end, IntArray delegate) {
this.shift = start;
this.size = end - start;
this.delegate = delegate;
}
@Override
public int get(long pos) {
return delegate.get(pos+shift);
}
@Override
public void set(long pos, int value) {
delegate.set(pos+shift, value);
}
@Override
public void set(long start, long end, IntBuffer buffer, int bufferStart) {
delegate.set(shift + start, shift + end, buffer, bufferStart);
}
@Override
public void get(long start, long end, IntBuffer buffer, int bufferStart) {
delegate.get(shift + start, shift + end, buffer, bufferStart);
}
@Override
public void get(long start, IntBuffer buffer) {
delegate.get(shift + start, buffer);
}
@Override
public void get(long start, long end, int[] buffer) {
delegate.get(shift+start, shift+end, buffer);
}
@Override
public long size() {
return size;
}
@Override
public void write(Path file) throws IOException {
throw new UnsupportedOperationException();
}
@Override
public ShiftedIntArray shifted(long offset) {
return new ShiftedIntArray(shift+offset, delegate);
}
@Override
public ShiftedIntArray range(long start, long end) {
return new ShiftedIntArray(shift + start, shift+end, delegate);
}
public int[] toArray() {
int[] ret = new int[(int) size];
for (int i = 0; i < size; i++) {
ret[i] = delegate.get(shift + i);
}
return ret;
}
public boolean isSorted() {
return isSorted(0, size);
}
public boolean isSorted(long start, long end) {
return delegate.isSorted(shift + start, shift + end);
}
public long search(int key) {
if (size < 128) {
return linearSearch(key);
}
else {
return binarySearch(key);
}
}
public long linearSearch(int key) {
return linearSearch(key, 0, size);
}
public long binarySearch(int key) {
return binarySearch(key, 0, size);
}
public long binarySearchUpperbound(int key) {
return binarySearchUpperBound(key, 0, size);
}
public void retain(IntQueryBuffer buffer, long boundary) {
retain(buffer, boundary, 0, size);
}
public void reject(IntQueryBuffer buffer, long boundary) {
reject(buffer, boundary, 0, size);
}
@Override
public long linearSearch(int key, long fromIndex, long toIndex) {
return translateSearchResult(delegate.linearSearch(key, fromIndex + shift, toIndex+shift));
}
@Override
public long binarySearch(int key, long fromIndex, long toIndex) {
return translateSearchResult(delegate.binarySearch(key, fromIndex + shift, toIndex+shift));
}
@Override
public long binarySearchUpperBound(int key, long fromIndex, long toIndex) {
return translateSearchResult(delegate.binarySearchUpperBound(key, fromIndex + shift, toIndex+shift));
}
private long translateSearchResult(long ret) {
if (ret > 0) return ret - shift;
return ret + shift;
}
@Override
public void retain(IntQueryBuffer buffer, long boundary, long searchStart, long searchEnd) {
delegate.retain(buffer, boundary, searchStart + shift, searchEnd + shift);
}
@Override
public void reject(IntQueryBuffer buffer, long boundary, long searchStart, long searchEnd) {
delegate.reject(buffer, boundary, searchStart + shift, searchEnd + shift);
}
@Override
public void forEach(long start, long end, LongIntConsumer consumer) {
delegate.forEach(start + shift, end+shift, (pos, old) -> consumer.accept(pos-shift, old));
}
@Override
public void transformEach(long start, long end, IntTransformer transformer) {
delegate.transformEach(start + shift, end+shift, (pos, old) -> transformer.transform(pos-shift, old));
}
@Override
public void transformEachIO(long start, long end, IntIOTransformer transformer) throws IOException {
delegate.transformEachIO(start + shift, end+shift, (pos, old) -> transformer.transform(pos-shift, old));
}
@Override
public int foldIO(int zero, long start, long end, IntBinaryIOOperation operator) throws IOException {
return delegate.foldIO(zero, start + shift, end+shift, operator);
}
@Override
public void transferFrom(FileChannel source, long sourceStart, long arrayStart, long arrayEnd) throws IOException {
delegate.transferFrom(source, sourceStart, shift + arrayStart, shift + arrayEnd);
}
@Override
public void force() {
delegate.force();
}
@Override
public void advice(NativeIO.Advice advice) throws IOException {
delegate.advice(advice, shift, shift + size());
}
@Override
public void advice(NativeIO.Advice advice, long start, long end) throws IOException {
delegate.advice(advice, start + shift, end + shift);
}
}
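translateSearchResult maps results back into view coordinates for both outcomes: a hit at absolute position p becomes p - shift, and a miss encoded as -1 - p becomes -1 - p + shift = -1 - (p - shift), the same miss relative to the view. A worked check with shift = 10:

// hit at absolute index 14     ->  14 - 10 = 4 in the view
// miss after absolute index 14 ->  -1 - 14 = -15; -15 + 10 = -5 = -1 - 4
var view = array.range(10, 100);
long rv = view.binarySearch(key, 0, view.size());
long idx = rv >= 0 ? rv : LongArraySearch.decodeSearchMiss(rv) + 1;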

View File

@@ -0,0 +1,255 @@
package nu.marginalia.util.array.delegate;
import com.upserve.uppend.blobs.NativeIO;
import nu.marginalia.util.array.LongArray;
import nu.marginalia.util.array.algo.LongArraySearch;
import nu.marginalia.util.array.buffer.LongQueryBuffer;
import nu.marginalia.util.array.functional.LongBinaryIOOperation;
import nu.marginalia.util.array.functional.LongIOTransformer;
import nu.marginalia.util.array.functional.LongLongConsumer;
import nu.marginalia.util.array.functional.LongTransformer;
import java.io.IOException;
import java.nio.LongBuffer;
import java.nio.channels.FileChannel;
import java.nio.file.Path;
public class ShiftedLongArray implements LongArray {
public final long shift;
public final long size;
private final LongArray delegate;
public ShiftedLongArray(long shift, LongArray delegate) {
this.shift = shift;
this.size = delegate.size() - shift;
this.delegate = delegate;
}
public ShiftedLongArray(long start, long end, LongArray delegate) {
this.shift = start;
this.size = end - start;
this.delegate = delegate;
}
@Override
public long get(long pos) {
return delegate.get(pos+shift);
}
@Override
public void set(long pos, long value) {
delegate.set(pos+shift, value);
}
@Override
public void set(long start, long end, LongBuffer buffer, int bufferStart) {
delegate.set(shift + start, shift + end, buffer, bufferStart);
}
@Override
public void get(long start, long end, LongBuffer buffer, int bufferStart) {
delegate.get(shift + start, shift + end, buffer, bufferStart);
}
@Override
public void get(long start, LongBuffer buffer) {
delegate.get(shift + start, buffer);
}
@Override
public void get(long start, long end, long[] buffer) {
delegate.get(shift+start, shift+end, buffer);
}
@Override
public long size() {
return size;
}
@Override
public void write(Path file) throws IOException {
throw new UnsupportedOperationException();
}
@Override
public ShiftedLongArray shifted(long offset) {
return new ShiftedLongArray(shift+offset, delegate);
}
@Override
public ShiftedLongArray range(long start, long end) {
return new ShiftedLongArray(shift + start, shift+end, delegate);
}
public long[] toArray() {
long[] ret = new long[(int) size];
for (int i = 0; i < size; i++) {
ret[i] = delegate.get(shift + i);
}
return ret;
}
public boolean isSorted() {
return isSorted(0, size);
}
public boolean isSortedN(int sz) {
return isSortedN(sz, 0, size);
}
public boolean isSorted(long start, long end) {
return delegate.isSorted(shift + start, shift + end);
}
public boolean isSortedN(int sz, long start, long end) {
return delegate.isSortedN(sz, shift + start, shift + end);
}
public long searchN(int sz, long key) {
if (size < 128) {
return linearSearchN(sz, key);
}
else {
return binarySearchN(sz, key);
}
}
public long search(long key) {
if (size < 128) {
return linearSearch(key);
}
else {
return binarySearch(key);
}
}
public long linearSearch(long key) {
return linearSearch(key, 0, size);
}
public long binarySearch(long key) {
return binarySearch(key, 0, size);
}
public long binarySearchN(int sz, long key) {
return binarySearchN(sz, key, 0, size);
}
public long linearSearchN(int sz, long key) {
return linearSearchN(sz, key, 0, size);
}
public void retain(LongQueryBuffer buffer, long boundary) {
retain(buffer, boundary, 0, size);
}
public void retainN(LongQueryBuffer buffer, int sz, long boundary) {
if (sz == 1)
retain(buffer, boundary, 0, size);
else
retainN(buffer, sz, boundary, 0, size);
}
public void reject(LongQueryBuffer buffer, long boundary) {
reject(buffer, boundary, 0, size);
}
public void rejectN(LongQueryBuffer buffer, int sz, long boundary) {
if (sz == 1)
reject(buffer, boundary, 0, size);
else
rejectN(buffer, sz, boundary, 0, size);
}
@Override
public long linearSearch(long key, long fromIndex, long toIndex) {
return translateSearchResult(delegate.linearSearch(key, fromIndex + shift, toIndex+shift));
}
@Override
public long linearSearchN(int sz, long key, long fromIndex, long toIndex) {
return translateSearchResult(delegate.linearSearchN(sz, key, fromIndex + shift, toIndex+shift));
}
@Override
public long binarySearch(long key, long fromIndex, long toIndex) {
return translateSearchResult(delegate.binarySearch(key, fromIndex + shift, toIndex+shift));
}
@Override
public long binarySearchN(int sz, long key, long fromIndex, long toIndex) {
return translateSearchResult(delegate.binarySearchN(sz, key, fromIndex + shift, toIndex+shift));
}
@Override
public long binarySearchUpperBound(long key, long fromIndex, long toIndex) {
return translateSearchResult(delegate.binarySearchUpperBound(key, fromIndex + shift, toIndex+shift));
}
@Override
public long linearSearchUpperBound(long key, long fromIndex, long toIndex) {
return translateSearchResult(delegate.linearSearchUpperBound(key, fromIndex + shift, toIndex+shift));
}
@Override
public long binarySearchUpperBoundN(int sz, long key, long fromIndex, long toIndex) {
return translateSearchResult(delegate.binarySearchUpperBoundN(sz, key, fromIndex + shift, toIndex+shift));
}
private long translateSearchResult(long delegatedIdx) {
long ret;
if (delegatedIdx >= 0) ret = delegatedIdx - shift;
else ret = LongArraySearch.encodeSearchMiss(Math.max(0, LongArraySearch.decodeSearchMiss(delegatedIdx) - shift));
return ret;
}
public void retain(LongQueryBuffer buffer, long boundary, long searchStart, long searchEnd) {
delegate.retain(buffer, boundary, searchStart + shift, searchEnd + shift);
}
public void retainN(LongQueryBuffer buffer, int sz, long boundary, long searchStart, long searchEnd) {
delegate.retainN(buffer, sz, boundary, searchStart + shift, searchEnd + shift);
}
public void reject(LongQueryBuffer buffer, long boundary, long searchStart, long searchEnd) {
delegate.reject(buffer, boundary, searchStart + shift, searchEnd + shift);
}
public void rejectN(LongQueryBuffer buffer, int sz, long boundary, long searchStart, long searchEnd) {
delegate.rejectN(buffer, sz, boundary, searchStart + shift, searchEnd + shift);
}
@Override
public void forEach(long start, long end, LongLongConsumer consumer) {
delegate.forEach(start + shift, end+shift, (pos, old) -> consumer.accept(pos-shift, old));
}
@Override
public void transformEach(long start, long end, LongTransformer transformer) {
delegate.transformEach(start + shift, end+shift, (pos, old) -> transformer.transform(pos-shift, old));
}
@Override
public void transformEachIO(long start, long end, LongIOTransformer transformer) throws IOException {
delegate.transformEachIO(start + shift, end+shift, (pos, old) -> transformer.transform(pos-shift, old));
}
@Override
public long foldIO(long zero, long start, long end, LongBinaryIOOperation operator) throws IOException {
return delegate.foldIO(zero, start + shift, end+shift, operator);
}
@Override
public void transferFrom(FileChannel source, long sourceStart, long arrayStart, long arrayEnd) throws IOException {
delegate.transferFrom(source, sourceStart, shift + arrayStart, shift + arrayEnd);
}
@Override
public void force() {
delegate.force();
}
@Override
public void advice(NativeIO.Advice advice) throws IOException {
delegate.advice(advice, shift, shift + size());
}
@Override
public void advice(NativeIO.Advice advice, long start, long end) throws IOException {
delegate.advice(advice, start + shift, end + shift);
}
}
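
ShiftedLongArray is what backs the range() views used throughout this commit. A minimal usage sketch, assuming LongArray provides the range() default overridden above and using LongArrayPage.onHeap() from later in this commit:

    LongArray base = LongArrayPage.onHeap(8);
    base.set(5, 42L);
    LongArray view = base.range(4, 8);  // shift = 4, size = 4
    assert view.get(1) == 42L;          // view index 1 -> base index 5
    assert view.size() == 4;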

View File

@ -0,0 +1,5 @@
package nu.marginalia.util.array.functional;
public interface AddressRangeCall<T> {
void apply(T array, int start, int end);
}

View File

@ -0,0 +1,7 @@
package nu.marginalia.util.array.functional;
import java.io.IOException;
public interface AddressRangeCallIO<T> {
void apply(T array, int start, int end) throws IOException;
}

View File

@ -0,0 +1,5 @@
package nu.marginalia.util.array.functional;
public interface AddressRangeIntFunction<T> {
int apply(T array, int start, int end);
}

View File

@ -0,0 +1,5 @@
package nu.marginalia.util.array.functional;
public interface AddressRangeLongFunction<T> {
long apply(T array, int start, int end);
}

View File

@ -0,0 +1,7 @@
package nu.marginalia.util.array.functional;
import java.io.IOException;
public interface IntBinaryIOOperation {
int apply(int left, int right) throws IOException;
}

View File

@ -0,0 +1,7 @@
package nu.marginalia.util.array.functional;
import java.io.IOException;
public interface IntIOTransformer {
int transform(long pos, int old) throws IOException;
}

View File

@ -0,0 +1,5 @@
package nu.marginalia.util.array.functional;
public interface IntTransformer {
int transform(long pos, int old);
}

View File

@ -0,0 +1,7 @@
package nu.marginalia.util.array.functional;
import java.io.IOException;
public interface LongBinaryIOOperation {
long apply(long left, long right) throws IOException;
}

View File

@ -0,0 +1,7 @@
package nu.marginalia.util.array.functional;
import java.io.IOException;
public interface LongIOTransformer {
long transform(long pos, long old) throws IOException;
}

View File

@ -0,0 +1,5 @@
package nu.marginalia.util.array.functional;
public interface LongIntConsumer {
void accept(long pos, int val);
}

View File

@ -0,0 +1,5 @@
package nu.marginalia.util.array.functional;
public interface LongLongConsumer {
void accept(long pos, long val);
}

View File

@ -0,0 +1,5 @@
package nu.marginalia.util.array.functional;
public interface LongTransformer {
long transform(long pos, long old);
}

View File

@ -0,0 +1,21 @@
package nu.marginalia.util.array.functor;
import nu.marginalia.util.array.functional.AddressRangeCallIO;
import nu.marginalia.util.array.functional.IntBinaryIOOperation;
import nu.marginalia.util.array.page.IntArrayPage;
import java.io.IOException;
public class IntIOFolder implements AddressRangeCallIO<IntArrayPage> {
public int acc;
private final IntBinaryIOOperation operator;
public IntIOFolder(int zero, IntBinaryIOOperation operator) {
this.acc = zero;
this.operator = operator;
}
public void apply(IntArrayPage array, int start, int end) throws IOException {
acc = array.foldIO(acc, start, end, operator);
}
}

View File

@ -0,0 +1,21 @@
package nu.marginalia.util.array.functor;
import nu.marginalia.util.array.functional.AddressRangeCallIO;
import nu.marginalia.util.array.functional.LongBinaryIOOperation;
import nu.marginalia.util.array.page.LongArrayPage;
import java.io.IOException;
public class LongIOFolder implements AddressRangeCallIO<LongArrayPage> {
public long acc;
private final LongBinaryIOOperation operator;
public LongIOFolder(long zero, LongBinaryIOOperation operator) {
this.acc = zero;
this.operator = operator;
}
public void apply(LongArrayPage array, int start, int end) throws IOException {
acc = array.foldIO(acc, start, end, operator);
}
}

View File

@ -0,0 +1,88 @@
package nu.marginalia.util.array.page;
import nu.marginalia.util.array.algo.BulkTransferArray;
import nu.marginalia.util.array.functional.AddressRangeCall;
import nu.marginalia.util.array.functional.AddressRangeCallIO;
import nu.marginalia.util.array.scheme.ArrayPartitioningScheme;
import java.io.IOException;
import static nu.marginalia.util.array.algo.LongArraySearch.decodeSearchMiss;
import static nu.marginalia.util.array.algo.LongArraySearch.encodeSearchMiss;
public class AbstractPagingArray<T extends BulkTransferArray<B>, B> {
final T[] pages;
final long size;
final ArrayPartitioningScheme partitioningScheme;
public AbstractPagingArray(ArrayPartitioningScheme partitioningScheme, T[] pages, long size) {
this.partitioningScheme = partitioningScheme;
this.pages = pages;
this.size = size;
}
void delegateToEachPage(long start, long end, AddressRangeCall<T> fn) {
assert end >= start;
int page = partitioningScheme.getPage(start);
long endPos;
for (long pos = start; pos < end; pos = endPos) {
endPos = partitioningScheme.getPageEnd(pos, end);
int sOff = partitioningScheme.getOffset(pos);
int eOff = partitioningScheme.getEndOffset(start, endPos);
fn.apply(pages[page++], sOff, eOff);
}
}
void delegateToEachPageIO(long start, long end, AddressRangeCallIO<T> fn) throws IOException {
assert end >= start;
int page = partitioningScheme.getPage(start);
long endPos;
for (long pos = start; pos < end; pos = endPos) {
endPos = partitioningScheme.getPageEnd(pos, end);
int sOff = partitioningScheme.getOffset(pos);
int eOff = partitioningScheme.getEndOffset(start, endPos);
fn.apply(pages[page++], sOff, eOff);
}
}
long translateSearchResultsFromPage(long fromIndex, long ret) {
int page = partitioningScheme.getPage(fromIndex);
if (ret >= 0) {
return partitioningScheme.toRealIndex(page, (int) ret);
} else {
ret = decodeSearchMiss(ret);
ret = partitioningScheme.toRealIndex(page, (int) ret);
return encodeSearchMiss(ret);
}
}
public void set(long start, long end, B buffer, int bufferStart) {
assert end >= start;
int page = partitioningScheme.getPage(start);
long endPos;
for (long pos = start; pos < end; pos = endPos) {
endPos = partitioningScheme.getPageEnd(pos, end);
int sOff = partitioningScheme.getOffset(pos);
int eOff = partitioningScheme.getEndOffset(start, endPos);
pages[page++].set(sOff, eOff, buffer, bufferStart);
bufferStart += eOff - sOff;
}
}
}
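
To illustrate how delegateToEachPage splits a span across page boundaries, assume a SequentialPartitioningScheme (defined later in this commit) with partition size 100. The span [250, 430) is delegated as three calls:

    // fn.apply(pages[2], 50, 100)   covers [250, 300)
    // fn.apply(pages[3],  0, 100)   covers [300, 400)
    // fn.apply(pages[4],  0,  30)   covers [400, 430)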

View File

@ -0,0 +1,120 @@
package nu.marginalia.util.array.page;
import com.upserve.uppend.blobs.NativeIO;
import nu.marginalia.util.array.IntArray;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.IntBuffer;
import java.nio.MappedByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.file.Files;
import java.nio.file.OpenOption;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
public class IntArrayPage implements PartitionPage, IntArray {
final IntBuffer intBuffer;
final ByteBuffer byteBuffer;
private IntArrayPage(ByteBuffer byteBuffer) {
this.byteBuffer = byteBuffer;
this.intBuffer = byteBuffer.asIntBuffer();
}
public static IntArrayPage onHeap(int size) {
return new IntArrayPage(ByteBuffer.allocateDirect(WORD_SIZE*size));
}
public static IntArrayPage fromMmapReadOnly(Path file, long offset, int size) throws IOException {
return new IntArrayPage(mmapFile(file, offset, size, FileChannel.MapMode.READ_ONLY, StandardOpenOption.READ));
}
public static IntArrayPage fromMmapReadWrite(Path file, long offset, int size) throws IOException {
return new IntArrayPage(mmapFile(file, offset, size, FileChannel.MapMode.READ_WRITE, StandardOpenOption.READ, StandardOpenOption.WRITE, StandardOpenOption.CREATE));
}
private static ByteBuffer mmapFile(Path file, long offset, int size, FileChannel.MapMode mode, OpenOption... openOptions) throws IOException {
try (var channel = (FileChannel) Files.newByteChannel(file, openOptions)) {
return channel.map(mode, WORD_SIZE*offset, (long) size*WORD_SIZE);
}
catch (IOException ex) {
throw new IOException("Failed to map file " + file + " (" + offset + ":" + size + ")", ex);
}
}
@Override
public int get(long at) {
return intBuffer.get((int) at);
}
@Override
public void get(long start, long end, int[] buffer) {
intBuffer.get((int) start, buffer, 0, (int) (end - start));
}
@Override
public void set(long at, int val) {
intBuffer.put((int) at, val);
}
@Override
public void set(long start, long end, IntBuffer buffer, int bufferStart) {
intBuffer.put((int) start, buffer, bufferStart, (int) (end-start));
}
@Override
public long size() {
return intBuffer.capacity();
}
public void increment(int at) {
set(at, get(at) + 1);
}
@Override
public ByteBuffer getByteBuffer() {
return byteBuffer;
}
@Override
public void write(Path filename) throws IOException {
try (var channel = (FileChannel) Files.newByteChannel(filename, StandardOpenOption.WRITE, StandardOpenOption.CREATE)) {
write(channel);
}
}
@Override
public void force() {
if (byteBuffer instanceof MappedByteBuffer mb) {
mb.force();
}
}
@Override
public void transferFrom(FileChannel source, long sourceStart, long arrayStart, long arrayEnd) throws IOException {
int index = (int) (arrayStart * WORD_SIZE);
int length = (int) ((arrayEnd - arrayStart) * WORD_SIZE);
var slice = byteBuffer.slice(index, length);
long startPos = sourceStart * WORD_SIZE;
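// FileChannel.read() may return after a partial read, so loop until the slice is full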
while (slice.position() < slice.capacity()) {
source.read(slice, startPos + slice.position());
}
}
@Override
public void advice(NativeIO.Advice advice) throws IOException {
NativeIO.madvise((MappedByteBuffer) byteBuffer, advice);
}
@Override
public void advice(NativeIO.Advice advice, long start, long end) throws IOException {
NativeIO.madviseRange((MappedByteBuffer) byteBuffer, advice, (int) start, (int) (end-start));
}
}
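
A minimal round-trip sketch for the page type above (assuming WORD_SIZE is the 4-byte int word size declared on the IntArray interface):

    Path f = Files.createTempFile("page", ".dat");
    IntArrayPage page = IntArrayPage.onHeap(4);
    page.set(0, 7);
    page.set(3, 9);
    page.write(f);
    IntArrayPage mapped = IntArrayPage.fromMmapReadOnly(f, 0, 4);
    assert mapped.get(0) == 7 && mapped.get(3) == 9;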

View File

@ -0,0 +1,135 @@
package nu.marginalia.util.array.page;
import com.upserve.uppend.blobs.NativeIO;
import nu.marginalia.util.array.LongArray;
import nu.marginalia.util.array.trace.ArrayTrace;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.LongBuffer;
import java.nio.MappedByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.file.Files;
import java.nio.file.OpenOption;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
public class LongArrayPage implements PartitionPage, LongArray {
final ArrayTrace trace = ArrayTrace.get(this);
final LongBuffer longBuffer;
final ByteBuffer byteBuffer;
private LongArrayPage(ByteBuffer byteBuffer) {
this.byteBuffer = byteBuffer;
this.longBuffer = byteBuffer.asLongBuffer();
}
public static LongArrayPage onHeap(int size) {
return new LongArrayPage(ByteBuffer.allocateDirect(WORD_SIZE*size));
}
public static LongArrayPage fromMmapReadOnly(Path file, long offset, int size) throws IOException {
return new LongArrayPage(mmapFile(file, offset, size, FileChannel.MapMode.READ_ONLY, StandardOpenOption.READ));
}
public static LongArrayPage fromMmapReadWrite(Path file, long offset, int size) throws IOException {
return new LongArrayPage(mmapFile(file, offset, size, FileChannel.MapMode.READ_WRITE, StandardOpenOption.READ, StandardOpenOption.WRITE, StandardOpenOption.CREATE));
}
private static ByteBuffer mmapFile(Path file, long offset, int size, FileChannel.MapMode mode, OpenOption... openOptions) throws IOException {
try (var channel = (FileChannel) Files.newByteChannel(file, openOptions)) {
return channel.map(mode, WORD_SIZE*offset, (long) size*WORD_SIZE);
}
catch (IOException ex) {
throw new IOException("Failed to map file " + file + " (" + offset + ":" + size + ")", ex);
}
}
@Override
public long get(long at) {
try {
trace.touch(at);
return longBuffer.get((int) at);
}
catch (IndexOutOfBoundsException ex) {
throw new IndexOutOfBoundsException("@" + at + "(" + 0 + ":" + longBuffer.capacity() + ")");
}
}
@Override
public void get(long start, long end, long[] buffer) {
trace.touch(start, end);
longBuffer.get((int) start, buffer, 0, (int) (end - start));
}
@Override
public void set(long at, long val) {
trace.touch(at);
longBuffer.put((int) at, val);
}
@Override
public void set(long start, long end, LongBuffer buffer, int bufferStart) {
longBuffer.put((int) start, buffer, bufferStart, (int) (end-start));
}
@Override
public long size() {
return longBuffer.capacity();
}
public void increment(int at) {
set(at, get(at) + 1);
}
@Override
public ByteBuffer getByteBuffer() {
return byteBuffer;
}
@Override
public void write(Path filename) throws IOException {
try (var channel = (FileChannel) Files.newByteChannel(filename, StandardOpenOption.WRITE, StandardOpenOption.CREATE)) {
write(channel);
}
}
@Override
public void force() {
if (byteBuffer instanceof MappedByteBuffer mb) {
mb.force();
}
}
@Override
public void transferFrom(FileChannel source, long sourceStart, long arrayStart, long arrayEnd) throws IOException {
trace.touch(arrayStart, arrayEnd);
int index = (int) (arrayStart * WORD_SIZE);
int length = (int) ((arrayEnd - arrayStart) * WORD_SIZE);
var slice = byteBuffer.slice(index, length);
long startPos = sourceStart * WORD_SIZE;
while (slice.position() < slice.capacity()) {
source.read(slice, startPos + slice.position());
}
}
@Override
public void advice(NativeIO.Advice advice) throws IOException {
NativeIO.madvise((MappedByteBuffer) byteBuffer, advice);
}
@Override
public void advice(NativeIO.Advice advice, long start, long end) throws IOException {
NativeIO.madviseRange((MappedByteBuffer) byteBuffer, advice, (int) start, (int) (end-start));
}
}

View File

@ -0,0 +1,330 @@
package nu.marginalia.util.array.page;
import com.upserve.uppend.blobs.NativeIO;
import nu.marginalia.util.array.IntArray;
import nu.marginalia.util.array.buffer.IntQueryBuffer;
import nu.marginalia.util.array.delegate.ReferenceImplIntArrayDelegate;
import nu.marginalia.util.array.functional.IntBinaryIOOperation;
import nu.marginalia.util.array.functional.IntIOTransformer;
import nu.marginalia.util.array.functional.IntTransformer;
import nu.marginalia.util.array.functional.LongIntConsumer;
import nu.marginalia.util.array.functor.IntIOFolder;
import nu.marginalia.util.array.scheme.ArrayPartitioningScheme;
import java.io.IOException;
import java.nio.IntBuffer;
import java.nio.channels.FileChannel;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
public class PagingIntArray extends AbstractPagingArray<IntArrayPage, IntBuffer> implements IntArray {
private final ReferenceImplIntArrayDelegate defaults;
private PagingIntArray(ArrayPartitioningScheme partitioningScheme,
IntArrayPage[] pages,
long size) {
super(partitioningScheme, pages, size);
defaults = new ReferenceImplIntArrayDelegate(this);
}
public static IntArray newOnHeap(ArrayPartitioningScheme partitioningScheme, long cardinality) {
if (cardinality < MAX_CONTINUOUS_SIZE) {
return IntArrayPage.onHeap((int) cardinality);
}
return newPartitionedOnHeap(partitioningScheme, cardinality);
}
public static IntArray newPartitionedOnHeap(ArrayPartitioningScheme partitioningScheme, long cardinality) {
IntArrayPage[] pages = new IntArrayPage[partitioningScheme.getPartitions(cardinality)];
for (int i = 0; i < pages.length; i++) {
pages[i] = IntArrayPage.onHeap(partitioningScheme.getRequiredPageSize(i, cardinality));
}
return new PagingIntArray(partitioningScheme, pages, cardinality);
}
public static PagingIntArray mapFileReadOnly(ArrayPartitioningScheme partitioningScheme, Path file)
throws IOException
{
long sizeBytes = Files.size(file);
assert sizeBytes % WORD_SIZE == 0;
long size = sizeBytes / WORD_SIZE;
IntArrayPage[] pages = new IntArrayPage[partitioningScheme.getPartitions(size)];
long offset = 0;
for (int i = 0; i < pages.length; i++) {
int partitionSize = partitioningScheme.getRequiredPageSize(i, size);
pages[i] = IntArrayPage.fromMmapReadOnly(file, offset, partitionSize);
offset += partitionSize;
}
return new PagingIntArray(partitioningScheme, pages, size);
}
public static PagingIntArray mapFileReadWrite(ArrayPartitioningScheme partitioningScheme, Path file)
throws IOException
{
long sizeBytes = Files.size(file);
assert sizeBytes % WORD_SIZE == 0;
long size = sizeBytes / WORD_SIZE;
IntArrayPage[] pages = new IntArrayPage[partitioningScheme.getPartitions(size)];
long offset = 0;
for (int i = 0; i < pages.length; i++) {
int partitionSize = partitioningScheme.getRequiredPageSize(i, size);
pages[i] = IntArrayPage.fromMmapReadWrite(file, offset, partitionSize);
offset += partitionSize;
}
return new PagingIntArray(partitioningScheme, pages, size);
}
public static PagingIntArray mapFileReadWrite(ArrayPartitioningScheme partitioningScheme, Path file, long size)
throws IOException
{
IntArrayPage[] pages = new IntArrayPage[partitioningScheme.getPartitions(size)];
long offset = 0;
for (int i = 0; i < pages.length; i++) {
int partitionSize = partitioningScheme.getRequiredPageSize(i, size);
pages[i] = IntArrayPage.fromMmapReadWrite(file, offset, partitionSize);
offset += partitionSize;
}
return new PagingIntArray(partitioningScheme, pages, size);
}
public int get(long pos) {
int page = partitioningScheme.getPage(pos);
int offset = partitioningScheme.getOffset(pos);
try {
return pages[page].get(partitioningScheme.getOffset(pos));
}
catch (IndexOutOfBoundsException ex) {
throw new IndexOutOfBoundsException("Index out of bounds for " + pos + " => (" + page + ":" + offset + ")");
}
}
@Override
public void get(long start, long end, int[] buffer) {
if (partitioningScheme.isSamePage(start, end)) {
int sOff = partitioningScheme.getOffset(start);
int eOff = partitioningScheme.getEndOffset(start, end);
pages[partitioningScheme.getPage(start)].get(sOff, eOff, buffer);
}
else {
defaults.get(start, end, buffer);
}
}
@Override
public void set(long pos, int value) {
int page = partitioningScheme.getPage(pos);
int offset = partitioningScheme.getOffset(pos);
try {
pages[page].set(offset, value);
}
catch (IndexOutOfBoundsException ex) {
throw new IndexOutOfBoundsException("Index out of bounds for " + pos + " => (" + page + ":" + offset + ")");
}
}
@Override
public long size() {
return size;
}
@Override
public void increment(long pos) {
int page = partitioningScheme.getPage(pos);
int offset = partitioningScheme.getOffset(pos);
try {
pages[page].increment(partitioningScheme.getOffset(pos));
}
catch (IndexOutOfBoundsException ex) {
throw new IndexOutOfBoundsException("Index out of bounds for " + pos + " => (" + page + ":" + offset + ")");
}
}
@Override
public void forEach(long start, long end, LongIntConsumer consumer) {
delegateToEachPage(start, end, (page, s, e) -> page.forEach(s, e, consumer));
}
@Override
public void fill(long fromIndex, long toIndex, int value) {
if (partitioningScheme.isSamePage(fromIndex, toIndex)) {
int sOff = partitioningScheme.getOffset(fromIndex);
int eOff = partitioningScheme.getEndOffset(fromIndex, toIndex);
pages[partitioningScheme.getPage(fromIndex)].fill(sOff, eOff, value);
}
else if (toIndex >= fromIndex) {
delegateToEachPage(fromIndex, toIndex, (page, s, e) -> page.fill(s, e, value));
}
}
@Override
public void transformEach(long start, long end, IntTransformer transformer) {
delegateToEachPage(start, end, (page, s, e) -> page.transformEach(s, e, transformer));
}
@Override
public void transformEachIO(long start, long end, IntIOTransformer transformer) throws IOException {
delegateToEachPageIO(start, end, (page, s, e) -> page.transformEachIO(s, e, transformer));
}
@Override
public int foldIO(int zero, long start, long end, IntBinaryIOOperation operator) throws IOException {
var folder = new IntIOFolder(zero, operator);
delegateToEachPageIO(start, end, folder);
return folder.acc;
}
@Override
public long linearSearch(int key, long fromIndex, long toIndex) {
if (partitioningScheme.isSamePage(fromIndex, toIndex)) {
int sOff = partitioningScheme.getOffset(fromIndex);
int eOff = partitioningScheme.getEndOffset(fromIndex, toIndex);
long ret = pages[partitioningScheme.getPage(fromIndex)].linearSearch(key, sOff, eOff);
return translateSearchResultsFromPage(fromIndex, ret);
}
else {
return defaults.linearSearch(key, fromIndex, toIndex);
}
}
@Override
public long binarySearch(int key, long fromIndex, long toIndex) {
if (partitioningScheme.isSamePage(fromIndex, toIndex)) {
int sOff = partitioningScheme.getOffset(fromIndex);
int eOff = partitioningScheme.getEndOffset(fromIndex, toIndex);
long ret = pages[partitioningScheme.getPage(fromIndex)].binarySearch(key, sOff, eOff);
return translateSearchResultsFromPage(fromIndex, ret);
}
else {
return defaults.binarySearch(key, fromIndex, toIndex);
}
}
@Override
public long binarySearchUpperBound(int key, long fromIndex, long toIndex) {
if (partitioningScheme.isSamePage(fromIndex, toIndex)) {
int sOff = partitioningScheme.getOffset(fromIndex);
int eOff = partitioningScheme.getEndOffset(fromIndex, toIndex);
long ret = pages[partitioningScheme.getPage(fromIndex)].binarySearchUpperBound(key, sOff, eOff);
return translateSearchResultsFromPage(fromIndex, ret);
}
else {
return defaults.binarySearchUpperBound(key, fromIndex, toIndex);
}
}
@Override
public void retain(IntQueryBuffer buffer, long boundary, long searchStart, long searchEnd) {
if (partitioningScheme.isSamePage(searchStart, searchEnd)) {
int sOff = partitioningScheme.getOffset(searchStart);
int eOff = partitioningScheme.getEndOffset(searchStart, searchEnd);
if (eOff > sOff) {
pages[partitioningScheme.getPage(searchStart)].retain(buffer, boundary, sOff, eOff);
}
}
else {
defaults.retain(buffer, boundary, searchStart, searchEnd);
}
}
@Override
public void reject(IntQueryBuffer buffer, long boundary, long searchStart, long searchEnd) {
if (partitioningScheme.isSamePage(searchStart, searchEnd)) {
int sOff = partitioningScheme.getOffset(searchStart);
int eOff = partitioningScheme.getEndOffset(searchStart, searchEnd);
if (eOff > sOff) {
pages[partitioningScheme.getPage(searchStart)].reject(buffer, boundary, sOff, eOff);
}
}
else {
defaults.reject(buffer, boundary, searchStart, searchEnd);
}
}
public void write(Path fileName) throws IOException {
try (var channel = (FileChannel) Files.newByteChannel(fileName, StandardOpenOption.CREATE, StandardOpenOption.WRITE)) {
for (int i = 0; i < pages.length; i++) {
pages[i].write(channel);
}
channel.force(false);
}
}
public long getSize() {
if (size < 0) {
throw new UnsupportedOperationException();
}
return size;
}
@Override
public void force() {
for (var page : pages) {
page.force();
}
}
@Override
public void advice(NativeIO.Advice advice) throws IOException {
for (var page : pages) {
page.advice(advice);
}
}
@Override
public void advice(NativeIO.Advice advice, long start, long end) throws IOException {
delegateToEachPageIO(start, end, (a,s,e) -> a.advice(advice, s, e));
}
public void transferFrom(FileChannel source, long sourceStart, long arrayStart, long arrayEnd) throws IOException {
assert arrayEnd >= arrayStart;
int page = partitioningScheme.getPage(arrayStart);
long endPos;
for (long pos = arrayStart; pos < arrayEnd; pos = endPos) {
endPos = partitioningScheme.getPageEnd(pos, arrayEnd);
int sOff = partitioningScheme.getOffset(pos);
int eOff = partitioningScheme.getEndOffset(pos, endPos);
pages[page++].transferFrom(source, sourceStart, sOff, eOff);
sourceStart+=(endPos - pos);
}
}
}
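
Putting the pieces together, a hedged usage sketch (the file path and sizes are illustrative only):

    var scheme = ArrayPartitioningScheme.forPartitionSize(1 << 20);   // 1M ints per page
    IntArray array = PagingIntArray.mapFileReadWrite(scheme, Path.of("/tmp/ints.dat"), 3_000_000);
    array.fill(0, array.size(), -1);
    array.increment(2_500_000);   // lands on page 2; the caller does no paging math
    array.force();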

View File

@ -0,0 +1,498 @@
package nu.marginalia.util.array.page;
import com.upserve.uppend.blobs.NativeIO;
import nu.marginalia.util.array.LongArray;
import nu.marginalia.util.array.buffer.LongQueryBuffer;
import nu.marginalia.util.array.delegate.ReferenceImplLongArrayDelegate;
import nu.marginalia.util.array.functional.LongBinaryIOOperation;
import nu.marginalia.util.array.functional.LongIOTransformer;
import nu.marginalia.util.array.functional.LongLongConsumer;
import nu.marginalia.util.array.functional.LongTransformer;
import nu.marginalia.util.array.functor.LongIOFolder;
import nu.marginalia.util.array.scheme.ArrayPartitioningScheme;
import java.io.IOException;
import java.nio.LongBuffer;
import java.nio.channels.FileChannel;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
public class PagingLongArray extends AbstractPagingArray<LongArrayPage, LongBuffer> implements LongArray {
private final ReferenceImplLongArrayDelegate defaults;
private PagingLongArray(ArrayPartitioningScheme partitioningScheme, LongArrayPage[] pages, long size) {
super(partitioningScheme, pages, size);
defaults = new ReferenceImplLongArrayDelegate(this);
}
public static LongArray newOnHeap(ArrayPartitioningScheme partitioningScheme, long cardinality) {
return newPartitionedOnHeap(partitioningScheme, cardinality);
}
public static LongArray newPartitionedOnHeap(ArrayPartitioningScheme partitioningScheme, long cardinality) {
LongArrayPage[] pages = new LongArrayPage[partitioningScheme.getPartitions(cardinality)];
for (int i = 0; i < pages.length; i++) {
pages[i] = LongArrayPage.onHeap(partitioningScheme.getRequiredPageSize(i, cardinality));
}
return new PagingLongArray(partitioningScheme, pages, cardinality);
}
public static PagingLongArray mapFileReadOnly(ArrayPartitioningScheme partitioningScheme, Path file)
throws IOException
{
long sizeBytes = Files.size(file);
assert sizeBytes % WORD_SIZE == 0;
long size = sizeBytes / WORD_SIZE;
LongArrayPage[] pages = new LongArrayPage[partitioningScheme.getPartitions(size)];
long offset = 0;
for (int i = 0; i < pages.length; i++) {
int partitionSize = partitioningScheme.getRequiredPageSize(i, size);
pages[i] = LongArrayPage.fromMmapReadOnly(file, offset, partitionSize);
offset += partitionSize;
}
return new PagingLongArray(partitioningScheme, pages, size);
}
public static PagingLongArray mapFileReadWrite(ArrayPartitioningScheme partitioningScheme, Path file)
throws IOException
{
long sizeBytes = Files.size(file);
assert sizeBytes % WORD_SIZE == 0;
long size = sizeBytes / WORD_SIZE;
LongArrayPage[] pages = new LongArrayPage[partitioningScheme.getPartitions(size)];
long offset = 0;
for (int i = 0; i < pages.length; i++) {
int partitionSize = partitioningScheme.getRequiredPageSize(i, size);
pages[i] = LongArrayPage.fromMmapReadWrite(file, offset, partitionSize);
offset += partitionSize;
}
return new PagingLongArray(partitioningScheme, pages, size);
}
public static PagingLongArray mapFileReadWrite(ArrayPartitioningScheme partitioningScheme, Path file, long size)
throws IOException
{
LongArrayPage[] pages = new LongArrayPage[partitioningScheme.getPartitions(size)];
long offset = 0;
for (int i = 0; i < pages.length; i++) {
int partitionSize = partitioningScheme.getRequiredPageSize(i, size);
pages[i] = LongArrayPage.fromMmapReadWrite(file, offset, partitionSize);
offset += partitionSize;
}
return new PagingLongArray(partitioningScheme, pages, size);
}
@Override
public long get(long pos) {
int page = partitioningScheme.getPage(pos);
int offset = partitioningScheme.getOffset(pos);
try {
return pages[page].get(partitioningScheme.getOffset(pos));
}
catch (IndexOutOfBoundsException ex) {
throw new IndexOutOfBoundsException("Index out of bounds for " + pos + " => (" + page + ":" + offset + ")");
}
}
@Override
public void get(long start, long end, long[] buffer) {
if (partitioningScheme.isSamePage(start, end)) {
int sOff = partitioningScheme.getOffset(start);
int eOff = partitioningScheme.getEndOffset(start, end);
pages[partitioningScheme.getPage(start)].get(sOff, eOff, buffer);
}
else {
defaults.get(start, end, buffer);
}
}
@Override
public void set(long pos, long value) {
int page = partitioningScheme.getPage(pos);
int offset = partitioningScheme.getOffset(pos);
try {
pages[page].set(offset, value);
}
catch (IndexOutOfBoundsException ex) {
throw new IndexOutOfBoundsException("Index out of bounds for " + pos + " => (" + page + ":" + offset + ")");
}
}
@Override
public long size() {
return size;
}
@Override
public void increment(long pos) {
int page = partitioningScheme.getPage(pos);
int offset = partitioningScheme.getOffset(pos);
try {
pages[page].increment(partitioningScheme.getOffset(pos));
}
catch (IndexOutOfBoundsException ex) {
throw new IndexOutOfBoundsException("Index out of bounds for " + pos + " => (" + page + ":" + offset + ")");
}
}
@Override
public void forEach(long start, long end, LongLongConsumer transformer) {
delegateToEachPage(start, end, (page, s, e) -> page.forEach(s, e, transformer));
}
@Override
public void fill(long fromIndex, long toIndex, long value) {
if (partitioningScheme.isSamePage(fromIndex, toIndex)) {
int sOff = partitioningScheme.getOffset(fromIndex);
int eOff = partitioningScheme.getEndOffset(fromIndex, toIndex);
pages[partitioningScheme.getPage(fromIndex)].fill(sOff, eOff, value);
}
else {
delegateToEachPage(fromIndex, toIndex, (page, s, e) -> page.fill(s, e, value));
}
}
@Override
public void transformEach(long start, long end, LongTransformer transformer) {
delegateToEachPage(start, end, (page, s, e) -> page.transformEach(s, e, transformer));
}
@Override
public void transformEachIO(long start, long end, LongIOTransformer transformer) throws IOException {
delegateToEachPageIO(start, end, (page, s, e) -> page.transformEachIO(s, e, transformer));
}
@Override
public long foldIO(long zero, long start, long end, LongBinaryIOOperation operator) throws IOException {
var folder = new LongIOFolder(zero, operator);
delegateToEachPageIO(start, end, folder);
return folder.acc;
}
@Override
public long linearSearch(long key, long fromIndex, long toIndex) {
if (partitioningScheme.isSamePage(fromIndex, toIndex)) {
int sOff = partitioningScheme.getOffset(fromIndex);
int eOff = partitioningScheme.getEndOffset(fromIndex, toIndex);
long ret = pages[partitioningScheme.getPage(fromIndex)].linearSearch(key, sOff, eOff);
return translateSearchResultsFromPage(fromIndex, ret);
}
else {
return defaults.linearSearch(key, fromIndex, toIndex);
}
}
@Override
public long linearSearchN(int sz, long key, long fromIndex, long toIndex) {
if (partitioningScheme.isSamePage(fromIndex, toIndex)) {
int sOff = partitioningScheme.getOffset(fromIndex);
int eOff = partitioningScheme.getEndOffset(fromIndex, toIndex);
long ret = pages[partitioningScheme.getPage(fromIndex)].linearSearchN(sz, key, sOff, eOff);
return translateSearchResultsFromPage(fromIndex, ret);
}
else {
return defaults.linearSearchN(sz, key, fromIndex, toIndex);
}
}
@Override
public long binarySearch(long key, long fromIndex, long toIndex) {
if (partitioningScheme.isSamePage(fromIndex, toIndex)) {
int sOff = partitioningScheme.getOffset(fromIndex);
int eOff = partitioningScheme.getEndOffset(fromIndex, toIndex);
long ret = pages[partitioningScheme.getPage(fromIndex)].binarySearch(key, sOff, eOff);
return translateSearchResultsFromPage(fromIndex, ret);
}
else {
return defaults.binarySearch(key, fromIndex, toIndex);
}
}
@Override
public long binarySearchN(int sz, long key, long fromIndex, long toIndex) {
if (partitioningScheme.isSamePage(fromIndex, toIndex)) {
int sOff = partitioningScheme.getOffset(fromIndex);
int eOff = partitioningScheme.getEndOffset(fromIndex, toIndex);
long ret = pages[partitioningScheme.getPage(fromIndex)].binarySearchN(sz, key, sOff, eOff);
return translateSearchResultsFromPage(fromIndex, ret);
}
else {
return defaults.binarySearchN(sz, key, fromIndex, toIndex);
}
}
@Override
public long binarySearchUpperBound(long key, long fromIndex, long toIndex) {
if (partitioningScheme.isSamePage(fromIndex, toIndex)) {
int sOff = partitioningScheme.getOffset(fromIndex);
int eOff = partitioningScheme.getEndOffset(fromIndex, toIndex);
long ret = pages[partitioningScheme.getPage(fromIndex)].binarySearchUpperBound(key, sOff, eOff);
return translateSearchResultsFromPage(fromIndex, ret);
}
else {
return defaults.binarySearchUpperBound(key, fromIndex, toIndex);
}
}
@Override
public long linearSearchUpperBound(long key, long fromIndex, long toIndex) {
if (partitioningScheme.isSamePage(fromIndex, toIndex)) {
int sOff = partitioningScheme.getOffset(fromIndex);
int eOff = partitioningScheme.getEndOffset(fromIndex, toIndex);
long ret = pages[partitioningScheme.getPage(fromIndex)].linearSearchUpperBound(key, sOff, eOff);
return translateSearchResultsFromPage(fromIndex, ret);
}
else {
return defaults.linearSearchUpperBound(key, fromIndex, toIndex);
}
}
@Override
public long binarySearchUpperBoundN(int sz, long key, long fromIndex, long toIndex) {
if (partitioningScheme.isSamePage(fromIndex, toIndex)) {
int sOff = partitioningScheme.getOffset(fromIndex);
int eOff = partitioningScheme.getEndOffset(fromIndex, toIndex);
long ret = pages[partitioningScheme.getPage(fromIndex)].binarySearchUpperBoundN(sz, key, sOff, eOff);
return translateSearchResultsFromPage(fromIndex, ret);
}
else {
return defaults.binarySearchUpperBoundN(sz, key, fromIndex, toIndex);
}
}
@Override
public void retain(LongQueryBuffer buffer, long boundary, long searchStart, long searchEnd) {
if (partitioningScheme.isSamePage(searchStart, searchEnd)) {
int sOff = partitioningScheme.getOffset(searchStart);
int eOff = partitioningScheme.getEndOffset(searchStart, searchEnd);
if (eOff > sOff) {
pages[partitioningScheme.getPage(searchStart)].retain(buffer, boundary, sOff, eOff);
}
}
else {
defaults.retain(buffer, boundary, searchStart, searchEnd);
}
}
@Override
public void retainN(LongQueryBuffer buffer, int sz, long boundary, long searchStart, long searchEnd) {
if (partitioningScheme.isSamePage(searchStart, searchEnd)) {
int sOff = partitioningScheme.getOffset(searchStart);
int eOff = partitioningScheme.getEndOffset(searchStart, searchEnd);
if (eOff > sOff) {
pages[partitioningScheme.getPage(searchStart)].retainN(buffer, sz, boundary, sOff, eOff);
}
}
else {
defaults.retainN(buffer, sz, boundary, searchStart, searchEnd);
}
}
@Override
public void reject(LongQueryBuffer buffer, long boundary, long searchStart, long searchEnd) {
if (partitioningScheme.isSamePage(searchStart, searchEnd)) {
int sOff = partitioningScheme.getOffset(searchStart);
int eOff = partitioningScheme.getEndOffset(searchStart, searchEnd);
if (eOff > sOff) {
pages[partitioningScheme.getPage(searchStart)].reject(buffer, boundary, sOff, eOff);
}
}
else {
defaults.reject(buffer, boundary, searchStart, searchEnd);
}
}
@Override
public void rejectN(LongQueryBuffer buffer, int sz, long boundary, long searchStart, long searchEnd) {
if (partitioningScheme.isSamePage(searchStart, searchEnd)) {
int sOff = partitioningScheme.getOffset(searchStart);
int eOff = partitioningScheme.getEndOffset(searchStart, searchEnd);
if (eOff > sOff) {
pages[partitioningScheme.getPage(searchStart)].rejectN(buffer, sz, boundary, sOff, eOff);
}
}
else {
defaults.rejectN(buffer, sz, boundary, searchStart, searchEnd);
}
}
@Override
public void insertionSort(long start, long end) {
if (partitioningScheme.isSamePage(start, end)) {
int sOff = partitioningScheme.getOffset(start);
int eOff = partitioningScheme.getEndOffset(start, end);
if (eOff > sOff) {
pages[partitioningScheme.getPage(start)].insertionSort(sOff, eOff);
}
}
else {
defaults.insertionSort(start, end);
}
}
@Override
public void insertionSortN(int sz, long start, long end) {
if (partitioningScheme.isSamePage(start, end)) {
int sOff = partitioningScheme.getOffset(start);
int eOff = partitioningScheme.getEndOffset(start, end);
if (eOff > sOff) {
pages[partitioningScheme.getPage(start)].insertionSortN(sz, sOff, eOff);
}
}
else {
defaults.insertionSortN(sz, start, end);
}
}
@Override
public void quickSort(long start, long end) {
if (partitioningScheme.isSamePage(start, end)) {
int sOff = partitioningScheme.getOffset(start);
int eOff = partitioningScheme.getEndOffset(start, end);
if (eOff > sOff) {
pages[partitioningScheme.getPage(start)].quickSort(sOff, eOff);
}
}
else {
defaults.quickSort(start, end);
}
}
@Override
public void quickSortN(int sz, long start, long end) {
if (partitioningScheme.isSamePage(start, end)) {
int sOff = partitioningScheme.getOffset(start);
int eOff = partitioningScheme.getEndOffset(start, end);
if (eOff > sOff) {
pages[partitioningScheme.getPage(start)].quickSortN(sz, sOff, eOff);
}
}
else {
defaults.quickSortN(sz, start, end);
}
}
@Override
public void mergeSort(long start, long end, Path tempDir) throws IOException {
if (partitioningScheme.isSamePage(start, end)) {
int sOff = partitioningScheme.getOffset(start);
int eOff = partitioningScheme.getEndOffset(start, end);
if (eOff > sOff) {
pages[partitioningScheme.getPage(start)].mergeSort(sOff, eOff, tempDir);
}
}
else {
defaults.mergeSort(start, end, tempDir);
}
}
@Override
public void mergeSortN(int sz, long start, long end, Path tempDir) throws IOException {
if (partitioningScheme.isSamePage(start, end)) {
int sOff = partitioningScheme.getOffset(start);
int eOff = partitioningScheme.getEndOffset(start, end);
if (eOff > sOff) {
pages[partitioningScheme.getPage(start)].mergeSortN(sz, sOff, eOff, tempDir);
}
}
else {
defaults.mergeSortN(sz, start, end, tempDir);
}
}
public void write(Path fileName) throws IOException {
try (var channel = (FileChannel) Files.newByteChannel(fileName, StandardOpenOption.CREATE, StandardOpenOption.WRITE)) {
for (int i = 0; i < pages.length; i++) {
pages[i].write(channel);
}
channel.force(false);
}
}
public long getSize() {
if (size < 0) {
throw new UnsupportedOperationException();
}
return size;
}
@Override
public void force() {
for (var page : pages) {
page.force();
}
}
@Override
public void advice(NativeIO.Advice advice) throws IOException {
for (var page : pages) {
page.advice(advice);
}
}
@Override
public void advice(NativeIO.Advice advice, long start, long end) throws IOException {
delegateToEachPageIO(start, end, (a,s,e) -> a.advice(advice, s, e));
}
public void transferFrom(FileChannel source, long sourceStart, long arrayStart, long arrayEnd) throws IOException {
assert arrayEnd >= arrayStart;
int page = partitioningScheme.getPage(arrayStart);
long endPos;
for (long pos = arrayStart; pos < arrayEnd; pos = endPos) {
endPos = partitioningScheme.getPageEnd(pos, arrayEnd);
int sOff = partitioningScheme.getOffset(pos);
int eOff = partitioningScheme.getEndOffset(pos, endPos);
pages[page++].transferFrom(source, sourceStart, sOff, eOff);
sourceStart+=(endPos - pos);
}
}
}
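
The N-suffixed methods treat the array as packed records of sz longs, keyed on the first word of each record; this is how BTreeReader below searches (key, data) entries in one pass. A sketch, assuming the default search implementations compare every sz-th position and return the index of the matching record's first word:

    LongArray a = LongArrayPage.onHeap(6);
    long[] entries = { 10, 100,  20, 200,  30, 300 };   // (key, value) pairs
    for (int i = 0; i < entries.length; i++) a.set(i, entries[i]);
    long idx = a.binarySearchN(2, 20, 0, 6);   // expected: 2, the start of (20, 200)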

View File

@ -0,0 +1,22 @@
package nu.marginalia.util.array.page;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
public interface PartitionPage {
default void write(FileChannel channel) throws IOException {
var byteBuffer = getByteBuffer();
byteBuffer.clear();
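// FileChannel.write() may drain only part of the buffer per call, hence the loop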
while (byteBuffer.position() < byteBuffer.limit()) {
channel.write(byteBuffer);
}
byteBuffer.clear();
}
ByteBuffer getByteBuffer();
}

View File

@ -0,0 +1,51 @@
package nu.marginalia.util.array.scheme;
public interface ArrayPartitioningScheme {
static ArrayPartitioningScheme forPartitionSize(int size) {
if (Integer.highestOneBit(size) == size) {
return new PowerOf2PartitioningScheme(size);
}
else {
return new SequentialPartitioningScheme(size);
}
}
static int getRequiredPartitions(long cardinality, int partitionSize) {
return (int) (cardinality / partitionSize + Long.signum(cardinality % partitionSize));
}
int getPartitions(long cardinality);
int getPage(long at);
boolean isSamePage(long a, long b);
/** Get the page offset corresponding to at */
int getOffset(long at);
/** Variant of getOffset that doesn't wrap around the page boundary, necessary when
* translating an exclusive end offset that getOffset(...) would translate to 0 and consider
* part of the next page.
*
* The start offset must also be consulted, to distinguish an empty range (where the end
* offset is returned as-is) from a range that ends exactly on a page boundary.
*/
default int getEndOffset(long start, long end) {
if (end == 0 || end <= start)
return getOffset(end);
return 1 + getOffset(end - 1);
}
/** Get the end of the buffer containing at, or endTotal, whichever is smaller
*/
long getPageEnd(long at, long endTotal);
/**
* Invariant: toRealIndex(getPage(val), getOffset(val)) == val
*/
long toRealIndex(int buffer, int offset);
int getRequiredPageSize(int buffer, long cardinality);
}
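
The getOffset/getEndOffset distinction in practice, with partition size 100:

    // getOffset(200)         == 0    (200 is the first word of page 2)
    // getEndOffset(150, 200) == 100  (as the exclusive end of [150, 200), 200 means
    //                                 "end of page 1", not "start of page 2")
    // getEndOffset(200, 200) == 0    (empty range: the start offset disambiguates)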

View File

@ -0,0 +1,60 @@
package nu.marginalia.util.array.scheme;
public class PowerOf2PartitioningScheme implements ArrayPartitioningScheme {
final int partitionSize;
final long offsetMask;
final long bufferMask;
final int pageShift;
public PowerOf2PartitioningScheme(int partitionSize) {
assert partitionSize == Integer.highestOneBit(partitionSize);
this.partitionSize = partitionSize;
offsetMask = partitionSize - 1;
bufferMask = ~offsetMask;
pageShift = Integer.numberOfTrailingZeros(partitionSize);
}
@Override
public int getPartitions(long cardinality) {
return ArrayPartitioningScheme.getRequiredPartitions(cardinality, partitionSize);
}
@Override
public int getPage(long at) { // very hot code
return (int) (at >>> pageShift);
}
@Override
public int getOffset(long at) { // very hot code
return (int) (at & offsetMask);
}
@Override
public boolean isSamePage(long a, long b) { // hot code
return 0 == ((a ^ b) & bufferMask);
}
@Override
public long getPageEnd(long at, long endTotal) {
return Math.min(endTotal, partitionSize * (1L + getPage(at)));
}
@Override
public long toRealIndex(int buffer, int offset) {
return offset + (long) buffer * partitionSize;
}
@Override
public int getRequiredPageSize(int buffer, long cardinality) {
if ((long) (1 + buffer) * partitionSize <= cardinality) {
return partitionSize;
}
return (int) (cardinality % partitionSize);
}
}
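
For partitionSize = 4096 the precomputed masks make the hot-path methods branch-free:

    // pageShift  = 12            (trailing zeros of 4096)
    // offsetMask = 0x0000_0FFF
    // getPage(10_000)   == 10_000 >>> 12   == 2
    // getOffset(10_000) == 10_000 & 0xFFF  == 1808
    // isSamePage(8192, 12287): (8192 ^ 12287) & ~0xFFF == 0 -> true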

View File

@ -0,0 +1,56 @@
package nu.marginalia.util.array.scheme;
public class SequentialPartitioningScheme implements ArrayPartitioningScheme {
final int partitionSize;
public SequentialPartitioningScheme(int partitionSize) {
this.partitionSize = partitionSize;
}
public static int getRequiredPartitions(long cardinality, int partitionSize) {
return (int) (cardinality / partitionSize + Long.signum(cardinality % partitionSize));
}
@Override
public int getPartitions(long cardinality) {
return getRequiredPartitions(cardinality, partitionSize);
}
@Override
public int getPage(long at) {
return (int) (at / partitionSize);
}
public long getPageEnd(long at, long endTotal) {
return Math.min(endTotal, partitionSize * (1L + getPage(at)));
}
@Override
public boolean isSamePage(long a, long b) {
return (int) (a / partitionSize) == (int)(b/partitionSize);
}
@Override
public int getOffset(long at) {
return (int) (at % partitionSize);
}
public long toRealIndex(int buffer, int offset) {
return offset + (long) buffer * partitionSize;
}
@Override
public int getRequiredPageSize(int buffer, long cardinality) {
if ((long) (1 + buffer) * partitionSize <= cardinality) {
return partitionSize;
}
return (int) (cardinality % partitionSize);
}
}
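
getRequiredPartitions is a ceiling division written without an explicit remainder branch: cardinality / partitionSize, plus one extra partition when the remainder is non-zero (Long.signum of a non-negative remainder is 0 or 1). For example, 250 elements with partitionSize 100 gives 250/100 + signum(50) = 2 + 1 = 3 partitions.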

View File

@ -0,0 +1,22 @@
package nu.marginalia.util.array.trace;
import nu.marginalia.util.array.LongArray;
import java.nio.file.Path;
import java.util.Optional;
public interface ArrayTrace {
void touch(long address);
void touch(long start, long end);
FileTrace fileTrace = Optional.ofNullable(System.getProperty("nu.marginalia.util.array.trace")).map(Path::of).map(FileTrace::new).orElse(null);
NullTrace nullTrace = new NullTrace();
static ArrayTrace get(LongArray array) {
if (fileTrace == null) {
return nullTrace;
}
return fileTrace.forArray(array);
}
}

View File

@ -0,0 +1,115 @@
package nu.marginalia.util.array.trace;
import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.*;
import java.util.concurrent.atomic.AtomicInteger;
import static java.awt.image.BufferedImage.TYPE_INT_RGB;
public class ArrayTraceViz {
private static final int BLOCK_SIZE_WORDS = 512;
public static void main(String[] args) throws IOException {
Path inputFile = Path.of("/home/vlofgren/array-trace.log");
Map<Integer, Integer> sizes = new HashMap<>();
Map<Integer, Set<Integer>> rows = new HashMap<>();
try (var lines = Files.lines(inputFile)) {
lines.map(line -> line.split("\\s")).forEach(parts -> {
int block = Integer.parseInt(parts[1]);
int start = Integer.parseInt(parts[2]);
int end = Integer.parseInt(parts[3]);
sizes.merge(block, end, Integer::max);
var rowSet = rows.computeIfAbsent(block, b -> new HashSet<>());
for (int b = start; b < end; b += BLOCK_SIZE_WORDS) {
rowSet.add(b/BLOCK_SIZE_WORDS);
}
});
}
Map<Integer, Map<Integer, Integer>> rowToY = new HashMap<>();
rows.forEach((row, vals) -> {
var map = new HashMap<Integer, Integer>(vals.size());
rowToY.put(row, map);
var list = new ArrayList<>(vals);
list.stream().sorted().forEach(val -> map.put(val, map.size()));
});
Map<Integer, Integer> cols = new HashMap<>();
sizes.keySet().forEach(key -> cols.put(key, cols.size()));
int width = cols.size() * (BLOCK_SIZE_WORDS+4);
int height = 640;
var bi = new BufferedImage(width, height, TYPE_INT_RGB);
AtomicInteger iv = new AtomicInteger();
try (var lines = Files.lines(inputFile)) {
lines.forEach(line -> {
String[] parts = line.split("\\s");
long time = Long.parseLong(parts[0]);
int block = Integer.parseInt(parts[1]);
int start = Integer.parseInt(parts[2]);
int end = Integer.parseInt(parts[3]);
for (int p = start; p < end; p++) {
int x0 = (4+BLOCK_SIZE_WORDS) * cols.get(block);
int x = x0 + (p%BLOCK_SIZE_WORDS);
int y = rowToY.get(block).get(p/BLOCK_SIZE_WORDS);
if (y >= height) {
continue;
}
if (0 == bi.getRGB(x, y)) {
for (int x2 = 0; x2 < BLOCK_SIZE_WORDS; x2++) {
if (0 == bi.getRGB(x0 + x2, y)) {
bi.setRGB(x0 + x2, y, 0xC0C0C0);
}
}
}
System.out.println(x + "," + y);
bi.setRGB(x, y, (int) (0xFFFFFFL));
}
try {
if ((iv.incrementAndGet() % 4) == 0) {
ImageIO.write(bi, "png", new File("/tmp/test" + (time * Long.signum(time)) + ".png"));
for (int x = 0; x < width; x++) {
for (int y = 0; y < height; y++) {
int val = bi.getRGB(x, y);
int nval = (val&0xFF) - 1;
if (nval > 64) {
bi.setRGB(x, y, nval | (nval<<8) | (nval << 16));
}
else if ((val&0xFFFFFF) != 0) {
bi.setRGB(x, y, 64);
}
}
}
}
} catch (IOException e) {
throw new RuntimeException(e);
}
});
}
}
record ArrayPage(int id, int size) {}
}

View File

@ -0,0 +1,52 @@
package nu.marginalia.util.array.trace;
import nu.marginalia.util.array.LongArray;
import java.io.IOException;
import java.io.PrintStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
public class FileTrace {
PrintStream traceWriter;
static volatile boolean doTrace = false;
public FileTrace(Path file) {
try {
traceWriter = new PrintStream(Files.newOutputStream(file, StandardOpenOption.WRITE, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING));
} catch (IOException e) {
throw new IllegalStateException(e);
}
}
public FileTrace() {
this(Path.of("/tmp/array-trace.log"));
}
public static void setTrace(boolean val) {
doTrace = val;
}
public void trace(int source, long start, long end) {
if (doTrace) {
traceWriter.printf("%d %d %d %d\n", System.nanoTime(), source, start, end);
}
}
public ArrayTrace forArray(LongArray array) {
return new ArrayTrace() {
final int code = array.hashCode();
@Override
public void touch(long address) {
trace(code, address, address+1);
}
@Override
public void touch(long start, long end) {
trace(code, start, end);
}
};
}
}
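
To actually collect a trace, a sketch based only on what is shown here: point the nu.marginalia.util.array.trace system property at a log file, then switch tracing on around the code under study:

    // JVM flag: -Dnu.marginalia.util.array.trace=/tmp/array-trace.log
    FileTrace.setTrace(true);
    // ... exercise the arrays under test ...
    FileTrace.setTrace(false);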

View File

@ -0,0 +1,11 @@
package nu.marginalia.util.array.trace;
public class NullTrace implements ArrayTrace {
@Override
public void touch(long address) {}
@Override
public void touch(long start, long end) {}
}

View File

@ -1,8 +1,8 @@
package nu.marginalia.util.btree;
import nu.marginalia.util.array.LongArray;
import nu.marginalia.util.btree.model.BTreeContext;
import nu.marginalia.util.btree.model.BTreeHeader;
import nu.marginalia.util.multimap.MultimapFileLongSlice;
/*
* End-of-page mark that's used as a sentinel to verify that
@ -12,14 +12,16 @@ import nu.marginalia.util.multimap.MultimapFileLongSlice;
*/
public class BTreeDogEar {
private MultimapFileLongSlice sentinelSlice;
private LongArray sentinelSlice;
public BTreeDogEar(BTreeContext ctx, BTreeHeader header, MultimapFileLongSlice base) {
public BTreeDogEar(BTreeContext ctx, BTreeHeader header, LongArray base) {
if (header.numEntries() > 3) {
sentinelSlice = base.atOffset((long) header.numEntries() * ctx.entrySize() - 3);
sentinelSlice.put(0, 4L);
sentinelSlice.put(1, 5L);
sentinelSlice.put(2, 1L);
sentinelSlice = base.range(
(long) header.numEntries() * ctx.entrySize() - 3,
(long) header.numEntries() * ctx.entrySize());
sentinelSlice.set(0, 4L);
sentinelSlice.set(1, 5L);
sentinelSlice.set(2, 1L);
}
}

View File

@ -1,43 +1,39 @@
package nu.marginalia.util.btree;
import it.unimi.dsi.fastutil.longs.LongLongImmutablePair;
import lombok.SneakyThrows;
import nu.marginalia.util.array.LongArray;
import nu.marginalia.util.array.algo.LongArraySearch;
import nu.marginalia.util.array.buffer.LongQueryBuffer;
import nu.marginalia.util.array.delegate.ShiftedLongArray;
import nu.marginalia.util.btree.model.BTreeContext;
import nu.marginalia.util.btree.model.BTreeHeader;
import nu.marginalia.util.multimap.MultimapFileLong;
import nu.marginalia.util.multimap.MultimapSearcher;
import static java.lang.Math.min;
public class BTreeReader {
public class BTreeReader implements BTreeReaderIf {
private final LongArray index;
private final ShiftedLongArray data;
private final MultimapFileLong file;
public final BTreeContext ctx;
private final MultimapSearcher indexSearcher;
private final MultimapSearcher dataSearcher;
private final BTreeHeader header;
public BTreeReader(MultimapFileLong file, BTreeContext ctx, BTreeHeader header) {
this.file = file;
this.indexSearcher = MultimapSearcher.forContext(file, ~0, 1);
this.dataSearcher = MultimapSearcher.forContext(file, ctx.equalityMask(), ctx.entrySize());
this.ctx = ctx;
this.header = header;
}
public BTreeReader(MultimapFileLong file, BTreeContext ctx, long offset) {
this.file = file;
this.indexSearcher = MultimapSearcher.forContext(file, ~0, 1);
this.dataSearcher = MultimapSearcher.forContext(file, ctx.equalityMask(), ctx.entrySize());
private final long dataBlockEnd;
public BTreeReader(LongArray file, BTreeContext ctx, long offset) {
this.ctx = ctx;
this.header = createHeader(file, offset);
dataBlockEnd = (long) ctx.entrySize() * header.numEntries();
index = file.range(header.indexOffsetLongs(), header.dataOffsetLongs());
data = file.range(header.dataOffsetLongs(), header.dataOffsetLongs() + dataBlockEnd);
}
public static BTreeHeader createHeader(MultimapFileLong file, long fileOffset) {
return new BTreeHeader(file.get(fileOffset), file.get(fileOffset+1), file.get(fileOffset+2));
public static BTreeHeader createHeader(LongArray file, long fileOffset) {
long[] parts = new long[3];
file.get(fileOffset, fileOffset+3, parts);
return new BTreeHeader(parts[0], parts[1], parts[2]);
}
public BTreeHeader getHeader() {
@ -49,7 +45,7 @@ public class BTreeReader {
}
@SneakyThrows
public void retainEntries(BTreeQueryBuffer buffer) {
public void retainEntries(LongQueryBuffer buffer) {
if (header.layers() == 0) {
BTreePointer pointer = new BTreePointer(header);
while (buffer.hasMore()) {
@ -60,7 +56,7 @@ public class BTreeReader {
}
@SneakyThrows
public void rejectEntries(BTreeQueryBuffer buffer) {
public void rejectEntries(LongQueryBuffer buffer) {
if (header.layers() == 0) {
BTreePointer pointer = new BTreePointer(header);
while (buffer.hasMore()) {
@ -70,13 +66,13 @@ public class BTreeReader {
rejectSingle(buffer);
}
private void retainSingle(BTreeQueryBuffer buffer) {
private void retainSingle(LongQueryBuffer buffer) {
BTreePointer pointer = new BTreePointer(header);
for (; buffer.hasMore(); pointer.resetToRoot()) {
long val = buffer.currentValue() & ctx.equalityMask();
long val = buffer.currentValue();
if (!pointer.walkToData(val)) {
buffer.rejectAndAdvance();
@ -87,12 +83,12 @@ public class BTreeReader {
}
}
private void rejectSingle(BTreeQueryBuffer buffer) {
private void rejectSingle(LongQueryBuffer buffer) {
BTreePointer pointer = new BTreePointer(header);
for (; buffer.hasMore(); pointer.resetToRoot()) {
long val = buffer.currentValue() & ctx.equalityMask();
long val = buffer.currentValue();
if (pointer.walkToData(val) && pointer.containsData(val)) {
buffer.rejectAndAdvance();
@ -108,31 +104,53 @@ public class BTreeReader {
*
* @return file offset of entry matching keyRaw, negative if absent
*/
public long findEntry(final long keyRaw) {
final long key = keyRaw & ctx.equalityMask();
public long findEntry(final long key) {
BTreePointer ip = new BTreePointer(header);
while (!ip.isDataLayer())
ip.walkToChild(key);
if (!ip.walkToChild(key))
return -1;
return ip.findData(key);
}
public void readData(long[] data, int n, long pos) {
file.read(data, n, header.dataOffsetLongs() + pos);
public void readData(long[] buf, int n, long pos) {
data.get(pos, pos + n, buf);
}
public long[] queryData(long[] urls, int offset) {
public long[] queryData(long[] keys, int offset) {
BTreePointer pointer = new BTreePointer(header);
long[] ret = new long[urls.length];
long[] ret = new long[keys.length];
for (int i = 0; i < urls.length; i++, pointer.resetToRoot()) {
if (pointer.walkToData(urls[i])) {
long dataAddress = pointer.findData(urls[i]);
// this function could be re-written like retain() and would be
// much faster
if (header.layers() == 0) {
long searchStart = 0;
for (int i = 0; i < keys.length; i++) {
long key = keys[i];
searchStart = data.binarySearchN(ctx.entrySize(), key, searchStart, data.size);
if (searchStart < 0) {
searchStart = LongArraySearch.decodeSearchMiss(searchStart);
}
else {
ret[i] = data.get(searchStart + offset);
}
}
}
else {
for (int i = 0; i < keys.length; i++) {
if (i > 0) {
pointer.resetToRoot();
}
if (pointer.walkToData(keys[i])) {
long dataAddress = pointer.findData(keys[i]);
if (dataAddress >= 0) {
ret[i] = file.get(dataAddress + offset);
ret[i] = data.get(dataAddress + offset);
}
}
}
}
@ -140,25 +158,6 @@ public class BTreeReader {
return ret;
}
/** Find the range of values so that prefixStart <= n < prefixNext */
public LongLongImmutablePair getRangeForPrefix(long prefixStart, long prefixNext) {
long lowerBoundStart = lowerBound(prefixStart);
long lowerBoundEnd = lowerBound(prefixNext);
return new LongLongImmutablePair(lowerBoundStart, lowerBoundEnd);
}
private long lowerBound(long key) {
key &= ctx.equalityMask();
BTreePointer ip = new BTreePointer(header);
while (!ip.isDataLayer())
ip.walkToChild(key);
return ip.findDataLower(key);
}
private class BTreePointer {
private final long[] layerOffsets;
@ -190,18 +189,13 @@ public class BTreeReader {
}
public boolean walkToChild(long key) {
final long indexAddress = header.indexOffsetLongs();
final long indexLayerBlockOffset = layerOffsets[layer] + offset;
final long searchStart = layerOffsets[layer] + offset;
final long searchStart = indexAddress + indexLayerBlockOffset;
final long nextLayerOffset = (int)(indexSearcher.binarySearchLower(key, searchStart, ctx.BLOCK_SIZE_WORDS()) - searchStart);
if (nextLayerOffset < 0)
return false;
final long nextLayerOffset = (int) index.binarySearchUpperBound(key, searchStart, searchStart + ctx.BLOCK_SIZE_WORDS()) - searchStart;
layer --;
boundary = file.get(searchStart + offset);
boundary = index.get(searchStart + nextLayerOffset);
offset = ctx.BLOCK_SIZE_WORDS() * (offset + nextLayerOffset);
return true;
@ -225,41 +219,39 @@ public class BTreeReader {
}
public long findData(long key) {
if (layer > 0) {
if (layer >= 0) {
throw new IllegalStateException("Looking for data in an index layer");
}
long searchStart = header.dataOffsetLongs() + offset * ctx.entrySize();
int numEntries = min((int)(header.numEntries() - offset), ctx.BLOCK_SIZE_WORDS());
long searchStart = offset * ctx.entrySize();
long remainingTotal = dataBlockEnd - offset * ctx.entrySize();
long remainingBlock;
return dataSearcher.binarySearch(key, searchStart, numEntries);
remainingBlock = (layerOffsets.length == 0)
? remainingTotal
: (long) ctx.BLOCK_SIZE_WORDS() * ctx.entrySize();
long searchEnd = searchStart + (int) min(remainingTotal, remainingBlock);
return data.binarySearchN(ctx.entrySize(), key, searchStart, searchEnd);
}
public long findDataLower(long key) {
if (layer > 0) {
throw new IllegalStateException("Looking for data in an index layer");
}
long searchStart = header.dataOffsetLongs() + offset * ctx.entrySize();
int numEntries = min((int)(header.numEntries() - offset), ctx.BLOCK_SIZE_WORDS());
return dataSearcher.binarySearchLower(key, searchStart, numEntries);
}
public void retainData(BTreeQueryBuffer buffer) {
public void retainData(LongQueryBuffer buffer) {
long dataOffset = findData(buffer.currentValue());
if (dataOffset >= 0) {
buffer.retainAndAdvance();
long blockBase = header.dataOffsetLongs() + offset * ctx.entrySize();
if (buffer.hasMore() && buffer.currentValue() <= boundary) {
long blockBase = offset * ctx.entrySize();
long relOffset = dataOffset - blockBase;
int numEntries =
min((int) (header.numEntries() - relOffset), ctx.BLOCK_SIZE_WORDS()) / ctx.entrySize();
long remainingTotal = dataBlockEnd - dataOffset;
long remainingBlock = ctx.BLOCK_SIZE_WORDS() - relOffset;
if (buffer.currentValue() <= boundary) {
file.retain(buffer, boundary, dataOffset, numEntries, ctx.equalityMask(), ctx.entrySize());
long searchEnd = dataOffset + (int) min(remainingTotal, remainingBlock);
data.range(dataOffset, searchEnd).retainN(buffer, ctx.entrySize(), boundary);
}
}
else {
@ -268,20 +260,22 @@ public class BTreeReader {
}
public void rejectData(BTreeQueryBuffer buffer) {
public void rejectData(LongQueryBuffer buffer) {
long dataOffset = findData(buffer.currentValue());
if (dataOffset >= 0) {
buffer.rejectAndAdvance();
long blockBase = header.dataOffsetLongs() + offset * ctx.entrySize();
if (buffer.hasMore() && buffer.currentValue() <= boundary) {
long blockBase = offset * ctx.entrySize();
long relOffset = dataOffset - blockBase;
int numEntries =
min((int) (header.numEntries() - relOffset), ctx.BLOCK_SIZE_WORDS()) / ctx.entrySize();
long remainingTotal = dataBlockEnd - dataOffset;
long remainingBlock = ctx.BLOCK_SIZE_WORDS() - relOffset;
if (buffer.currentValue() <= boundary) {
file.reject(buffer, boundary, dataOffset, numEntries, ctx.equalityMask(), ctx.entrySize());
long searchEnd = dataOffset + (int) min(remainingTotal, remainingBlock);
data.range(dataOffset, searchEnd).rejectN(buffer, ctx.entrySize(), boundary);
}
}
else {

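The layer-less fast path in the rewritten queryData above is just a strided binary search over the sorted data block, carrying searchStart forward between lookups (which pays off when the incoming keys are themselves sorted). A self-contained sketch of that access pattern follows; it is not the project's LongArraySearch, and the miss encoding -(insertionPoint) - 1 is an assumption mirroring Arrays.binarySearch:

class StridedSearchSketch {
    // Binary search over entries of entrySize longs, where the first long of
    // each entry is the sort key. Returns the offset of the matching entry's
    // first long, or a negative value encoding the insertion point on a miss.
    static long binarySearchN(long[] data, int entrySize, long key, long fromIndex, long toIndex) {
        long low = fromIndex / entrySize;
        long high = toIndex / entrySize - 1;
        while (low <= high) {
            long mid = (low + high) >>> 1;
            long midVal = data[(int) (mid * entrySize)];
            if (midVal < key) low = mid + 1;
            else if (midVal > key) high = mid - 1;
            else return mid * entrySize;
        }
        return -1 - low * entrySize; // assumed miss encoding
    }

    public static void main(String[] args) {
        long[] data = {10, 100, 20, 200, 30, 300}; // (key, value) pairs, entrySize = 2
        long hit = binarySearchN(data, 2, 20, 0, data.length);
        System.out.println(data[(int) hit + 1]);   // prints 200
    }
}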
View File

@ -0,0 +1,21 @@
package nu.marginalia.util.btree;
import nu.marginalia.util.array.buffer.LongQueryBuffer;
import nu.marginalia.util.btree.model.BTreeHeader;
public interface BTreeReaderIf {
BTreeHeader getHeader();
int numEntries();
void retainEntries(LongQueryBuffer buffer);
void rejectEntries(LongQueryBuffer buffer);
long findEntry(long keyRaw);
void readData(long[] data, int n, long pos);
long[] queryData(long[] urls, int offset);
}

View File

@ -1,8 +1,8 @@
package nu.marginalia.util.btree;
import nu.marginalia.util.array.LongArray;
import nu.marginalia.util.btree.model.BTreeContext;
import nu.marginalia.util.btree.model.BTreeHeader;
import nu.marginalia.util.multimap.MultimapFileLongSlice;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -11,10 +11,10 @@ import java.io.IOException;
public class BTreeWriter {
private final BTreeContext ctx;
private final MultimapFileLongSlice map;
private final LongArray map;
private final Logger logger = LoggerFactory.getLogger(getClass());
public BTreeWriter(MultimapFileLongSlice map, BTreeContext ctx) {
public BTreeWriter(LongArray map, BTreeContext ctx) {
this.map = map;
this.ctx = ctx;
}
@ -42,8 +42,10 @@ public class BTreeWriter {
header.write(map, offset);
final long startRange = header.dataOffsetLongs();
final long endRange = startRange + (long) numEntries * ctx.entrySize();
var slice = map.atOffset(header.dataOffsetLongs());
var slice = map.range(startRange, endRange);
BTreeDogEar dogEar = new BTreeDogEar(ctx, header, slice);
@ -53,13 +55,11 @@ public class BTreeWriter {
logger.error("Dog ear was not overwritten: {}", header);
}
if (header.layers() < 1) { // The data is too small to benefit from indexing
return ctx.calculateSize(numEntries);
}
else {
if (header.layers() >= 1) { // Omit layer if data fits within a single block
writeIndex(header);
return ctx.calculateSize(numEntries);
}
return ctx.calculateSize(numEntries);
}
public static BTreeHeader makeHeader(BTreeContext ctx, long offset, int numEntries) {
@ -96,7 +96,8 @@ public class BTreeWriter {
}
private void writeIndexLayer(BTreeHeader header, long[] layerOffsets,
private void writeIndexLayer(BTreeHeader header,
long[] layerOffsets,
final long indexedDataStepSize,
final int layer) {
@ -115,13 +116,20 @@ public class BTreeWriter {
dataPtr += indexedDataStepSize)
{
long dataOffset = dataOffsetBase + (dataPtr + lastDataEntryOffset) * entrySize;
map.put(indexOffsetBase + indexWord++, map.get(dataOffset) & ctx.equalityMask());
map.set(indexOffsetBase + indexWord++, map.get(dataOffset));
}
// Fill the remaining block with LONG_MAX
map.setRange(indexOffsetBase+indexWord,
(int) (ctx.BLOCK_SIZE_WORDS() - (indexWord % ctx.BLOCK_SIZE_WORDS())),
Long.MAX_VALUE);
// If the index block is not completely filled with data,
// top up the remainder of the block with LONG_MAX
final long trailerStart = indexOffsetBase + indexWord;
final long trailerEnd = trailerStart
+ ctx.BLOCK_SIZE_WORDS()
- (int) (indexWord % ctx.BLOCK_SIZE_WORDS());
if (trailerStart < trailerEnd) {
map.fill(trailerStart, trailerEnd, Long.MAX_VALUE);
}
}
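One detail worth spelling out in the rewritten trailer logic above: the pad value is Long.MAX_VALUE because walkToChild runs an upper-bound binary search across the full index block, and a sentinel larger than any real key keeps that search from landing on an unwritten slot. A standalone restatement of the padding arithmetic, with illustrative values:

int blockSizeWords = 8;   // stands in for ctx.BLOCK_SIZE_WORDS()
long indexWord = 5;       // index words written into the current block
long[] block = new long[blockSizeWords];

long trailerStart = indexWord;
long trailerEnd = trailerStart + blockSizeWords - (indexWord % blockSizeWords);
if (trailerStart < trailerEnd) {
    // pads slots 5..7 with a sentinel no key can exceed
    java.util.Arrays.fill(block, (int) trailerStart, (int) trailerEnd, Long.MAX_VALUE);
}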

View File

@ -1,9 +1,9 @@
package nu.marginalia.util.btree;
import nu.marginalia.util.multimap.MultimapFileLongSlice;
import nu.marginalia.util.array.LongArray;
import java.io.IOException;
public interface WriteCallback {
void write(MultimapFileLongSlice slice) throws IOException;
void write(LongArray slice) throws IOException;
}
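A hypothetical implementor of the updated callback; only the interface appears in this hunk, so the two-long entry layout and the expectation of sorted input below are assumptions:

// Hypothetical usage sketch: fill the LongArray slice handed to the callback.
WriteCallback cb = slice -> {
    for (long i = 0; i < 8; i++) {
        slice.set(2 * i, i * 100); // key   (assumed two-long entry layout)
        slice.set(2 * i + 1, i);   // value
    }
};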

View File

@ -4,22 +4,28 @@ import nu.marginalia.util.btree.BTreeWriter;
public record BTreeContext(int MAX_LAYERS,
int entrySize,
long equalityMask,
int BLOCK_SIZE_BITS,
int BLOCK_SIZE_WORDS) {
public BTreeContext(int MAX_LAYERS, int entrySize, long equalityMask, int BLOCK_SIZE_BITS) {
this(MAX_LAYERS, entrySize, equalityMask, BLOCK_SIZE_BITS, 1 << BLOCK_SIZE_BITS);
// 8 pages is the breaking point where using a B-tree is actually advantageous
// over just binary searching in a sorted list. Above 8 pages, binary search will
// incur at worst four page faults. A B-tree will incur three page faults up until
// ~100k-200k entries with typical configurations.
private static final int MIN_PAGES_FOR_BTREE = 8;
public BTreeContext(int MAX_LAYERS, int entrySize, int BLOCK_SIZE_BITS) {
this(MAX_LAYERS, entrySize, BLOCK_SIZE_BITS, 1 << BLOCK_SIZE_BITS);
}
public long calculateSize(int numEntries) {
var header = BTreeWriter.makeHeader(this, 0, numEntries);
return header.dataOffsetLongs() + (long)numEntries * entrySize;
return header.dataOffsetLongs() + (long) numEntries * entrySize + 4;
}
public int numIndexLayers(int numEntries) {
if (numEntries <= BLOCK_SIZE_WORDS*2/entrySize) {
if (numEntries <= BLOCK_SIZE_WORDS*MIN_PAGES_FOR_BTREE/entrySize) {
return 0;
}
for (int i = 1; i < MAX_LAYERS; i++) {
@ -37,12 +43,8 @@ public record BTreeContext(int MAX_LAYERS,
public long indexLayerSize(int numWords, int level) {
final long layerSize = 1L<<(BLOCK_SIZE_BITS*(level+1));
final long numBlocks = numWords / layerSize;
if (numWords % layerSize != 0) {
return BLOCK_SIZE_WORDS * (numBlocks + 1);
}
return BLOCK_SIZE_WORDS * numBlocks;
return BLOCK_SIZE_WORDS * (numWords / layerSize + Long.signum(numWords % layerSize));
}
}
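The condensed indexLayerSize above folds the old remainder branch into a ceiling division: Long.signum(numWords % layerSize) contributes exactly one extra block when the division is inexact and zero otherwise. A quick self-contained check of the identity:

class CeilBlocksSketch {
    // ceil(numWords / layerSize) via Long.signum, as in indexLayerSize above
    static long ceilBlocks(long numWords, long layerSize) {
        return numWords / layerSize + Long.signum(numWords % layerSize);
    }

    public static void main(String[] args) {
        long layerSize = 4096;
        for (long numWords : new long[] {0, 1, 4095, 4096, 4097}) {
            assert ceilBlocks(numWords, layerSize)
                    == (numWords + layerSize - 1) / layerSize;
        }
    }
}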

View File

@ -1,6 +1,6 @@
package nu.marginalia.util.btree.model;
import nu.marginalia.util.multimap.MultimapFileLongSlice;
import nu.marginalia.util.array.LongArray;
public record BTreeHeader(int layers, int numEntries, long indexOffsetLongs, long dataOffsetLongs) {
public BTreeHeader {
@ -28,10 +28,10 @@ public record BTreeHeader(int layers, int numEntries, long indexOffsetLongs, lon
return padding;
}
public void write(MultimapFileLongSlice dest, long offset) {
dest.put(offset, ((long) layers << 32L) | ((long)numEntries & 0xFFFF_FFFFL));
dest.put(offset+1, indexOffsetLongs);
dest.put(offset+2, dataOffsetLongs);
public void write(LongArray dest, long offset) {
dest.set(offset, ((long) layers << 32L) | ((long)numEntries & 0xFFFF_FFFFL));
dest.set(offset+1, indexOffsetLongs);
dest.set(offset+2, dataOffsetLongs);
}
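The first header word packs layers into the high 32 bits and numEntries into the low 32. The matching decode is not shown in this hunk, so the unpacking below is an assumption, chosen to invert the encoding exactly:

int layers = 3, numEntries = 1_000_000;  // illustrative values
long word = ((long) layers << 32) | ((long) numEntries & 0xFFFF_FFFFL);

int layersOut     = (int) (word >>> 32);         // 3
int numEntriesOut = (int) (word & 0xFFFF_FFFFL); // 1_000_000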

View File

@ -0,0 +1,6 @@
package nu.marginalia.util.gregex;
import java.util.function.Predicate;
public interface GuardedRegex extends Predicate<String> {
}

View File

@ -0,0 +1,62 @@
package nu.marginalia.util.gregex;
import org.intellij.lang.annotations.Language;
import java.util.regex.Pattern;
public class GuardedRegexFactory {
// Regular expressions are slow, even compiled ones. Guarding them with startsWith, or even
// contains, is something like an order of magnitude faster. This matters a lot in hot code.
public static GuardedRegex startsWith(String prefix, @Language("RegExp") String regex) {
return new GuardedRegexStartsWith(prefix, regex);
}
public static GuardedRegex endsWith(String suffix, @Language("RegExp") String regex) {
return new GuardedRegexEndsWith(suffix, regex);
}
public static GuardedRegex contains(String substring, @Language("RegExp") String regex) {
return new GuardedRegexContains(substring, regex);
}
public static GuardedRegex minLength(int minLength, @Language("RegExp") String regex) {
return new GuardedRegexMinLength(minLength, regex);
}
private record GuardedRegexContains(String contains, Pattern pattern) implements GuardedRegex {
public GuardedRegexContains(String contains, String pattern) {
this(contains, Pattern.compile(pattern));
}
public boolean test(String s) {
return s.contains(contains) && pattern.matcher(s).find();
}
}
private record GuardedRegexMinLength(int minLength, Pattern pattern) implements GuardedRegex {
public GuardedRegexMinLength(int minLength, String pattern) {
this(minLength, Pattern.compile(pattern));
}
public boolean test(String s) {
return s.length() >= minLength && pattern.matcher(s).find();
}
}
private record GuardedRegexStartsWith(String start, Pattern pattern) implements GuardedRegex {
public GuardedRegexStartsWith(String start, String pattern) {
this(start, Pattern.compile(pattern));
}
public boolean test(String s) {
return s.startsWith(start) && pattern.matcher(s).find();
}
}
private record GuardedRegexEndsWith(String end, Pattern pattern) implements GuardedRegex {
public GuardedRegexEndsWith(String end, String pattern) {
this(end, Pattern.compile(pattern));
}
public boolean test(String s) {
return s.endsWith(end) && pattern.matcher(s).find();
}
}
}
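A usage sketch for the factory above; the guard string, pattern, and URLs are illustrative rather than taken from the codebase:

// The cheap contains() guard short-circuits before the compiled Pattern is
// consulted, so the regex cost is only paid for plausible candidates.
GuardedRegex docsPath = GuardedRegexFactory.contains("/docs/", "/docs/v\\d+/");

docsPath.test("https://example.com/docs/v2/index.html"); // true
docsPath.test("https://example.com/blog/");              // false; the regex never runs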

View File

@ -1,188 +0,0 @@
package nu.marginalia.util.hash;
import lombok.EqualsAndHashCode;
import lombok.Getter;
import nu.marginalia.util.multimap.MultimapFileLong;
import nu.marginalia.util.PrimeUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import static java.lang.Math.round;
/**
* Spiritually influenced by GNU Trove's hash maps
* LGPL 2.1
*/
public class LongPairHashMap {
private static final Logger logger = LoggerFactory.getLogger(LongPairHashMap.class);
private static final long MAGIC_WORD = 0xE00E00E00E0E0E0EL; // it's the data police
private final long hashTableSize;
private final MultimapFileLong data;
private final long maxProbeLength;
private int sz = 0;
private static final int HEADER_SIZE = 2;
private LongPairHashMap(MultimapFileLong data, long hashTableSize, long maxProbeLength) {
this.data = data;
this.hashTableSize = hashTableSize;
this.maxProbeLength = maxProbeLength;
}
public static LongPairHashMap createNew(MultimapFileLong data, long size) {
var tableSize = PrimeUtil.nextPrime(size, 1);
var ret = new LongPairHashMap(data, tableSize, tableSize/2);
data.put(0, MAGIC_WORD);
data.put(1, tableSize);
for (int i = 2; i < tableSize; i++) {
data.put(HEADER_SIZE + 2L*i, 0);
}
return ret;
}
public static LongPairHashMap loadExisting(MultimapFileLong data) {
long key = data.get(0);
if (key != MAGIC_WORD) {
logger.warn("LongPairHashMap lacks magic word, could this be garbage data?");
}
var hashTableSize = data.get(1);
var maxProbeLength = hashTableSize / 10;
return new LongPairHashMap(data, hashTableSize, maxProbeLength);
}
public int size() {
return sz;
}
private CellData getCell(long idx) {
long bufferIdx = 2*idx + HEADER_SIZE;
long a = data.get(bufferIdx);
long b = data.get(bufferIdx+1);
return new CellData(a, b);
}
private void setCell(long idx, CellData cell) {
long bufferIdx = 2*idx + HEADER_SIZE;
data.put(bufferIdx, cell.first);
data.put(bufferIdx+1, cell.second);
}
public CellData put(CellData data) {
long hash = longHash(data.getKey()) & 0x7FFF_FFFFL;
long idx = hash% hashTableSize;
if (!getCell(hash% hashTableSize).isSet()) {
return setValue(data, hash% hashTableSize);
}
return putRehash(data, idx, hash);
}
private CellData putRehash(CellData data, long idx, long hash) {
final long pStride = 1 + (hash % (hashTableSize - 2));
for (long j = 1; j < maxProbeLength; j++) {
idx = idx - pStride;
if (idx < 0) {
idx += hashTableSize;
}
final var val = getCell(idx);
if (!val.isSet()) {
return setValue(data, idx);
}
else if (val.getKey() == data.getKey()) {
logger.error("Double write?");
return val;
}
}
throw new IllegalStateException("DictionaryHashMap full @ size " + size() + "/" + hashTableSize + ", " + round((100.0*size()) / hashTableSize) + "%, key = " + data.getKey() + ",#"+hash);
}
private CellData setValue(CellData data, long cell) {
sz++;
setCell(cell, data);
return data;
}
public CellData get(int key) {
if (hashTableSize == 0) {
return new CellData(0, 0);
}
final long hash = longHash(key) & 0x7FFF_FFFFL;
var val = getCell(hash % hashTableSize);
if (!val.isSet()) {
return val;
}
else if (val.getKey() == key) {
return val;
}
return getRehash(key, hash % hashTableSize, hash);
}
private CellData getRehash(int key, long idx, long hash) {
final long pStride = 1 + (hash % (hashTableSize - 2));
for (long j = 1; j < maxProbeLength; j++) {
idx = idx - pStride;
if (idx < 0) {
idx += hashTableSize;
}
final var val = getCell(idx);
if (!val.isSet()) {
return val;
}
else if (val.getKey() == key) {
return val;
}
}
throw new IllegalStateException("DictionaryHashMap full @ size " + size() + "/" + hashTableSize + ", " + round((100.0*size()) / hashTableSize) + "%");
}
private long longHash(long x) {
return x;
}
@Getter @EqualsAndHashCode
public static class CellData {
final long first;
final long second;
public CellData(long key, long offset) {
first = key | 0x8000_0000_000_000L;
second = offset;
}
public long getKey() {
return first & ~0x8000_0000_000_000L;
}
public long getOffset() {
return second;
}
public boolean isSet() {
return first != 0 || second != 0L;
}
}
public void close() throws Exception {
data.close();
}
}
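For the record, the deleted map's collision strategy is double hashing: the probe stride 1 + (hash % (tableSize - 2)) is nonzero and, with a prime table size, coprime to it, so repeated subtraction modulo the table eventually visits every slot. The probe loop restated standalone:

// Standalone restatement of LongPairHashMap's probe sequence.
static long probe(long hash, long tableSize, long maxProbeLength,
                  java.util.function.LongPredicate slotIsFree) {
    long idx = hash % tableSize;
    if (slotIsFree.test(idx)) return idx;

    final long stride = 1 + (hash % (tableSize - 2));
    for (long j = 1; j < maxProbeLength; j++) {
        idx -= stride;
        if (idx < 0) idx += tableSize; // wrap around
        if (slotIsFree.test(idx)) return idx;
    }
    return -1; // gave up; the original throws IllegalStateException here
}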

View File

@ -6,8 +6,6 @@ import nu.marginalia.util.language.processing.model.KeywordMetadata;
import nu.marginalia.util.language.processing.model.WordRep;
import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict;
import nu.marginalia.wmsa.edge.index.model.EdgePageWordFlags;
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet;
import nu.marginalia.wmsa.edge.model.crawl.EdgePageWords;
import javax.inject.Inject;
@ -33,7 +31,7 @@ public class DocumentKeywordExtractor {
}
public EdgePageWordSet extractKeywordsMinimal(DocumentLanguageData documentLanguageData, KeywordMetadata keywordMetadata) {
public EdgePageWords extractKeywordsMinimal(DocumentLanguageData documentLanguageData, KeywordMetadata keywordMetadata) {
List<WordRep> titleWords = extractTitleWords(documentLanguageData);
List<WordRep> wordsNamesAll = nameCounter.count(documentLanguageData, 2);
@ -47,15 +45,15 @@ public class DocumentKeywordExtractor {
List<String> artifacts = getArtifacts(documentLanguageData);
keywordMetadata.flagsTemplate().add(EdgePageWordFlags.Simple);
WordsBuilder wordsBuilder = new WordsBuilder();
return new EdgePageWordSet(
createWords(keywordMetadata, IndexBlock.Title, titleWords),
EdgePageWords.withBlankMetadata(IndexBlock.Artifacts, artifacts)
);
createWords(wordsBuilder, keywordMetadata, titleWords, 0);
artifacts.forEach(wordsBuilder::addWithBlankMetadata);
return wordsBuilder.build();
}
public EdgePageWordSet extractKeywords(DocumentLanguageData documentLanguageData, KeywordMetadata keywordMetadata) {
public EdgePageWords extractKeywords(DocumentLanguageData documentLanguageData, KeywordMetadata keywordMetadata) {
List<WordRep> titleWords = extractTitleWords(documentLanguageData);
@ -72,26 +70,25 @@ public class DocumentKeywordExtractor {
List<String> artifacts = getArtifacts(documentLanguageData);
var wordSet = new EdgePageWordSet(
createWords(keywordMetadata, IndexBlock.Title, titleWords),
createWords(keywordMetadata, IndexBlock.Tfidf_High, wordsTfIdf),
createWords(keywordMetadata, IndexBlock.Subjects, subjects),
EdgePageWords.withBlankMetadata(IndexBlock.Artifacts, artifacts)
);
WordsBuilder wordsBuilder = new WordsBuilder();
getSimpleWords(keywordMetadata, wordSet, documentLanguageData,
IndexBlock.Words_1, IndexBlock.Words_2, IndexBlock.Words_4, IndexBlock.Words_8, IndexBlock.Words_16Plus);
createWords(wordsBuilder, keywordMetadata, titleWords, 0);
createWords(wordsBuilder, keywordMetadata, wordsTfIdf, EdgePageWordFlags.TfIdfHigh.asBit());
createWords(wordsBuilder, keywordMetadata, subjects, 0);
return wordSet;
getSimpleWords(wordsBuilder, keywordMetadata, documentLanguageData);
artifacts.forEach(wordsBuilder::addWithBlankMetadata);
return wordsBuilder.build();
}
public void getWordPositions(KeywordMetadata keywordMetadata, DocumentLanguageData dld) {
Map<String, Integer> ret = keywordMetadata.positionMask();
int posCtr = 0;
for (var sent : dld.titleSentences) {
int posBit = (int)((1L << (posCtr/4)) & 0xFFFF_FFFFL);
int posBit = 1;
for (var word : sent) {
ret.merge(word.stemmed(), posBit, this::bitwiseOr);
@ -101,9 +98,11 @@ public class DocumentKeywordExtractor {
ret.merge(sent.constructStemmedWordFromSpan(span), posBit, this::bitwiseOr);
}
}
posCtr+=4;
int pos = 1;
int line = 0;
for (var sent : dld.sentences) {
int posBit = (int)((1L << (posCtr/4)) & 0xFFFF_FFFFL);
int posBit = (int)((1L << pos) & 0xFFFF_FFFFL);
for (var word : sent) {
ret.merge(word.stemmed(), posBit, this::bitwiseOr);
@ -113,7 +112,28 @@ public class DocumentKeywordExtractor {
ret.merge(sent.constructStemmedWordFromSpan(span), posBit, this::bitwiseOr);
}
posCtr++;
if (pos < 4) pos ++;
else if (pos < 8) {
if (++line >= 2) {
pos++;
line = 0;
}
}
else if (pos < 24) {
if (++line >= 4) {
pos++;
line = 0;
}
}
else if (pos < 64) {
if (++line > 8) {
pos++;
line = 0;
}
}
else {
break;
}
}
}
@ -122,28 +142,20 @@ public class DocumentKeywordExtractor {
}
private void getSimpleWords(KeywordMetadata metadata, EdgePageWordSet wordSet, DocumentLanguageData documentLanguageData, IndexBlock... blocks) {
private void getSimpleWords(WordsBuilder wordsBuilder, KeywordMetadata metadata, DocumentLanguageData documentLanguageData) {
EnumSet<EdgePageWordFlags> flagsTemplate = EnumSet.noneOf(EdgePageWordFlags.class);
int start = 0;
int lengthGoal = 32;
for (var sent : documentLanguageData.sentences) {
for (int blockIdx = 0; blockIdx < blocks.length && start < documentLanguageData.sentences.length; blockIdx++) {
IndexBlock block = blocks[blockIdx];
Set<EdgePageWords.Entry> words = new HashSet<>(lengthGoal+100);
int pos;
int length = 0;
for (pos = start; pos < documentLanguageData.sentences.length && length < lengthGoal; pos++) {
var sent = documentLanguageData.sentences[pos];
length += sent.length();
if (wordsBuilder.size() > 1500)
break;
for (var word : sent) {
if (!word.isStopWord()) {
String w = AsciiFlattener.flattenUnicode(word.wordLowerCase());
if (WordPatterns.singleWordQualitiesPredicate.test(w)) {
words.add(new EdgePageWords.Entry(w, metadata.forWord(flagsTemplate, word.stemmed())));
wordsBuilder.add(w, metadata.forWord(flagsTemplate, word.stemmed()));
}
}
}
@ -152,13 +164,10 @@ public class DocumentKeywordExtractor {
var rep = new WordRep(sent, names);
String w = AsciiFlattener.flattenUnicode(rep.word);
words.add(new EdgePageWords.Entry(w, metadata.forWord(flagsTemplate, rep.stemmed)));
wordsBuilder.add(w, metadata.forWord(flagsTemplate, rep.stemmed));
}
}
wordSet.append(block, words);
start = pos;
lengthGoal+=32;
}
}
private static final Pattern mailLikePattern = Pattern.compile("[a-zA-Z0-9._\\-]+@[a-zA-Z0-9]+(\\.[a-zA-Z0-9]+)+");
@ -197,11 +206,11 @@ public class DocumentKeywordExtractor {
.collect(Collectors.toList());
}
public EdgePageWords createWords(KeywordMetadata metadata,
IndexBlock block,
Collection<WordRep> words) {
public void createWords(WordsBuilder wordsBuilder,
KeywordMetadata metadata,
Collection<WordRep> words,
long additionalMeta) {
Set<EdgePageWords.Entry> entries = new HashSet<>(words.size());
for (var word : words) {
String flatWord = AsciiFlattener.flattenUnicode(word.word);
@ -209,9 +218,31 @@ public class DocumentKeywordExtractor {
continue;
}
entries.add(new EdgePageWords.Entry(flatWord, metadata.forWord(metadata.flagsTemplate(), word.stemmed)));
wordsBuilder.add(flatWord, metadata.forWord(metadata.wordFlagsTemplate(), word.stemmed) | additionalMeta);
}
}
return new EdgePageWords(block, entries);
private static class WordsBuilder {
private final EdgePageWords words = new EdgePageWords(1600);
private final Set<String> seen = new HashSet<>(1600);
public void add(String word, long meta) {
if (seen.add(word)) {
words.add(word, meta);
}
}
public void addWithBlankMetadata(String word) {
if (seen.add(word)) {
words.addJustNoMeta(word);
}
}
public EdgePageWords build() {
return words;
}
public int size() {
return seen.size();
}
}
}
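The reworked position mask above spends its bits front-loaded: one bit per sentence at first, then one per two, four, and finally nine sentences (the last tier tests ++line > 8 rather than >= 8). Note also that the (1L << pos) & 0xFFFF_FFFFL mask zeroes any bit position of 32 or above, so very late sentences stop contributing. A standalone trace of the schedule:

// Trace of the sentence-to-bit schedule in getWordPositions above.
int pos = 1, line = 0;
for (int sentence = 0; sentence < 500; sentence++) {
    // words in this sentence get bit 'pos' merged into their position mask
    if (pos < 4) pos++;
    else if (pos < 8)  { if (++line >= 2) { pos++; line = 0; } }
    else if (pos < 24) { if (++line >= 4) { pos++; line = 0; } }
    else if (pos < 64) { if (++line >  8) { pos++; line = 0; } }
    else break;
}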

View File

@ -1,7 +1,7 @@
package nu.marginalia.util.language.processing;
import com.github.jknack.handlebars.internal.lang3.StringUtils;
import gnu.trove.map.hash.TObjectIntHashMap;
import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;
import nu.marginalia.util.language.WordPatterns;
import nu.marginalia.util.language.processing.model.DocumentLanguageData;
import nu.marginalia.util.language.processing.model.KeywordMetadata;
@ -27,7 +27,7 @@ public class KeywordCounter {
}
public List<WordRep> countHisto(KeywordMetadata keywordMetadata, DocumentLanguageData dld) {
TObjectIntHashMap<String> counts = new TObjectIntHashMap<>(10_000, 0.7f);
Object2IntOpenHashMap<String> counts = new Object2IntOpenHashMap<>(10_000, 0.7f);
HashMap<String, HashSet<WordRep>> instances = new HashMap<>(15000);
@ -41,7 +41,8 @@ public class KeywordCounter {
var rep = new WordRep(sent, span);
counts.adjustOrPutValue(rep.stemmed, 1, 1);
counts.mergeInt(rep.stemmed, 1, Integer::sum);
var instanceSet = instances.computeIfAbsent(rep.stemmed, k -> new HashSet<>(500));
if (instanceSet.size() < 250) {
instanceSet.add(rep);
@ -54,7 +55,8 @@ public class KeywordCounter {
int maxVal = maxValue(counts);
counts.forEachEntry((key, cnt) -> {
counts.forEach((key, cnt) -> {
int value = getTermValue(key, cnt, maxVal);
tfIdf.put(key, new WordFrequencyData(cnt, value));
@ -62,18 +64,18 @@ public class KeywordCounter {
if (cnt > 1 && value > 100) {
tfIdfHigh.addAll(instances.get(key));
}
return true;
});
return tfIdfHigh;
}
private int maxValue(TObjectIntHashMap<?> map) {
private int maxValue(Object2IntOpenHashMap<?> map) {
int maxC = 0;
for (int c : map.values()) {
maxC = max(c, maxC);
}
return maxC;
}
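The counter's move from Trove to fastutil is mechanical: adjustOrPutValue(key, 1, 1) becomes mergeInt(key, 1, Integer::sum), and forEachEntry, whose callback returned a continue flag, becomes a plain forEach. In miniature:

import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;

class CountSketch {
    public static void main(String[] args) {
        Object2IntOpenHashMap<String> counts = new Object2IntOpenHashMap<>(16, 0.7f);
        for (String w : new String[] {"a", "b", "a"}) {
            counts.mergeInt(w, 1, Integer::sum); // Trove: adjustOrPutValue(w, 1, 1)
        }
        counts.forEach((key, cnt) -> System.out.println(key + ": " + cnt)); // a: 2, b: 1
    }
}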

View File

@ -32,7 +32,9 @@ public class KeywordExtractor {
if (sentence.separators[i-2] == WordSeparator.COMMA) { continue; }
if (sentence.separators[i-1] == WordSeparator.COMMA) { i++; continue; }
if (isProperNoun(i, sentence) && (isJoiner(sentence, i-1) || isProperNoun(i-1, sentence)) && isProperNoun(i-2, sentence))
if (isProperNoun(i, sentence)
&& (isJoiner(sentence, i-1) || isProperNoun(i-1, sentence))
&& isProperNoun(i-2, sentence))
spans.add(new WordSpan(i-2, i+1));
}
@ -42,113 +44,19 @@ public class KeywordExtractor {
if (sentence.separators[i-1] == WordSeparator.COMMA) { i+=2; continue; }
if (isProperNoun(i, sentence) && isProperNoun(i-3, sentence)) {
if (isProperNoun(i - 1, sentence) && isProperNoun(i - 2, sentence)) {
if (isProperNoun(i - 1, sentence) && isProperNoun(i - 2, sentence))
spans.add(new WordSpan(i-3, i+1));
}
else if (isJoiner(sentence, i-2) && sentence.posTags[i-1].equals("DT")) {
else if (isJoiner(sentence, i-2) && sentence.posTags[i-1].equals("DT"))
spans.add(new WordSpan(i-3, i+1));
}
else if ((isJoiner(sentence, i-1)||isProperNoun(i - 1, sentence)) && (isJoiner(sentence, i-2)||isProperNoun(i - 2, sentence))) {
else if ((isJoiner(sentence, i-1) ||isProperNoun(i-1, sentence))
&& (isJoiner(sentence, i-2)||isProperNoun(i-2, sentence)))
spans.add(new WordSpan(i-3, i+1));
}
}
}
return spans.toArray(WordSpan[]::new);
}
public WordSpan[] getNamesStrict(DocumentSentence sentence) {
List<WordSpan> spans = new ArrayList<>(sentence.length());
for (int i = 0; i < sentence.length(); i++) {
if (isProperNoun(i, sentence))
spans.add(new WordSpan(i, i+1));
}
for (int i = 1; i < sentence.length(); i++) {
if (sentence.separators[i-1] == WordSeparator.COMMA) { continue; }
if (isProperNoun(i, sentence) && isProperNoun(i-1, sentence))
spans.add(new WordSpan(i-1, i+1));
}
for (int i = 2; i < sentence.length(); i++) {
if (sentence.separators[i-2] == WordSeparator.COMMA) { continue; }
if (sentence.separators[i-1] == WordSeparator.COMMA) { i++; continue; }
if (isProperNoun(i, sentence) && (isJoiner(sentence, i-1) || isProperNoun(i-1, sentence)) && isProperNoun(i-2, sentence))
spans.add(new WordSpan(i-2, i+1));
}
for (int i = 3; i < sentence.length(); i++) {
if (sentence.separators[i-3] == WordSeparator.COMMA) { continue; }
if (sentence.separators[i-2] == WordSeparator.COMMA) { i++; continue; }
if (sentence.separators[i-1] == WordSeparator.COMMA) { i+=2; continue; }
if (isProperNoun(i, sentence) && isProperNoun(i-3, sentence)) {
if (isProperNoun(i - 1, sentence) && isProperNoun(i - 2, sentence)) {
spans.add(new WordSpan(i-3, i+1));
}
else if (isJoiner(sentence, i-1) && sentence.posTags[i-2].equals("DT")) {
spans.add(new WordSpan(i-3, i+1));
}
}
}
return spans.toArray(WordSpan[]::new);
}
public boolean isProperNoun(int i, DocumentSentence sent) {
return "NNP".equals(sent.posTags[i]) || "NNPS".equals(sent.posTags[i]);
}
public boolean isJoiner(DocumentSentence sent, int i) {
if(sent.posTags[i].equals("IN")) {
return true;
}
if (sent.posTags[i].equals("TO")) {
return true;
}
if (sent.posTags[i].equals("CC")) {
return sent.wordsLowerCase[i].equals("and");
}
return false;
}
public List<WordSpan> getWordsFromSentence(DocumentSentence sentence) {
List<WordSpan> spans = new ArrayList<>();
for (int k = 0; k < 4; k++) {
for (int i = k; i < sentence.length(); i++) {
var w = new WordSpan(i-k, i + 1);
if (isViableSpanForWord(sentence, w)) {
spans.add(w);
}
}
}
return spans;
}
private boolean isViableSpanForWord(DocumentSentence sentence, WordSpan w) {
for (int i = w.start; i < w.end-1; i++) {
if (sentence.separators[i] == WordSeparator.COMMA) {
return false;
}
}
String word = sentence.constructWordFromSpan(w);
if (word.isBlank() || !WordPatterns.filter(word)) return false;
if (sentence.posTags[w.start].equals("CC")) return false;
if (sentence.posTags[w.end-1].equals("IN")) return false;
if (sentence.posTags[w.end-1].equals("DT")) return false;
if (sentence.posTags[w.end-1].equals("CC")) return false;
if (sentence.posTags[w.end-1].equals("TO")) return false;
return true;
}
public WordSpan[] getKeywordsFromSentence(DocumentSentence sentence) {
if (sentence.keywords != null) {
return sentence.keywords.get();
@ -223,63 +131,56 @@ public class KeywordExtractor {
return ret;
}
public WordSpan[] getKeywordsFromSentenceStrict(DocumentSentence sentence, Set<String> topWords, boolean reducePartials) {
List<WordSpan> spans = new ArrayList<>(sentence.length());
if (!reducePartials) {
for (int i = 0; i < sentence.length(); i++) {
if (topWords.contains(sentence.stemmedWords[i]))
spans.add(new WordSpan(i, i + 1));
}
public boolean isProperNoun(int i, DocumentSentence sent) {
return "NNP".equals(sent.posTags[i]) || "NNPS".equals(sent.posTags[i]);
}
for (int i = 1; i < sentence.length(); i++) {
if (sentence.separators[i-1] == WordSeparator.COMMA) { continue; }
if (topWords.contains(sentence.stemmedWords[i])
&& !sentence.words[i].endsWith("'s")
&& topWords.contains(sentence.stemmedWords[i-1])) {
spans.add(new WordSpan(i-1, i + 1));
public boolean isJoiner(DocumentSentence sent, int i) {
if(sent.posTags[i].equals("IN")) {
return true;
}
if (sent.posTags[i].equals("TO")) {
return true;
}
for (int i = 2; i < sentence.length(); i++) {
if (sentence.separators[i-2] == WordSeparator.COMMA) { continue; }
if (sentence.separators[i-1] == WordSeparator.COMMA) { i++; continue; }
if (topWords.contains(sentence.stemmedWords[i])
&& !sentence.words[i].endsWith("'s")
&& (topWords.contains(sentence.stemmedWords[i-1]) || isJoiner(sentence, i-1))
&& topWords.contains(sentence.stemmedWords[i-2])
) {
spans.add(new WordSpan(i-2, i + 1));
if (sent.posTags[i].equals("CC")) {
return sent.wordsLowerCase[i].equals("and");
}
return false;
}
for (int i = 3; i < sentence.length(); i++) {
if (sentence.separators[i-3] == WordSeparator.COMMA) { continue; }
if (sentence.separators[i-2] == WordSeparator.COMMA) { i++; continue; }
if (sentence.separators[i-1] == WordSeparator.COMMA) { i+=2; continue; }
if (!sentence.words[i-2].endsWith("'s")) { continue; }
if (!sentence.words[i-3].endsWith("'s")) { continue; }
public List<WordSpan> getWordsFromSentence(DocumentSentence sentence) {
List<WordSpan> spans = new ArrayList<>();
if (topWords.contains(sentence.stemmedWords[i])
&& !sentence.words[i].endsWith("'s") && topWords.contains(sentence.stemmedWords[i-3])) {
if (topWords.contains(sentence.stemmedWords[i-1]) && topWords.contains(sentence.stemmedWords[i-2])) {
spans.add(new WordSpan(i-3, i + 1));
}
else if (topWords.contains(sentence.stemmedWords[i-1]) && isJoiner(sentence, i-2)) {
spans.add(new WordSpan(i-3, i + 1));
}
else if (isJoiner(sentence, i-2) && sentence.posTags[i-1].equals("DT")) {
spans.add(new WordSpan(i-3, i + 1));
}
else if (isJoiner(sentence, i-2) && isJoiner(sentence, i-1)) {
spans.add(new WordSpan(i-3, i + 1));
for (int k = 0; k < 4; k++) {
for (int i = k; i < sentence.length(); i++) {
var w = new WordSpan(i-k, i + 1);
if (isViableSpanForWord(sentence, w)) {
spans.add(w);
}
}
}
return spans.toArray(WordSpan[]::new);
return spans;
}
private boolean isViableSpanForWord(DocumentSentence sentence, WordSpan w) {
for (int i = w.start; i < w.end-1; i++) {
if (sentence.separators[i] == WordSeparator.COMMA) {
return false;
}
}
String word = sentence.constructWordFromSpan(w);
if (word.isBlank() || !WordPatterns.filter(word)) return false;
if (sentence.posTags[w.start].equals("CC")) return false;
if (sentence.posTags[w.end-1].equals("IN")) return false;
if (sentence.posTags[w.end-1].equals("DT")) return false;
if (sentence.posTags[w.end-1].equals("CC")) return false;
if (sentence.posTags[w.end-1].equals("TO")) return false;
return true;
}
private boolean isName(int i, DocumentSentence sentence, Set<String> topWords) {
@ -293,7 +194,6 @@ public class KeywordExtractor {
String posTag = sentence.posTags[i];
// if (posTag.startsWith("N") || posTag.startsWith("V") || posTag.startsWith("R") || posTag.startsWith("J"))
return (posTag.startsWith("N") || "VBN".equals(posTag)) && !sentence.isStopWord(i);
}

View File

@ -98,7 +98,6 @@ public class SentenceExtractor {
}
}
TObjectIntHashMap<String> counts = calculateWordCounts(textSentences);
return new DocumentLanguageData(textSentences, extractSentencesFromString(title.toLowerCase()), counts);

View File

@ -13,23 +13,22 @@ public record KeywordMetadata(HashSet<String> titleKeywords,
HashSet<String> namesKeywords,
HashMap<String, KeywordCounter.WordFrequencyData> wordsTfIdf,
HashMap<String, Integer> positionMask,
EnumSet<EdgePageWordFlags> flagsTemplate,
int quality
EnumSet<EdgePageWordFlags> wordFlagsTemplate
)
{
private static final KeywordCounter.WordFrequencyData empty = new KeywordCounter.WordFrequencyData(0, 0);
private static final int TF_IDF_HIGH_LIMIT = 64;
public KeywordMetadata(double quality, EnumSet<EdgePageWordFlags> flags) {
public KeywordMetadata(EnumSet<EdgePageWordFlags> flags) {
this(new HashSet<>(50), new HashSet<>(10), new HashSet<>(50),
new HashMap<>(15_000),
new HashMap<>(10_000),
flags,
(int)(-quality));
flags);
}
public KeywordMetadata(double quality) {
this(quality, EnumSet.noneOf(EdgePageWordFlags.class));
public KeywordMetadata() {
this(EnumSet.noneOf(EdgePageWordFlags.class));
}
public long forWord(EnumSet<EdgePageWordFlags> flagsTemplate, String stemmed) {
@ -48,11 +47,7 @@ public record KeywordMetadata(HashSet<String> titleKeywords,
int positions = positionMask.getOrDefault(stemmed, 0);
return new EdgePageWordMetadata(tfidf.tfIdfNormalized(), positions, quality, tfidf.count(), flags).encode();
}
public int quality() {
return -quality;
return new EdgePageWordMetadata(tfidf.tfIdfNormalized(), positions, tfidf.count(), flags).encode();
}
}

View File

@ -1,859 +0,0 @@
package nu.marginalia.util.multimap;
import com.upserve.uppend.blobs.NativeIO;
import lombok.SneakyThrows;
import nu.marginalia.util.btree.BTreeQueryBuffer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.File;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.nio.ByteBuffer;
import java.nio.LongBuffer;
import java.nio.MappedByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import static java.nio.channels.FileChannel.MapMode.READ_ONLY;
import static java.nio.channels.FileChannel.MapMode.READ_WRITE;
import static nu.marginalia.util.FileSizeUtil.readableSize;
public class MultimapFileLong implements AutoCloseable, MultimapFileLongSlice {
private final ArrayList<LongBuffer> buffers = new ArrayList<>();
private final ArrayList<MappedByteBuffer> mappedByteBuffers = new ArrayList<>();
private final FileChannel.MapMode mode;
private final int bufferSize;
private final Logger logger = LoggerFactory.getLogger(getClass());
private final FileChannel channel;
private final long mapSize;
private final long fileLength;
private long mappedSize;
final static long WORD_SIZE = 8;
private NativeIO.Advice defaultAdvice = null;
public static MultimapFileLong forReading(Path file) throws IOException {
long fileSize = Files.size(file);
int bufferSize = getBufferSize(fileSize, false);
return new MultimapFileLong(file.toFile(), READ_ONLY, Files.size(file), bufferSize);
}
public static MultimapFileLong forOutput(Path file, long estimatedSize) throws IOException {
return new MultimapFileLong(file.toFile(), READ_WRITE, 0, getBufferSize(estimatedSize, true));
}
private static int getBufferSize(long totalSize, boolean write) {
int defaultBig = 2<<23;
if (totalSize > Integer.MAX_VALUE/WORD_SIZE) {
return defaultBig;
}
else if (write && totalSize < 8*1024*1024) {
return 8*1024*1024;
}
else {
return (int) Math.min(totalSize, defaultBig);
}
}
public MultimapFileLong(File file,
FileChannel.MapMode mode,
long mapSize,
int bufferSize) throws IOException {
this(new RandomAccessFile(file, translateToRAFMode(mode)), mode, mapSize, bufferSize);
}
private static String translateToRAFMode(FileChannel.MapMode mode) {
if (READ_ONLY.equals(mode)) {
return "r";
} else if (READ_WRITE.equals(mode)) {
return "rw";
}
return "rw";
}
public MultimapFileLong(RandomAccessFile file,
FileChannel.MapMode mode,
long mapSizeBytes,
int bufferSizeWords) throws IOException {
this.mode = mode;
this.bufferSize = bufferSizeWords;
this.mapSize = mapSizeBytes;
this.fileLength = file.length();
channel = file.getChannel();
mappedSize = 0;
logger.trace("Creating multimap file size = {} / buffer size = {}, mode = {}",
readableSize(mapSizeBytes), readableSize(8L*bufferSizeWords), mode);
}
public MultimapSearcherBase createSearcher() {
return new MultimapSearcherBase(this);
}
public MultimapSorter createSorter(Path tmpFile, int internalSortLimit, int minStepSize) {
return new MultimapSorter(this, tmpFile, internalSortLimit, minStepSize);
}
@SneakyThrows
public void advice(NativeIO.Advice advice) {
this.defaultAdvice = advice;
for (var buffer : mappedByteBuffers) {
NativeIO.madvise(buffer, advice);
}
}
@SneakyThrows
public void advice0(NativeIO.Advice advice) {
NativeIO.madvise(mappedByteBuffers.get(0), advice);
}
@SneakyThrows
public void adviceRange(NativeIO.Advice advice, long startLongs, long lengthLongs) {
long endLongs = (startLongs+lengthLongs);
if (endLongs >= mappedSize)
grow(endLongs);
int startIdx = (int)(startLongs / bufferSize);
int endIdx = (int)(endLongs / bufferSize);
if (startIdx != endIdx) {
long offsetStart = (startLongs % bufferSize) * WORD_SIZE;
NativeIO.madviseRange(mappedByteBuffers.get(startIdx), advice, offsetStart, (int) (bufferSize * WORD_SIZE - offsetStart));
for (int i = startIdx+1; i < endIdx; i++) {
NativeIO.madviseRange(mappedByteBuffers.get(i), advice, 0, (int)(bufferSize * WORD_SIZE));
}
NativeIO.madviseRange(mappedByteBuffers.get(endIdx), advice, 0, (int)((endIdx % bufferSize) * WORD_SIZE));
}
else {
var buff = mappedByteBuffers.get(startIdx);
NativeIO.madviseRange(buff, advice, (startLongs % bufferSize) * WORD_SIZE, (int) (lengthLongs * WORD_SIZE));
}
}
public void pokeRange(long offset, long length) {
for (long i = 0; i < length; i += 4096/8) {
get(offset + i);
}
}
public void force() {
logger.trace("Forcing");
for (MappedByteBuffer buffer: mappedByteBuffers) {
buffer.force();
}
}
@SneakyThrows
public void grow(long posIdxRequired) {
if (posIdxRequired*WORD_SIZE > mapSize && mode == READ_ONLY) {
throw new IndexOutOfBoundsException(posIdxRequired + " (max " + mapSize + ")");
}
logger.trace("Growing to encompass {}i/{}b", posIdxRequired, posIdxRequired*WORD_SIZE);
long start;
if (buffers.isEmpty()) {
start = 0;
}
else {
start = (long) buffers.size() * bufferSize;
}
for (long posIdx = start; posIdxRequired >= posIdx; posIdx += bufferSize) {
long posBytes = posIdx * WORD_SIZE;
long bzBytes;
if (mode == READ_ONLY) {
bzBytes = Math.min(WORD_SIZE*bufferSize, mapSize - posBytes);
}
else {
bzBytes = WORD_SIZE*bufferSize;
}
logger.trace("Allocating {}-{}", posBytes, posBytes+bzBytes);
var buffer = channel.map(mode, posBytes, bzBytes);
if (defaultAdvice != null) {
NativeIO.madvise(buffer, defaultAdvice);
}
buffers.add(buffer.asLongBuffer());
mappedByteBuffers.add(buffer);
mappedSize += bzBytes/WORD_SIZE;
}
}
@Override
public long size() {
return fileLength;
}
@Override
public void put(long idx, long val) {
if (idx >= mappedSize)
grow(idx);
try {
buffers.get((int)(idx / bufferSize)).put((int) (idx % bufferSize), val);
}
catch (IndexOutOfBoundsException ex) {
logger.error("Index out of bounds {} -> {}:{} cap {}", idx, buffers.get((int)(idx / bufferSize)), idx % bufferSize,
buffers.get((int)(idx / bufferSize)).capacity());
throw new RuntimeException(ex);
}
}
@Override
public long get(long idx) {
if (idx < 0)
throw new IllegalArgumentException("get("+idx+")");
if (idx >= mappedSize)
grow(idx);
try {
return buffers.get((int)(idx / bufferSize)).get((int)(idx % bufferSize));
}
catch (IndexOutOfBoundsException ex) {
logger.error("Index out of bounds {} -> {}:{} cap {}", idx, buffers.get((int)(idx / bufferSize)), idx % bufferSize,
buffers.get((int)(idx / bufferSize)).capacity());
throw new RuntimeException(ex);
}
}
@Override
public void read(long[] vals, long idx) {
read(vals, vals.length, idx);
}
@Override
public void read(long[] vals, int n, long idx) {
if (idx+n >= mappedSize) {
grow(idx+n);
}
int iN = (int)((idx + n) / bufferSize);
for (int i = 0; i < n; ) {
int i0 = (int)((idx + i) / bufferSize);
int bufferOffset = (int) ((idx+i) % bufferSize);
var buffer = buffers.get(i0);
final int l;
if (i0 < iN) l = bufferSize - bufferOffset;
else l = Math.min(n - i, bufferSize - bufferOffset);
buffer.get(bufferOffset, vals, i, l);
i+=l;
}
}
@Override
public void read(LongBuffer vals, long idx) {
int n = vals.limit() - vals.position();
if (idx+n >= mappedSize) {
grow(idx+n);
}
int iN = (int)((idx + n) / bufferSize);
for (int i = 0; i < n; ) {
int i0 = (int)((idx + i) / bufferSize);
int bufferOffset = (int) ((idx+i) % bufferSize);
var buffer = buffers.get(i0);
final int l;
if (i0 < iN) l = bufferSize - bufferOffset;
else l = Math.min(n - i, bufferSize - bufferOffset);
vals.put(vals.position() + i, buffer, bufferOffset, l);
i+=l;
}
}
@Override
public void write(long[] vals, long idx) {
write(vals, vals.length, idx);
}
@Override
public void write(long[] vals, int n, long idx) {
if (idx+n >= mappedSize) {
grow(idx+n);
}
int iN = (int)((idx + n) / bufferSize);
for (int i = 0; i < n; ) {
int i0 = (int)((idx + i) / bufferSize);
int bufferOffset = (int) ((idx+i) % bufferSize);
var buffer = buffers.get(i0);
final int l;
if (i0 < iN) l = bufferSize - bufferOffset;
else l = Math.min(n - i, bufferSize - bufferOffset);
buffer.put(bufferOffset, vals, i, l);
i+=l;
}
}
@Override
public void write(LongBuffer vals, long idx) {
int n = vals.limit() - vals.position();
if (idx+n >= mappedSize) {
grow(idx+n);
}
int iN = (int)((idx + n) / bufferSize);
for (int i = 0; i < n; ) {
int i0 = (int)((idx + i) / bufferSize);
int bufferOffset = (int) ((idx+i) % bufferSize);
var buffer = buffers.get(i0);
final int l;
if (i0 < iN) l = bufferSize - bufferOffset;
else l = Math.min(n - i, bufferSize - bufferOffset);
buffer.put(bufferOffset, vals, vals.position() + i, l);
i+=l;
}
}
@Override
public void write(LongBuffer vals, int n, long idx) {
if (idx+n >= mappedSize) {
grow(idx+n);
}
int iN = (int)((idx + n) / bufferSize);
for (int i = 0; i < n; ) {
int i0 = (int)((idx + i) / bufferSize);
int bufferOffset = (int) ((idx+i) % bufferSize);
var buffer = buffers.get(i0);
final int l;
if (i0 < iN) l = bufferSize - bufferOffset;
else l = Math.min(n - i, bufferSize - bufferOffset);
buffer.put(bufferOffset, vals, vals.position() + i, l);
i+=l;
}
}
@Override
public void swapn(int n, long idx1, long idx2) {
for (int i = 0; i < n; i++)
swap(idx1+i, idx2+i);
}
private void swap(long idx1, long idx2) {
LongBuffer buff1 = buffers.get((int)(idx1 / bufferSize));
final int o1 = (int) (idx1 % bufferSize);
LongBuffer buff2 = buffers.get((int)(idx2 / bufferSize));
final int o2 = (int) (idx2 % bufferSize);
long tmp = buff1.get(o1);
buff1.put(o1, buff2.get(o2));
buff2.put(o2, tmp);
}
@Override
public void setRange(long idx, int n, long val) {
if (n == 0) return;
if (idx+n >= mappedSize) {
grow(idx+n);
}
int iN = (int)((idx + n) / bufferSize);
for (int i = 0; i < n; ) {
int i0 = (int)((idx + i) / bufferSize);
int bufferOffset = (int) ((idx+i) % bufferSize);
var buffer = buffers.get(i0);
final int l;
if (i0 < iN) l = bufferSize - bufferOffset;
else l = Math.min(n - i, bufferSize - bufferOffset);
for (int p = 0; p < l; p++) {
buffer.put(bufferOffset + p, val);
}
i+=l;
}
}
@Override
public void transferFromFileChannel(FileChannel sourceChannel, long destOffset, long sourceStart, long sourceEnd) throws IOException {
int length = (int)(sourceEnd - sourceStart);
if (destOffset+length >= mappedSize) {
grow(destOffset+length);
}
int i0 = (int)((destOffset) / bufferSize);
int iN = (int)((destOffset + length) / bufferSize);
int numBuffers = iN - i0 + 1;
ByteBuffer[] buffers = new ByteBuffer[numBuffers];
for (int i = 0; i < numBuffers; i++) {
buffers[i] = mappedByteBuffers.get(i0 + i);
buffers[i].clear();
}
if (i0 != iN) {
int startBuf0 = (int) ((destOffset) % bufferSize) * 8;
int endBuf0 = buffers[0].capacity() - (int) (destOffset % bufferSize) * 8;
int endBufN = (int)((destOffset + length) % bufferSize)*8;
buffers[0] = buffers[0].slice(startBuf0, endBuf0);
buffers[numBuffers-1] = buffers[numBuffers-1].slice(0, endBufN);
}
else {
buffers[0] = buffers[0].slice((int) (destOffset % bufferSize) * 8, 8*length);
}
sourceChannel.position(sourceStart*8);
long twb = 0;
while (twb < length * 8L) {
long rb = sourceChannel.read(buffers, 0, buffers.length);
if (rb < 0)
throw new IOException();
twb += rb;
}
}
@Override
public long binarySearchInternal(long key, long fromIndex, int step, long n, long mask) {
if (fromIndex + n*step >= mappedSize)
grow(fromIndex + n*step);
long low = 0;
long high = n - 1;
if (isSameBuffer(fromIndex, fromIndex+step*n)) {
int idx = (int)(fromIndex / bufferSize);
var buffer = buffers.get(idx);
while (low <= high) {
long mid = (low + high) >>> 1;
long off = fromIndex + mid*step;
long midVal = buffer.get((int)(off % bufferSize)) & mask;
if (midVal < key)
low = mid + 1;
else if (midVal > key)
high = mid - 1;
else
return fromIndex + mid*step;
}
}
else {
while (low <= high) {
long mid = (low + high) >>> 1;
long off = fromIndex + mid*step;
long midVal = buffers.get((int)(off / bufferSize)).get((int)(off % bufferSize)) & mask;
if (midVal < key)
low = mid + 1;
else if (midVal > key)
high = mid - 1;
else
return fromIndex + mid*step;
}
}
return -1L-(fromIndex + high*step);
}
@Override
public long binarySearchInternal(long key, long fromIndex, long n, long mask) {
if (fromIndex + n >= mappedSize)
grow(fromIndex + n);
long low = 0;
long high = n - 1;
if (isSameBuffer(fromIndex, fromIndex+n)) {
int idx = (int)(fromIndex / bufferSize);
var buffer = buffers.get(idx);
while (low <= high) {
long mid = (low + high) >>> 1;
long off = fromIndex + mid;
long midVal = buffer.get((int)(off % bufferSize)) & mask;
if (midVal < key)
low = mid + 1;
else if (midVal > key)
high = mid - 1;
else
return fromIndex + mid;
}
}
else {
while (low <= high) {
long mid = (low + high) >>> 1;
long off = fromIndex + mid;
long midVal = buffers.get((int)(off / bufferSize)).get((int)(off % bufferSize)) & mask;
if (midVal < key)
low = mid + 1;
else if (midVal > key)
high = mid - 1;
else
return fromIndex + mid;
}
}
return -1L-(fromIndex + high);
}
@Override
public long binarySearchInternal(long key, long fromIndex, long n) {
if (fromIndex + n >= mappedSize)
grow(fromIndex + n);
long low = 0;
long high = n - 1;
if (isSameBuffer(fromIndex, fromIndex+n)) {
int idx = (int)(fromIndex / bufferSize);
var buffer = buffers.get(idx);
while (low <= high) {
long mid = (low + high) >>> 1;
long off = fromIndex + mid;
long midVal = buffer.get((int)(off % bufferSize));
if (midVal < key)
low = mid + 1;
else if (midVal > key)
high = mid - 1;
else
return fromIndex + mid;
}
}
else {
while (low <= high) {
long mid = (low + high) >>> 1;
long off = fromIndex + mid;
long midVal = buffers.get((int)(off / bufferSize)).get((int)(off % bufferSize));
if (midVal < key)
low = mid + 1;
else if (midVal > key)
high = mid - 1;
else
return fromIndex + mid;
}
}
return -1L-(fromIndex + high);
}
@Override
public long binarySearchUpperInternal(long key, long fromIndex, long n) {
if (fromIndex + n >= mappedSize)
grow(fromIndex + n);
long low = 0;
long high = n - 1;
if (isSameBuffer(fromIndex, fromIndex+n)) {
int idx = (int)(fromIndex / bufferSize);
var buffer = buffers.get(idx);
while (low <= high) {
long mid = (low + high) >>> 1;
long off = fromIndex + mid;
long midVal = buffer.get((int)(off % bufferSize));
if (midVal < key)
low = mid + 1;
else if (midVal > key)
high = mid - 1;
else
return fromIndex + mid;
}
}
else {
while (low <= high) {
long mid = (low + high) >>> 1;
long off = fromIndex + mid;
long midVal = buffers.get((int)(off / bufferSize)).get((int)(off % bufferSize));
if (midVal < key)
low = mid + 1;
else if (midVal > key)
high = mid - 1;
else
return fromIndex + mid;
}
}
return fromIndex + low;
}
private boolean isSameBuffer(long a, long b) {
return a / bufferSize == b/bufferSize;
}
@Override
public long quickSortPartition(int wordSize, long low, long high) {
if (high >= mappedSize)
grow(high + wordSize - 1);
if (isSameBuffer(low, high + wordSize - 1)) {
// Specialization that circumvents the need for expensive calls to
// MultimapFileLong.get() in the most common scenario
return quickSortPartitionSameBuffer(wordSize, low, high);
}
else {
return quickSortPartitionDifferentBuffers(wordSize, low, high);
}
}
@Override
public void insertionSort(int wordSize, long start, int n) {
if (start + n + wordSize - 1 >= mappedSize)
grow(start + n + wordSize - 1);
if (n <= 1) {
return;
}
if (isSameBuffer(start, start + (long)n*wordSize-1L)) {
final var buffer = buffers.get((int) (start / bufferSize));
int off = (int) (start % bufferSize);
for (int i = 1; i < n; i++) {
long key = buffer.get(off + i * wordSize);
int j = i - 1;
while (j >= 0 && buffer.get(off + wordSize*j) > key) {
for (int w = 0; w < wordSize; w++) {
long tmp = buffer.get(off+wordSize*j+w);
buffer.put(off+wordSize*j+w, buffer.get(off+wordSize*(j+1)+w));
buffer.put(off+wordSize*(j+1)+w, tmp);
}
j--;
}
buffer.put(off + (j+1) * wordSize, key);
}
}
else for (int i = 1; i < n; i++) {
long key = get(start + (long) i * wordSize);
int j = i - 1;
while (j >= 0 && get(start + (long)wordSize*j) > key) {
swapn(wordSize, start + (long)wordSize*j, start + (long)wordSize*(j+1));
j--;
}
put(start + (long) (j+1) * wordSize, key);
}
}
private long quickSortPartitionDifferentBuffers(int wordSize, long low, long high) {
long pivotPoint = ((low + high) / (2L*wordSize)) * wordSize;
long pivot = get(pivotPoint);
long i = low - wordSize;
long j = high + wordSize;
for (;;) {
do {
i+=wordSize;
} while (get(i) < pivot);
do {
j-=wordSize;
}
while (get(j) > pivot);
if (i >= j) return j;
else swapn(wordSize, i, j);
}
}
private long quickSortPartitionSameBuffer(int wordSize, long low, long high) {
final var buffer = buffers.get((int) (low / bufferSize));
final long pivotPointLong = ((low + high) / (2L*wordSize)) * wordSize;
final int pivotPoint = (int) (pivotPointLong % bufferSize);
final long pivot = buffer.get(pivotPoint);
int j = (int) (high % bufferSize) + wordSize;
int i = (int) (low % bufferSize) - wordSize;
long j0 = high + wordSize - j;
for (;;) {
do {
i+=wordSize;
} while (buffer.get(i) < pivot);
do {
j-=wordSize;
}
while (buffer.get(j) > pivot);
if (i >= j) return j0 + j;
else {
for (int w = 0; w < wordSize; w++) {
long tmp = buffer.get(i+w);
buffer.put(i+w, buffer.get(j+w));
buffer.put(j+w, tmp);
}
}
}
}
public void retain(BTreeQueryBuffer buffer, long boundary, long searchStart, long numEntries, long mask, int stepSize) {
final long end = searchStart + stepSize * numEntries;
if (end < mappedSize) {
grow(end);
}
long bv = buffer.currentValue() & mask;
long av = get(searchStart) & mask;
long pos = searchStart;
int bi = (int)(searchStart / bufferSize);
int bo = (int)(searchStart % bufferSize);
LongBuffer data = buffers.get(bi);
while (bv <= boundary && buffer.hasMore()) {
if (bv < av) {
if (!buffer.rejectAndAdvance()) break;
bv = buffer.currentValue() & mask;
continue;
}
else if (bv == av) {
if (!buffer.retainAndAdvance()) break;
bv = buffer.currentValue() & mask;
continue;
}
pos += stepSize;
if (pos < end) {
bo += stepSize;
if (bo >= bufferSize) {
data = buffers.get(++bi);
bo = 0;
}
av = data.get(bo) & mask;
}
else {
break;
}
}
}
public void reject(BTreeQueryBuffer buffer, long boundary, long searchStart, long numEntries, long mask, int stepSize) {
final long end = searchStart + stepSize * numEntries;
if (end < mappedSize) {
grow(end);
}
long bv = buffer.currentValue() & mask;
long av = get(searchStart) & mask;
long pos = searchStart;
int bi = (int)(searchStart / bufferSize);
int bo = (int)(searchStart % bufferSize);
LongBuffer data = buffers.get(bi);
while (bv <= boundary && buffer.hasMore()) {
if (bv < av) {
if (!buffer.retainAndAdvance()) break;
bv = buffer.currentValue() & mask;
continue;
}
else if (bv == av) {
if (!buffer.rejectAndAdvance()) break;
bv = buffer.currentValue() & mask;
continue;
}
pos += stepSize;
if (pos < end) {
bo += stepSize;
if (bo >= bufferSize) {
data = buffers.get(++bi);
bo = 0;
}
av = data.get(bo) & mask;
}
else {
break;
}
}
}
@Override
public void close() throws IOException {
force();
mappedByteBuffers.clear();
buffers.clear();
channel.close();
// I want to believe
System.runFinalization();
System.gc();
}
}
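With that, the 850-odd line memory-mapped workhorse is gone. Judging from the other diffs in this commit, its duties (growth, cross-buffer reads and writes, sorting, searching, retain/reject) pass to the LongArray abstraction, which hides the buffer-boundary bookkeeping that dominates the code above. The sketch below is inferred solely from call sites visible in this commit and is not verified against the LongArray source:

// Inferred (unverified) LongArray usage, pieced together from call sites in
// this commit: range() views, get/set, fill, and strided search/retain.
static void sketch(nu.marginalia.util.array.LongArray file, long start, long end) {
    var slice = file.range(start, end);          // view over [start, end), no copy
    slice.set(0, 42L);
    long v = slice.get(0);                       // 42
    slice.fill(1, end - start, Long.MAX_VALUE);  // pad [1, length) with a sentinel
}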

View File

@ -1,120 +0,0 @@
package nu.marginalia.util.multimap;
import java.io.IOException;
import java.nio.LongBuffer;
import java.nio.channels.FileChannel;
public class MultimapFileLongOffsetSlice implements MultimapFileLongSlice {
private final long off;
private final MultimapFileLongSlice map;
public MultimapFileLongOffsetSlice(MultimapFileLongSlice map, long off) {
this.off = off;
this.map = map;
}
@Override
public long size() {
return map.size() - off;
}
@Override
public void put(long idx, long val) {
map.put(off+idx, val);
}
@Override
public void setRange(long idx, int n, long val) {
map.setRange(off+idx, n, val);
}
@Override
public long get(long idx) {
return map.get(off+idx);
}
@Override
public void read(long[] vals, long idx) {
map.read(vals, idx+off);
}
@Override
public void read(long[] vals, int n, long idx) {
map.read(vals, n, idx+off);
}
@Override
public void read(LongBuffer vals, long idx) { map.read(vals, idx+off); }
@Override
public void write(long[] vals, long idx) {
map.write(vals, idx+off);
}
@Override
public void write(long[] vals, int n, long idx) {
map.write(vals, n, idx+off);
}
@Override
public void write(LongBuffer vals, long idx) {
map.write(vals, idx+off);
}
@Override
public void write(LongBuffer vals, int n, long idx) {
map.write(vals, n,idx+off);
}
@Override
public void swapn(int n, long idx1, long idx2) {
map.swapn(n, idx1+off, idx2+off);
}
@Override
public void transferFromFileChannel(FileChannel sourceChannel, long destOffset, long sourceStart, long sourceEnd)
throws IOException {
map.transferFromFileChannel(sourceChannel, destOffset + off, sourceStart, sourceEnd);
}
@Override
public MultimapFileLongSlice atOffset(long off) {
// If we don't override this, the default implementation would build a pyramid of
// MultimapFileLongSlice(MultimapFileLongSlice(MultimapFileLongSlice(MultimapFileLongSlice(MultimapFileLongSlice(...)))
// if this is called iteratively (e.g. to walk over a file)
return new MultimapFileLongOffsetSlice(map, this.off + off);
}
@Override
public long binarySearchInternal(long key, long fromIndex, int step, long n, long mask) {
throw new UnsupportedOperationException();
}
@Override
public long binarySearchInternal(long key, long fromIndex, long n, long mask) {
throw new UnsupportedOperationException();
}
@Override
public long binarySearchInternal(long key, long fromIndex, long n) {
throw new UnsupportedOperationException();
}
@Override
public long binarySearchUpperInternal(long key, long fromIndex, long n) {
throw new UnsupportedOperationException();
}
@Override
public long quickSortPartition(int wordSize, long low, long highInclusive) {
return map.quickSortPartition(wordSize, low+off, highInclusive+off);
}
@Override
public void insertionSort(int wordSize, long start, int n) {
map.insertionSort(wordSize, start+off, n);
}
}

View File

@ -1,47 +0,0 @@
package nu.marginalia.util.multimap;
import java.io.IOException;
import java.nio.LongBuffer;
import java.nio.channels.FileChannel;
public interface MultimapFileLongSlice {
long size();
void put(long idx, long val);
void setRange(long idx, int n, long val);
long get(long idx);
void read(long[] vals, long idx);
void read(long[] vals, int n, long idx);
void read(LongBuffer vals, long idx);
void write(long[] vals, long idx);
void write(long[] vals, int n, long idx);
void write(LongBuffer vals, long idx);
void write(LongBuffer vals, int n, long idx);
void swapn(int n, long idx1, long idx2);
void transferFromFileChannel(FileChannel sourceChannel, long destOffset, long sourceStart, long sourceEnd) throws IOException;
default MultimapFileLongSlice atOffset(long off) {
return new MultimapFileLongOffsetSlice(this, off);
}
long binarySearchInternal(long key, long fromIndex, int step, long n, long mask);
long binarySearchInternal(long key, long fromIndex, long n, long mask);
long binarySearchInternal(long key, long fromIndex, long n);
long binarySearchUpperInternal(long key, long fromIndex, long n);
long quickSortPartition(int wordSize, long low, long highInclusive);
void insertionSort(int wordSize, long start, int n);
}

View File

@ -1,80 +0,0 @@
package nu.marginalia.util.multimap;
public interface MultimapSearcher {
long binarySearchLower(long key, long fromIndex, long n);
long binarySearch(long key, long fromIndex, long n);
static MultimapSearcher forContext(MultimapFileLongSlice slice, long mask, int stepSize) {
if (mask == ~0L && stepSize == 1) {
return new SimpleMultimapSearcher(new MultimapSearcherBase(slice));
}
else if (stepSize == 1) {
return new MaskedMultimapSearcher(new MultimapSearcherBase(slice), mask);
}
else {
return new SteppingMaskedMultimapSearcher(new MultimapSearcherBase(slice), mask, stepSize);
}
}
}
class SimpleMultimapSearcher implements MultimapSearcher {
private final MultimapSearcherBase base;
SimpleMultimapSearcher(MultimapSearcherBase base) {
this.base = base;
}
@Override
public long binarySearchLower(long key, long fromIndex, long n) {
return base.binarySearchLower(key, fromIndex, n);
}
@Override
public long binarySearch(long key, long fromIndex, long n) {
return base.binarySearch(key, fromIndex, n);
}
}
class MaskedMultimapSearcher implements MultimapSearcher {
private final MultimapSearcherBase base;
private final long mask;
MaskedMultimapSearcher(MultimapSearcherBase base, long mask) {
this.base = base;
this.mask = mask;
}
@Override
public long binarySearchLower(long key, long fromIndex, long n) {
return base.binarySearchLower(key, fromIndex, n, mask);
}
@Override
public long binarySearch(long key, long fromIndex, long n) {
return base.binarySearch(key, fromIndex, n, mask);
}
}
class SteppingMaskedMultimapSearcher implements MultimapSearcher {
private final MultimapSearcherBase base;
private final long mask;
private final int step;
SteppingMaskedMultimapSearcher(MultimapSearcherBase base, long mask, int step) {
this.base = base;
this.mask = mask;
this.step = step;
}
@Override
public long binarySearchLower(long key, long fromIndex, long n) {
return base.binarySearchLower(key, fromIndex, step, n, mask);
}
@Override
public long binarySearch(long key, long fromIndex, long n) {
return base.binarySearch(key, fromIndex, step, n, mask);
}
}

View File

@ -1,86 +0,0 @@
package nu.marginalia.util.multimap;
import lombok.experimental.Delegate;
public class MultimapSearcherBase {
@Delegate
private final MultimapFileLongSlice mmf;
public MultimapSearcherBase(MultimapFileLongSlice mmf) {
this.mmf = mmf;
}
public boolean binarySearchTest(long key, long fromIndex, long n) {
long low = 0;
long high = n - 1;
while (low <= high) {
long mid = (low + high) >>> 1;
long midVal = get(fromIndex + mid);
if (midVal < key)
low = mid + 1;
else if (midVal > key)
high = mid - 1;
else
return true;
}
return false;
}
public long binarySearchLower(long key, long fromIndex, long n) {
return mmf.binarySearchUpperInternal(key, fromIndex, n);
}
public long binarySearchLower(long key, long fromIndex, long n, long mask) {
long low = 0;
long high = n - 1;
while (low <= high) {
long mid = (low + high) >>> 1;
long midVal = get(fromIndex + mid) & mask;
if (midVal < key)
low = mid + 1;
else if (midVal > key)
high = mid - 1;
else
return fromIndex + mid;
}
return fromIndex + low;
}
public long binarySearchLower(long key, long fromIndex, int step, long n, long mask) {
long low = 0;
long high = n - 1;
while (low <= high) {
long mid = (low + high) >>> 1;
long midVal = get(fromIndex + mid*step) & mask;
if (midVal < key)
low = mid + 1;
else if (midVal > key)
high = mid - 1;
else
return fromIndex + mid*step;
}
return fromIndex + low*step;
}
public long binarySearch(long key, long fromIndex, long n) {
return mmf.binarySearchInternal(key, fromIndex, n);
}
public long binarySearch(long key, long fromIndex, long n, long mask) {
return mmf.binarySearchInternal(key, fromIndex, n, mask);
}
public long binarySearch(long key, long fromIndex, int step, long n, long mask) {
return mmf.binarySearchInternal(key, fromIndex, step, n, mask);
}
}
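
All of the variants above are the same textbook binary search; they differ
only in whether a mask and a step factor are applied before each comparison.
A self-contained sketch of the masked case over a plain array (illustrative
names; the transform parameter stands in for the mask):

import java.util.function.LongUnaryOperator;

public class MaskedBinarySearchSketch {
    /** Index of key in sorted a under the transform, or the insertion point. */
    static long search(long[] a, long key, LongUnaryOperator transform) {
        long low = 0, high = a.length - 1;
        while (low <= high) {
            long mid = (low + high) >>> 1;
            long midVal = transform.applyAsLong(a[(int) mid]);
            if (midVal < key) low = mid + 1;
            else if (midVal > key) high = mid - 1;
            else return mid;
        }
        return low;
    }

    public static void main(String[] args) {
        // entries pack (key << 32 | value); search on the key half only
        long[] packed = { (2L << 32) | 7, (5L << 32) | 1, (9L << 32) | 3 };
        System.out.println(search(packed, 5, v -> v >>> 32)); // prints 1
    }
}

With the identity transform this degenerates to the unmasked variant, which is
why forContext() in MultimapSearcher picks the plain searcher whenever
mask == ~0L and stepSize == 1.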

View File

@ -1,151 +0,0 @@
package nu.marginalia.util.multimap;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.nio.ByteBuffer;
import java.nio.LongBuffer;
import java.nio.channels.FileChannel;
import java.nio.file.Files;
import java.nio.file.Path;
import static nu.marginalia.util.multimap.MultimapFileLong.WORD_SIZE;
public class MultimapSorter {
private final Path tmpFileDir;
private final MultimapFileLongSlice multimapFileLong;
private final LongBuffer buffer;
private final int internalSortLimit;
private final int wordSize;
private static final Logger logger = LoggerFactory.getLogger(MultimapSorter.class);
public MultimapSorter(MultimapFileLongSlice multimapFileLong, Path tmpFileDir, int internalSortLimit, int wordSize) {
this.multimapFileLong = multimapFileLong;
this.tmpFileDir = tmpFileDir;
this.internalSortLimit = internalSortLimit;
this.wordSize = wordSize;
buffer = ByteBuffer.allocateDirect(internalSortLimit * wordSize * 8).asLongBuffer();
}
public void sortRange(long start, long end) throws IOException {
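// Ranges that fit the in-memory buffer are quicksorted in place; larger
// ranges go through the (possibly file-backed) merge sort below.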
if (end - start < internalSortLimit) {
quickSortLH(start, end - wordSize);
}
else {
mergeSort(start, (int) (end - start));
}
if (MultimapSorter.class.desiredAssertionStatus()) {
for (long lp = start + wordSize; lp < end; lp += wordSize) {
if (multimapFileLong.get(lp - wordSize) > multimapFileLong.get(lp)) {
logger.error("Sort contract breached [{}:{} ({}), ws={}, <isl={}, bc={}]",
start, end,
end - start,
wordSize, end - start < internalSortLimit,
buffer.capacity());
}
}
}
}
public void mergeSort(long start, int lengthLongs) throws IOException {
if (lengthLongs == 1)
return;
if (lengthLongs < buffer.capacity()) {
mergeSort(start, lengthLongs, buffer);
}
else {
Path tmpFile = Files.createTempFile(tmpFileDir,"sort-"+start+"-"+(start+lengthLongs), ".dat");
try (var raf = new RandomAccessFile(tmpFile.toFile(), "rw"); var channel = raf.getChannel()) {
var workBuffer =
channel.map(FileChannel.MapMode.READ_WRITE, 0, wordSize * lengthLongs * WORD_SIZE)
.asLongBuffer();
mergeSort(start, lengthLongs, workBuffer);
}
finally {
tmpFile.toFile().delete();
}
}
}
private void mergeSort(long start, int length, LongBuffer workBuffer) throws IOException {
int width = Math.min(Integer.highestOneBit(length), Integer.highestOneBit(buffer.capacity()));
// Do in-memory sorting up until internalSortLimit first
for (int i = 0; i < length; i += width) {
quickSort(start + i, Math.min(width, length-i));
}
// Then finish with merge sort
for (; width < length; width*=2) {
for (int i = 0; i < length; i += 2*width) {
merge(start, i, Math.min(i+width, length), Math.min(i+2*width, length), workBuffer);
}
workBuffer.clear();
multimapFileLong.write(workBuffer, length, start);
}
}
void merge(long offset, int left, int right, int end, LongBuffer workBuffer) {
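// Merge two sorted runs [left, right) and [right, end) of wordSize-long
// records into workBuffer, copying whichever run's head record has the
// smaller leading word.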
long idxL = left;
long idxR = right;
for (int putPos = left; putPos < end; putPos+= wordSize) {
final long bufferL = multimapFileLong.get(offset+idxL);
final long bufferR = multimapFileLong.get(offset+idxR);
if (idxL < right && (idxR >= end || bufferL < bufferR)) {
workBuffer.put(putPos, bufferL);
for (int s = 1; s < wordSize; s++) {
workBuffer.put(putPos + s, multimapFileLong.get(offset + idxL + s));
}
idxL+= wordSize;
}
else {
workBuffer.put(putPos, bufferR);
for (int s = 1; s < wordSize; s++) {
workBuffer.put(putPos + s, multimapFileLong.get(offset + idxR + s));
}
idxR+= wordSize;
}
}
}
public void insertionSort(long start, int n) {
multimapFileLong.insertionSort(wordSize, start, n);
}
private void swap(long a, long b) {
multimapFileLong.swapn(wordSize, a, b);
}
public void quickSort(long start, long length) {
quickSortLH(start, start + length - wordSize);
}
public void quickSortLH(long low, long highInclusive) {
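// Ranges shorter than 32 longs fall through to insertion sort; anything
// larger is partitioned and both halves are recursed on.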
if (low >= 0 && highInclusive >= 0 && low < highInclusive) {
if (highInclusive - low < 32) {
multimapFileLong.insertionSort(wordSize, low, (int) ((wordSize + highInclusive - low) / wordSize));
}
else {
long p = multimapFileLong.quickSortPartition(wordSize, low, highInclusive);
quickSortLH(low, p);
quickSortLH(p + wordSize, highInclusive);
}
}
}
}
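
The merge sort here is the classic bottom-up external form: runs that fit the
direct buffer are quicksorted, then pairs of runs of doubling width are merged
through a work buffer (an on-heap one if the range fits, a temporary
memory-mapped file otherwise). The same shape over an in-memory array, as a
hedged sketch with illustrative names:

import java.util.Arrays;

public class BottomUpMergeSketch {
    static void sort(long[] a, int runWidth) {
        // sort each initial run in place (stands in for the quicksort pass)
        for (int i = 0; i < a.length; i += runWidth)
            Arrays.sort(a, i, Math.min(i + runWidth, a.length));
        long[] work = new long[a.length];
        for (int width = runWidth; width < a.length; width *= 2) {
            for (int i = 0; i < a.length; i += 2 * width)
                merge(a, i, Math.min(i + width, a.length),
                        Math.min(i + 2 * width, a.length), work);
            System.arraycopy(work, 0, a, 0, a.length);
        }
    }

    static void merge(long[] a, int left, int right, int end, long[] work) {
        for (int i = left, l = left, r = right; i < end; i++)
            work[i] = (l < right && (r >= end || a[l] <= a[r])) ? a[l++] : a[r++];
    }

    public static void main(String[] args) {
        long[] a = { 9, 4, 7, 1, 8, 3, 2 };
        sort(a, 2);
        System.out.println(Arrays.toString(a)); // [1, 2, 3, 4, 7, 8, 9]
    }
}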

View File

@ -4,17 +4,18 @@ import gnu.trove.list.TIntList;
import gnu.trove.list.array.TIntArrayList;
import gnu.trove.map.hash.TIntIntHashMap;
import gnu.trove.map.hash.TIntObjectHashMap;
import it.unimi.dsi.fastutil.ints.IntArrays;
import it.unimi.dsi.fastutil.ints.IntComparator;
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl;
import org.roaringbitmap.RoaringBitmap;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.util.*;
import java.util.Arrays;
import java.util.Comparator;
import java.util.HashSet;
import java.util.Set;
import java.util.function.IntToDoubleFunction;
import java.util.stream.IntStream;
import it.unimi.dsi.fastutil.ints.IntArrays;
public abstract class RankingAlgorithm {
protected final TIntObjectHashMap<RankingDomainData> domainsById = new TIntObjectHashMap<>();
@ -154,7 +155,7 @@ public abstract class RankingAlgorithm {
}
public TIntList pageRank(int resultCount) {
public RoaringBitmap pageRank(int resultCount) {
RankVector rank = new RankVector(1.d / domainsById.size());
int iter_max = 100;
@ -176,7 +177,7 @@ public abstract class RankingAlgorithm {
return rank.getRanking(resultCount);
}
public TIntList pageRankWithPeripheralNodes(int resultCount) {
public RoaringBitmap pageRankWithPeripheralNodes(int resultCount) {
RankVector rank = new RankVector(1.d / domainsById.size());
int iter_max = 100;
@ -303,7 +304,7 @@ public abstract class RankingAlgorithm {
return list;
}
public TIntList getRanking(int numResults) {
public RoaringBitmap getRanking(int numResults) {
if (numResults < 0) {
numResults = domainIdToIndex.size();
}
@ -311,7 +312,7 @@ public abstract class RankingAlgorithm {
numResults = rank.length;
}
TIntArrayList list = new TIntArrayList(numResults);
RoaringBitmap list = new RoaringBitmap();
int[] nodes = new int[rank.length];
Arrays.setAll(nodes, i->i);
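
The return type change from TIntList to RoaringBitmap is worth a note: a
RoaringBitmap is a compressed set of ints that iterates in ascending id order,
so callers get a mergeable id set rather than a rank-ordered list. Since the
class implements Iterable<Integer>, the trove forEach calls in the tools below
can become plain for-each loops. A minimal usage sketch (using the
org.roaringbitmap dependency already present in this diff):

import org.roaringbitmap.RoaringBitmap;

public class RoaringSketch {
    public static void main(String[] args) {
        RoaringBitmap ids = new RoaringBitmap();
        ids.add(159);
        ids.add(3);
        ids.add(14);
        for (int id : ids) {              // iterates 3, 14, 159 (ascending)
            System.out.println(id);
        }
        System.out.println(ids.getCardinality()); // 3
    }
}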

View File

@ -2,7 +2,7 @@ package nu.marginalia.util.ranking;
import com.google.inject.Inject;
import com.zaxxer.hikari.HikariDataSource;
import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl;
import nu.marginalia.wmsa.edge.dbcommon.EdgeDomainBlacklistImpl;
import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -87,9 +87,9 @@ public class RankingDomainFetcher {
public void domainsByPattern(String pattern, IntConsumer idConsumer) {
try (var conn = dataSource.getConnection();
var stmt = conn.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME LIKE ?")) {
stmt.setString(1, pattern);
var rsp = stmt.executeQuery();
var stmt = conn.createStatement()) {
// This is sourced from a config file --v
var rsp = stmt.executeQuery("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME LIKE '" + pattern + "'");
while (rsp.next()) {
idConsumer.accept(rsp.getInt(1));
}

View File

@ -14,7 +14,7 @@ import nu.marginalia.util.ranking.RankingAlgorithm;
import nu.marginalia.util.ranking.RankingDomainData;
import nu.marginalia.util.ranking.RankingDomainFetcher;
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl;
import nu.marginalia.wmsa.edge.dbcommon.EdgeDomainBlacklistImpl;
import org.jetbrains.annotations.NotNull;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

View File

@ -5,7 +5,7 @@ import lombok.SneakyThrows;
import nu.marginalia.util.ranking.BuggyStandardPageRank;
import nu.marginalia.util.ranking.RankingDomainFetcher;
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl;
import nu.marginalia.wmsa.edge.dbcommon.EdgeDomainBlacklistImpl;
import org.mariadb.jdbc.Driver;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -51,14 +51,14 @@ public class UpdateDomainRanksTool {
rankMax = spr.size()*2;
uploader.start();
spr.pageRankWithPeripheralNodes(rankMax).forEach(i -> {
var rankData = spr.pageRankWithPeripheralNodes(rankMax);
for (int i : rankData) {
try {
uploadQueue.put(i);
} catch (InterruptedException e) {
e.printStackTrace();
}
return true;
});
}
long end = System.currentTimeMillis();
running = false;
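
(Same pattern in UpdateDomainRanksTool2 below: trove's forEach takes a
procedure whose boolean return value signals whether to continue, which is why
the removed lambda ended in "return true;"; the RoaringBitmap result is simply
iterated with for-each, so that continuation value disappears.)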

View File

@ -5,7 +5,7 @@ import lombok.SneakyThrows;
import nu.marginalia.util.ranking.BetterReversePageRank;
import nu.marginalia.util.ranking.RankingDomainFetcher;
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl;
import nu.marginalia.wmsa.edge.dbcommon.EdgeDomainBlacklistImpl;
import org.mariadb.jdbc.Driver;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -41,14 +41,14 @@ public class UpdateDomainRanksTool2 {
rankMax = rpr.size();
uploader.start();
rpr.pageRankWithPeripheralNodes(rankMax).forEach(i -> {
var rankData = rpr.pageRankWithPeripheralNodes(rankMax);
for (int i : rankData) {
try {
uploadQueue.put(i);
} catch (InterruptedException e) {
e.printStackTrace();
}
return true;
});
}
long end = System.currentTimeMillis();
running = false;

View File

@ -7,7 +7,7 @@ import gnu.trove.set.hash.TIntHashSet;
import lombok.SneakyThrows;
import nu.marginalia.util.AndCardIntSet;
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
import nu.marginalia.wmsa.edge.data.dao.EdgeDataStoreDaoImpl;
import nu.marginalia.wmsa.edge.dbcommon.EdgeDataStoreDaoImpl;
import nu.marginalia.wmsa.edge.model.EdgeDomain;
import nu.marginalia.wmsa.edge.model.id.EdgeId;
import org.roaringbitmap.RoaringBitmap;

View File

@ -0,0 +1,246 @@
package nu.marginalia.util.tool;
import it.unimi.dsi.fastutil.ints.IntOpenHashSet;
import it.unimi.dsi.fastutil.ints.IntSet;
import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;
import lombok.SneakyThrows;
import nu.marginalia.util.AndCardIntSet;
import org.roaringbitmap.RoaringBitmap;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.*;
import java.util.function.Consumer;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import static nu.marginalia.util.AndCardIntSet.andCardinality;
import static nu.marginalia.util.AndCardIntSet.weightedProduct;
public class EdgeWordWordConsineSimilarityMain {
final Object2IntOpenHashMap<String> stringIds;
final AndCardIntSet[] dToSMap;
final float[] weights;
final boolean useWeights = false;
enum Direction {
S_TO_D,
D_TO_S
}
final Direction direction = Direction.D_TO_S;
public EdgeWordWordConsineSimilarityMain(Path dataFile) throws IOException {
System.out.println("String IDs");
stringIds = mapStringsToIds(dataFile);
System.out.println("DtoS Map");
dToSMap = constructDtoSMap(dataFile, stringIds);
System.out.println("Weights");
if (useWeights) {
weights = new float[stringIds.size()];
for (int i = 0; i < stringIds.size(); i++) {
weights[i] = getWeight(i);
}
}
else {
weights = null;
}
System.out.println("Ready");
}
private Object2IntOpenHashMap<String> mapStringsToIds(Path dataFile) throws IOException {
Object2IntOpenHashMap<String> stringIds = new Object2IntOpenHashMap<>(15_000_000);
try (var lines = Files.lines(dataFile, StandardCharsets.UTF_8)) {
lines.forEach(line -> {
int tab = line.indexOf('\t');
if (tab <= 0)
return;
// direction doesn't matter here
String from = line.substring(0, tab);
String to = line.substring(tab + 1);
stringIds.putIfAbsent(from, stringIds.size());
stringIds.putIfAbsent(to, stringIds.size());
});
}
return stringIds;
}
private AndCardIntSet[] constructDtoSMap(Path dataFile, Object2IntOpenHashMap<String> stringIds) throws IOException {
Map<Integer, RoaringBitmap> tmpMap = new HashMap<>(15_000_000);
try (var lines = Files.lines(dataFile, StandardCharsets.UTF_8)) {
lines.forEach(line -> {
int tab = line.indexOf('\t');
if (tab <= 0) return;
String from, to;
if (direction == Direction.S_TO_D) {
from = line.substring(0, tab);
to = line.substring(tab + 1);
}
else {
from = line.substring(tab + 1);
to = line.substring(0, tab);
}
tmpMap.computeIfAbsent(stringIds.getInt(to), this::createBitmapWithSelf).add(stringIds.getInt(from));
});
}
AndCardIntSet[] dToSMap = new AndCardIntSet[stringIds.size()];
tmpMap.entrySet().stream()
.filter(e -> isEligible(e.getValue()))
.forEach(e -> dToSMap[e.getKey()] = AndCardIntSet.of(e.getValue()));
return dToSMap;
}
private boolean isEligible(RoaringBitmap value) {
int cardinality = value.getCardinality();
return cardinality > 50;
}
@SneakyThrows
public void tryDomains(String... word) {
System.out.println(Arrays.toString(word));
int[] domainIds = Arrays.stream(word).mapToInt(stringIds::getInt).toArray();
long start = System.currentTimeMillis();
findAdjacentDtoS(new IntOpenHashSet(domainIds), similarities -> {
Set<Integer> ids = similarities.similarities().stream().map(Similarity::id).collect(Collectors.toSet());
Map<Integer, String> reverseIds = new HashMap<>(similarities.similarities().size());
stringIds.forEach((str, id) -> {
if (ids.contains(id)) {
reverseIds.put(id, str);
}
});
for (var similarity : similarities.similarities()) {
System.out.println(reverseIds.get(similarity.id()) + "\t" + dToSMap[similarity.id()].getCardinality() + "\t" + prettyPercent(similarity.value()));
}
});
System.out.println(System.currentTimeMillis() - start);
}
private String prettyPercent(double val) {
return String.format("%2.2f%%", 100. * val);
}
public RoaringBitmap createBitmapWithSelf(int val) {
var bm = new RoaringBitmap();
bm.add(val);
return bm;
}
double cosineSimilarity(AndCardIntSet a, AndCardIntSet b) {
double andCardinality = andCardinality(a, b);
andCardinality /= Math.sqrt(a.getCardinality());
andCardinality /= Math.sqrt(b.getCardinality());
return andCardinality;
}
double expensiveCosineSimilarity(AndCardIntSet a, AndCardIntSet b) {
return weightedProduct(weights, a, b) / Math.sqrt(a.mulAndSum(weights) * b.mulAndSum(weights));
}
float getWeight(int i) {
var vector = dToSMap[i];
if (vector == null) return 1.0f;
return 1.0f / (float) Math.log(2+vector.getCardinality());
}
record Similarities(int id, List<Similarity> similarities) {};
record Similarity(int id, double value) {};
@SneakyThrows
private void findAdjacentDtoS(IntSet ids, Consumer<Similarities> andThen) {
AndCardIntSet[] vectors = ids.intStream().mapToObj(id -> dToSMap[id]).toArray(AndCardIntSet[]::new);
for (var vector : vectors) {
if (null == vector)
return;
}
var vector = Arrays.stream(vectors).reduce(AndCardIntSet::and).orElseThrow();
List<Similarity> similarities = IntStream.range(0, dToSMap.length).parallel().mapToObj(
id -> vectorSimilarity(ids, vector, id))
.filter(Objects::nonNull)
.sorted(Comparator.comparing(Similarity::value))
.toList();
andThen.accept(new Similarities(0, similarities));
}
double cardinalityLimit = 0.1;
private Similarity vectorSimilarity(IntSet ids, AndCardIntSet vector, int id) {
/* The minimum cardinality |b| for which
*
*      |a /\ b|
*  ----------------- >= k
*  sqrt(|a|)sqrt(|b|)
*
* is attainable is k^2 * |a|, since |a /\ b| <= |b|.
*/
final double cardMin = Math.min(2, cardinalityLimit * cardinalityLimit * vector.getCardinality());
if (ids.contains(id) || id >= dToSMap.length)
return null;
var otherVec = dToSMap[id];
if (otherVec == null || otherVec.getCardinality() < cardMin)
return null;
double similarity = cosineSimilarity(vector, otherVec);
if (similarity > 0.1) {
if (useWeights) {
var recalculated = expensiveCosineSimilarity(vector, otherVec);
if (recalculated > 0.1) {
return new Similarity(id, recalculated);
}
}
else {
return new Similarity(id, similarity);
}
}
return null;
}
public static void main(String[] args) throws IOException {
var main = new EdgeWordWordConsineSimilarityMain(Path.of(args[0]));
for (;;) {
String line = System.console().readLine("Words> ");
if (line == null || line.isBlank()) {
break;
}
main.tryDomains(line.split("\\s+"));
}
}
}
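
The pruning bound in vectorSimilarity() follows from |a /\ b| <= |b|: because
cos(a,b) = |a /\ b| / (sqrt(|a|) sqrt(|b|)) <= sqrt(|b| / |a|), a candidate b
with fewer than k^2 * |a| elements can never reach similarity k, and can be
skipped without computing an intersection. A self-contained sketch of the
similarity and the bound over plain int sets (illustrative names, not
repository code):

import java.util.Set;

public class CosineSketch {
    static double cosine(Set<Integer> a, Set<Integer> b) {
        long intersection = a.stream().filter(b::contains).count();
        return intersection / Math.sqrt((double) a.size() * b.size());
    }

    /** Smallest |b| for which cosine(a, b) can still reach threshold k. */
    static double cardMin(Set<Integer> a, double k) {
        return k * k * a.size();
    }

    public static void main(String[] args) {
        Set<Integer> a = Set.of(1, 2, 3, 4);
        Set<Integer> b = Set.of(3, 4, 5);
        System.out.printf("%.3f%n", cosine(a, b)); // 0.577
        System.out.println(cardMin(a, 0.1));       // 0.04, so any |b| >= 1 passes
    }
}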

View File

@ -0,0 +1,43 @@
package nu.marginalia.util.tool;
import nu.marginalia.wmsa.edge.integration.wikipedia.WikipediaReader;
import nu.marginalia.wmsa.edge.model.EdgeDomain;
import org.jsoup.Jsoup;
import java.util.HashSet;
import java.util.Set;
public class WikipediaInternalLinkExtractorMain {
public static void main(String... args) throws InterruptedException {
new WikipediaReader(args[0], new EdgeDomain("en.wikipedia.org"), wikipediaArticle -> {
var doc = Jsoup.parse(wikipediaArticle.body);
String path = wikipediaArticle.url.path.substring("/wiki/".length());
if (isIncluded(path)) {
Set<String> seen = new HashSet<>(100);
for (var atag : doc.getElementsByTag("a")) {
String href = atag.attr("href");
if (href.contains("#")) {
href = href.substring(0, href.indexOf('#'));
}
if (isIncluded(href) && href.length() > 2 && seen.add(href)) {
System.out.println(path + "\t" + href);
}
}
}
}).join();
}
private static boolean isIncluded(String href) {
return !href.contains(":")
&& !href.contains("/")
&& !href.contains("%")
&& !href.startsWith("#");
}
}
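
This extractor writes one "source<TAB>target" pair per line to stdout, which
is the tab-separated format EdgeWordWordConsineSimilarityMain above reads as
its data file, so its output can be fed straight into the similarity tool.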

View File

@ -2,6 +2,7 @@ package nu.marginalia.wmsa.api.model;
import lombok.AllArgsConstructor;
import lombok.Getter;
import nu.marginalia.wmsa.edge.index.model.EdgePageWordMetadata;
import nu.marginalia.wmsa.edge.model.search.EdgeSearchResultKeywordScore;
import nu.marginalia.wmsa.edge.model.search.EdgeUrlDetails;
@ -33,11 +34,11 @@ public class ApiSearchResult {
for (var entries : bySet.values()) {
List<ApiSearchResultQueryDetails> lst = new ArrayList<>();
for (var entry : entries) {
var metadata = entry.metadata();
var metadata = new EdgePageWordMetadata(entry.encodedWordMetadata());
if (metadata.isEmpty())
continue outer;
Set<String> flags = metadata.flags().stream().map(Object::toString).collect(Collectors.toSet());
Set<String> flags = metadata.flagSet().stream().map(Object::toString).collect(Collectors.toSet());
lst.add(new ApiSearchResultQueryDetails(entry.keyword(), metadata.tfIdf(), metadata.count(), flags));
}
details.add(lst);

View File

@ -14,7 +14,6 @@ import nu.marginalia.wmsa.podcasts.PodcastScraperMain;
import nu.marginalia.wmsa.renderer.RendererMain;
import nu.marginalia.wmsa.resource_store.ResourceStoreMain;
import nu.marginalia.wmsa.smhi.scraper.SmhiScraperMain;
import org.apache.logging.log4j.core.lookup.MainMapLookup;
import java.util.Map;
import java.util.stream.Collectors;
@ -78,7 +77,6 @@ public enum ServiceDescriptor {
}
public static void main(String... args) {
MainMapLookup.setMainArguments(args);
Map<String, Command> functions = Stream.of(
new ListCommand(),
new StartCommand(),

View File

@ -4,7 +4,7 @@ import com.google.common.base.Strings;
import com.google.inject.Inject;
import com.zaxxer.hikari.HikariDataSource;
import lombok.SneakyThrows;
import nu.marginalia.wmsa.edge.data.dao.EdgeDataStoreDao;
import nu.marginalia.wmsa.edge.dbcommon.EdgeDataStoreDao;
import nu.marginalia.wmsa.edge.model.EdgeDomain;
import nu.marginalia.wmsa.edge.model.id.EdgeId;
import org.apache.commons.io.IOUtils;

View File

@ -6,6 +6,7 @@ import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DocumentKeywor
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DomainLink;
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadProcessedDocument;
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadProcessedDocumentWithError;
import nu.marginalia.wmsa.edge.index.model.EdgePageDocumentsMetadata;
import nu.marginalia.wmsa.edge.model.EdgeDomain;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState;
@ -62,7 +63,7 @@ public class ConversionLog implements AutoCloseable, Interpreter {
}
@Override
public void loadKeywords(EdgeUrl url, DocumentKeywords[] words) {}
public void loadKeywords(EdgeUrl url, EdgePageDocumentsMetadata metadata, DocumentKeywords words) {}
@Override
public void loadDomainRedirect(DomainLink link) {}

View File

@ -27,7 +27,6 @@ public class ConverterModule extends AbstractModule {
bind(Gson.class).toInstance(createGson());
bind(Double.class).annotatedWith(Names.named("min-document-quality")).toInstance(-15.);
bind(Double.class).annotatedWith(Names.named("min-avg-document-quality")).toInstance(-25.);
bind(Integer.class).annotatedWith(Names.named("min-document-length")).toInstance(250);
bind(Integer.class).annotatedWith(Names.named("max-title-length")).toInstance(128);
bind(Integer.class).annotatedWith(Names.named("max-summary-length")).toInstance(255);

View File

@ -8,6 +8,7 @@ import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DocumentKeywor
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DomainLink;
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadProcessedDocument;
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadProcessedDocumentWithError;
import nu.marginalia.wmsa.edge.index.model.EdgePageDocumentsMetadata;
import nu.marginalia.wmsa.edge.model.EdgeDomain;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState;
@ -120,7 +121,8 @@ public class LoadInstructionWriter {
}
@Override
public void loadKeywords(EdgeUrl url, DocumentKeywords[] words) {}
public void loadKeywords(EdgeUrl url, EdgePageDocumentsMetadata metadata, DocumentKeywords words) {
}
@Override
public void loadDomainRedirect(DomainLink link) {}

View File

@ -8,6 +8,8 @@ import okhttp3.Request;
import okhttp3.RequestBody;
import okio.BufferedSink;
import org.jetbrains.annotations.Nullable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.net.URL;
@ -15,9 +17,9 @@ import java.nio.charset.Charset;
import java.sql.SQLException;
import java.util.concurrent.TimeUnit;
import static nu.marginalia.wmsa.edge.index.EdgeIndexService.DYNAMIC_BUCKET_LENGTH;
public class ReindexTriggerMain {
private static final Logger logger = LoggerFactory.getLogger(ReindexTriggerMain.class);
public static void main(String... args) throws IOException, SQLException {
var db = new DatabaseModule();
@ -28,6 +30,7 @@ public class ReindexTriggerMain {
.followRedirects(true)
.build();
logger.info("Updating statistics");
var updateStatistics = new UpdateDomainStatistics(db.provideConnection());
updateStatistics.run();
@ -45,15 +48,10 @@ public class ReindexTriggerMain {
}
};
logger.info("Repartitioning");
client.newCall(new Request.Builder().post(rb).url(new URL("http", args[0], ServiceDescriptor.EDGE_INDEX.port, "/ops/repartition")).build()).execute();
if (!Boolean.getBoolean("no-preconvert")) {
client.newCall(new Request.Builder().post(rb).url(new URL("http", args[0], ServiceDescriptor.EDGE_INDEX.port, "/ops/preconvert")).build()).execute();
}
for (int i = 0; i < DYNAMIC_BUCKET_LENGTH+1; i++) {
client.newCall(new Request.Builder().post(rb).url(new URL("http", args[0], ServiceDescriptor.EDGE_INDEX.port, "/ops/reindex/" + i)).build()).execute();
}
logger.info("Reindexing");
client.newCall(new Request.Builder().post(rb).url(new URL("http", args[0], ServiceDescriptor.EDGE_INDEX.port, "/ops/reindex")).build()).execute();
}
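
The operational flow after this change is: update domain statistics, POST
/ops/repartition, then a single POST /ops/reindex; the optional preconvert
step and the per-bucket reindex loop over DYNAMIC_BUCKET_LENGTH are gone.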

View File

@ -7,8 +7,6 @@ import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadProcessedD
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadProcessedDocumentWithError;
import nu.marginalia.wmsa.edge.converting.model.ProcessedDocument;
import nu.marginalia.wmsa.edge.converting.processor.logic.HtmlFeature;
import nu.marginalia.wmsa.edge.index.model.IndexBlockType;
import nu.marginalia.wmsa.edge.model.crawl.EdgePageWords;
import java.util.List;
@ -41,18 +39,8 @@ public class DocumentsCompiler {
var words = doc.words;
if (words != null) {
var wordsArray = words.values().stream()
.filter(this::filterNonTransients)
.map(DocumentKeywords::new)
.toArray(DocumentKeywords[]::new);
ret.add(new LoadKeywords(doc.url, wordsArray));
ret.add(new LoadKeywords(doc.url, doc.details.metadata, new DocumentKeywords(words)));
}
}
private boolean filterNonTransients(EdgePageWords words) {
return words.block.type != IndexBlockType.TRANSIENT;
}
}

View File

@ -4,6 +4,7 @@ import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DocumentKeywor
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DomainLink;
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadProcessedDocument;
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadProcessedDocumentWithError;
import nu.marginalia.wmsa.edge.index.model.EdgePageDocumentsMetadata;
import nu.marginalia.wmsa.edge.model.EdgeDomain;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState;
@ -18,7 +19,7 @@ public interface Interpreter {
void loadProcessedDocument(LoadProcessedDocument loadProcessedDocument);
void loadProcessedDocumentWithError(LoadProcessedDocumentWithError loadProcessedDocumentWithError);
void loadKeywords(EdgeUrl url, DocumentKeywords[] words);
void loadKeywords(EdgeUrl url, EdgePageDocumentsMetadata metadata, DocumentKeywords words);
void loadDomainRedirect(DomainLink link);
}

View File

@ -1,18 +1,16 @@
package nu.marginalia.wmsa.edge.converting.interpreter.instruction;
import nu.marginalia.wmsa.edge.index.model.EdgePageWordMetadata;
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
import nu.marginalia.wmsa.edge.model.crawl.EdgePageWords;
import java.util.Arrays;
public record DocumentKeywords(IndexBlock block,
public record DocumentKeywords(
String[] keywords,
long[] metadata) {
public DocumentKeywords(EdgePageWords words) {
this(words.block,
words.words.toArray(String[]::new),
this(words.words.toArray(String[]::new),
words.metadata.toArray());
}
@ -20,7 +18,7 @@ public record DocumentKeywords(IndexBlock block,
public String toString() {
StringBuilder sb = new StringBuilder();
sb.append(getClass().getSimpleName());
sb.append('[').append(block).append(", ");
sb.append('[');
for (int i = 0; i < keywords.length; i++) {
sb.append("\n\t ");
if (metadata[i] != 0) {
@ -42,6 +40,6 @@ public record DocumentKeywords(IndexBlock block,
}
public DocumentKeywords subList(int start, int end) {
return new DocumentKeywords(block, Arrays.copyOfRange(keywords, start, end), Arrays.copyOfRange(metadata, start, end));
return new DocumentKeywords(Arrays.copyOfRange(keywords, start, end), Arrays.copyOfRange(metadata, start, end));
}
}

View File

@ -1,12 +1,10 @@
package nu.marginalia.util;
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DocumentKeywords;
package nu.marginalia.wmsa.edge.converting.interpreter.instruction;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
public class ListChunker {
public class KeywordListChunker {
/** Chops data into a list of lists, each at most {@code size} elements long
*

View File

@ -3,20 +3,19 @@ package nu.marginalia.wmsa.edge.converting.interpreter.instruction;
import nu.marginalia.wmsa.edge.converting.interpreter.Instruction;
import nu.marginalia.wmsa.edge.converting.interpreter.InstructionTag;
import nu.marginalia.wmsa.edge.converting.interpreter.Interpreter;
import nu.marginalia.wmsa.edge.index.model.EdgePageDocumentsMetadata;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import java.util.Arrays;
public record LoadKeywords(EdgeUrl url, DocumentKeywords... words) implements Instruction {
public record LoadKeywords(EdgeUrl url, EdgePageDocumentsMetadata metadata, DocumentKeywords words) implements Instruction {
@Override
public void apply(Interpreter interpreter) {
interpreter.loadKeywords(url, words);
interpreter.loadKeywords(url, metadata, words);
}
@Override
public boolean isNoOp() {
return words.length == 0;
return false;
}
@Override
@ -26,7 +25,7 @@ public record LoadKeywords(EdgeUrl url, DocumentKeywords... words) implements In
@Override
public String toString() {
return getClass().getSimpleName()+"["+ Arrays.toString(words)+"]";
return getClass().getSimpleName()+"["+ words+"]";
}
}

View File

@ -5,6 +5,7 @@ import lombok.SneakyThrows;
import nu.marginalia.wmsa.configuration.server.Context;
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DocumentKeywords;
import nu.marginalia.wmsa.edge.index.client.EdgeIndexWriterClient;
import nu.marginalia.wmsa.edge.index.model.EdgePageDocumentsMetadata;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import nu.marginalia.wmsa.edge.model.id.EdgeId;
import org.slf4j.Logger;
@ -19,7 +20,7 @@ public class IndexLoadKeywords implements Runnable {
private final LinkedBlockingQueue<InsertTask> insertQueue = new LinkedBlockingQueue<>(32);
private final EdgeIndexWriterClient client;
private record InsertTask(int urlId, int domainId, DocumentKeywords wordSet) {}
private record InsertTask(int urlId, int domainId, EdgePageDocumentsMetadata metadata, DocumentKeywords wordSet) {}
private final Thread runThread;
private volatile boolean canceled = false;
@ -38,7 +39,7 @@ public class IndexLoadKeywords implements Runnable {
while (!canceled) {
var data = insertQueue.poll(1, TimeUnit.SECONDS);
if (data != null) {
client.putWords(Context.internal(), new EdgeId<>(data.domainId), new EdgeId<>(data.urlId), data.wordSet, index);
client.putWords(Context.internal(), new EdgeId<>(data.domainId), new EdgeId<>(data.urlId), data.metadata(), data.wordSet, index);
}
}
}
@ -48,7 +49,7 @@ public class IndexLoadKeywords implements Runnable {
runThread.join();
}
public void load(LoaderData loaderData, EdgeUrl url, DocumentKeywords[] words) throws InterruptedException {
public void load(LoaderData loaderData, EdgeUrl url, EdgePageDocumentsMetadata metadata, DocumentKeywords words) throws InterruptedException {
int domainId = loaderData.getDomainId(url.domain);
int urlId = loaderData.getUrlId(url);
@ -57,8 +58,6 @@ public class IndexLoadKeywords implements Runnable {
return;
}
for (var ws : words) {
insertQueue.put(new InsertTask(urlId, domainId, ws));
}
insertQueue.put(new InsertTask(urlId, domainId, metadata, words));
}
}
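
IndexLoadKeywords decouples the loader from the index client with a bounded
LinkedBlockingQueue: load() blocks once 32 tasks are in flight, back-pressuring
the loader instead of buffering without limit, while the poll timeout lets the
consumer thread notice cancellation. A stripped-down sketch of the same
pattern (hypothetical task type and names, not repository code):

import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.TimeUnit;

public class BoundedInsertQueueSketch {
    private final LinkedBlockingQueue<String> queue = new LinkedBlockingQueue<>(32);
    private volatile boolean canceled = false;

    void consumerLoop() throws InterruptedException {
        while (!canceled) {
            // poll with a timeout so cancellation is noticed within a second
            String task = queue.poll(1, TimeUnit.SECONDS);
            if (task != null) System.out.println("inserting " + task);
        }
    }

    void submit(String task) throws InterruptedException {
        queue.put(task); // blocks when 32 tasks are already queued
    }

    public static void main(String[] args) throws InterruptedException {
        var q = new BoundedInsertQueueSketch();
        Thread consumer = new Thread(() -> {
            try { q.consumerLoop(); } catch (InterruptedException ignored) { }
        });
        consumer.start();
        q.submit("keywords for url 1");
        Thread.sleep(100); // give the consumer a moment to drain the queue
        q.canceled = true;
        consumer.join();
    }
}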

View File

@ -5,6 +5,7 @@ import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DocumentKeywor
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DomainLink;
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadProcessedDocument;
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadProcessedDocumentWithError;
import nu.marginalia.wmsa.edge.index.model.EdgePageDocumentsMetadata;
import nu.marginalia.wmsa.edge.model.EdgeDomain;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState;
@ -108,8 +109,8 @@ public class Loader implements Interpreter {
}
@Override
public void loadKeywords(EdgeUrl url, DocumentKeywords[] words) {
logger.debug("loadKeywords(#{})", words.length);
public void loadKeywords(EdgeUrl url, EdgePageDocumentsMetadata metadata, DocumentKeywords words) {
logger.debug("loadKeywords()");
// This is a bit of a bandaid safeguard against a bug in
// in the converter, shouldn't be necessary in the future
@ -124,7 +125,7 @@ public class Loader implements Interpreter {
}
try {
indexLoadKeywords.load(data, url, words);
indexLoadKeywords.load(data, url, metadata, words);
} catch (InterruptedException e) {
throw new RuntimeException(e);
}

View File

@ -2,7 +2,7 @@ package nu.marginalia.wmsa.edge.converting.model;
import lombok.ToString;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet;
import nu.marginalia.wmsa.edge.model.crawl.EdgePageWords;
import nu.marginalia.wmsa.edge.model.crawl.EdgeUrlState;
import java.util.OptionalDouble;
@ -12,7 +12,7 @@ public class ProcessedDocument {
public EdgeUrl url;
public ProcessedDocumentDetails details;
public EdgePageWordSet words;
public EdgePageWords words;
public EdgeUrlState state;
public String stateReason;

View File

@ -2,6 +2,7 @@ package nu.marginalia.wmsa.edge.converting.model;
import lombok.ToString;
import nu.marginalia.wmsa.edge.converting.processor.logic.HtmlFeature;
import nu.marginalia.wmsa.edge.index.model.EdgePageDocumentsMetadata;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard;
@ -27,4 +28,6 @@ public class ProcessedDocumentDetails {
public List<EdgeUrl> linksInternal;
public List<EdgeUrl> linksExternal;
public List<EdgeUrl> feedLinks;
public EdgePageDocumentsMetadata metadata;
}

Some files were not shown because too many files have changed in this diff.