2022-11 release (#133)

Co-authored-by: vlofgren <vlofgren@gmail.com>
Co-authored-by: vlofgren <vlofgren@marginalia.nu>
Co-authored-by: Viktor Lofgren <vlofgren@marginalia.nu>
Reviewed-on: https://git.marginalia.nu/marginalia/marginalia.nu/pulls/133

parent 06299cd554
commit 6b44786649

README.md (14 lines changed)
@@ -12,10 +12,16 @@ The canonical git server for this project is [https://git.marginalia.nu](https://git.marginalia.nu)

It is fine to mirror it on other hosts, but if you have issues or questions
git.marginalia.nu is where you want to go.

As it stands now, the project is still being set up and is a bit of a mess, as it
wasn't developed with the intention of going open source: a lot of tests make
assumptions about the directory structure, much configuration is hard coded, and so on.
Please stand by; a lot of the mess is fairly superficial.

## Important note about wmsa.local

This project has a [sister repository called wmsa.local](https://git.marginalia.nu/marginalia/wmsa.local)
that contains scripts and configuration files for running and developing the code.

Without it, development is very unpleasant.

While developing the code, you will want an environment variable WMSA_HOME pointing to
the directory in which wmsa.local is checked out; otherwise the code will not run and
several tests will fail.

## Documentation
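To make the WMSA_HOME requirement above concrete, here is a minimal sketch of how code might resolve it at startup. This is an illustrative fragment only, not the project's actual bootstrap logic.

```java
import java.nio.file.Path;

// Hypothetical example: WMSA_HOME is the variable named in the README above,
// but this class and check are not taken from the repository.
class WmsaHomeExample {
    static Path resolveWmsaHome() {
        String home = System.getenv("WMSA_HOME");
        if (home == null || home.isBlank()) {
            throw new IllegalStateException("WMSA_HOME must point at a checkout of wmsa.local");
        }
        return Path.of(home);
    }
}
```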
build.gradle (12 lines changed)
@@ -56,19 +56,7 @@ test {
    forkEvery = 1
    maxHeapSize = "8G"
    useJUnitPlatform {
        excludeTags "db"
        excludeTags "nobuild"
    }
}

task dbTest(type: Test) {
    maxParallelForks = 1
    forkEvery = 1
    maxHeapSize = "8G"

    useJUnitPlatform {
        includeTags "db"
    }
}
gradle/wrapper/gradle-wrapper.properties (vendored, 2 lines changed)
@@ -1,5 +1,5 @@
distributionBase=GRADLE_USER_HOME
distributionPath=wrapper/dists
distributionUrl=https\://services.gradle.org/distributions/gradle-7.4-bin.zip
distributionUrl=https\://services.gradle.org/distributions/gradle-7.5-bin.zip
zipStoreBase=GRADLE_USER_HOME
zipStorePath=wrapper/dists
gradlew.bat (vendored, 178 lines changed)
@@ -1,89 +1,89 @@
@rem
@rem Copyright 2015 the original author or authors.
@rem
@rem Licensed under the Apache License, Version 2.0 (the "License");
@rem you may not use this file except in compliance with the License.
@rem You may obtain a copy of the License at
@rem
@rem      https://www.apache.org/licenses/LICENSE-2.0
@rem
@rem Unless required by applicable law or agreed to in writing, software
@rem distributed under the License is distributed on an "AS IS" BASIS,
@rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@rem See the License for the specific language governing permissions and
@rem limitations under the License.
@rem

@if "%DEBUG%" == "" @echo off
@rem ##########################################################################
@rem
@rem  Gradle startup script for Windows
@rem
@rem ##########################################################################

@rem Set local scope for the variables with windows NT shell
if "%OS%"=="Windows_NT" setlocal

set DIRNAME=%~dp0
if "%DIRNAME%" == "" set DIRNAME=.
set APP_BASE_NAME=%~n0
set APP_HOME=%DIRNAME%

@rem Resolve any "." and ".." in APP_HOME to make it shorter.
for %%i in ("%APP_HOME%") do set APP_HOME=%%~fi

@rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
set DEFAULT_JVM_OPTS="-Xmx64m" "-Xms64m"

@rem Find java.exe
if defined JAVA_HOME goto findJavaFromJavaHome

set JAVA_EXE=java.exe
%JAVA_EXE% -version >NUL 2>&1
if "%ERRORLEVEL%" == "0" goto execute

echo.
echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
echo.
echo Please set the JAVA_HOME variable in your environment to match the
echo location of your Java installation.

goto fail

:findJavaFromJavaHome
set JAVA_HOME=%JAVA_HOME:"=%
set JAVA_EXE=%JAVA_HOME%/bin/java.exe

if exist "%JAVA_EXE%" goto execute

echo.
echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME%
echo.
echo Please set the JAVA_HOME variable in your environment to match the
echo location of your Java installation.

goto fail

:execute
@rem Setup the command line

set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar


@rem Execute Gradle
"%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %*

:end
@rem End local scope for the variables with windows NT shell
if "%ERRORLEVEL%"=="0" goto mainEnd

:fail
rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of
rem the _cmd.exe /c_ return code!
if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1
exit /b 1

:mainEnd
if "%OS%"=="Windows_NT" endlocal

:omega
@@ -4,6 +4,8 @@ plugins {

    id "me.champeau.jmh" version "0.6.6"
    id "de.undercouch.download" version "5.1.0"

    id 'jvm-test-suite'
}

repositories {
@@ -63,22 +65,20 @@ dependencies {
    implementation 'org.projectlombok:lombok:1.18.24'
    annotationProcessor 'org.projectlombok:lombok:1.18.24'

    implementation 'com.github.jknack:handlebars:4.3.0'
    implementation 'com.github.jknack:handlebars:4.3.1'
    implementation 'com.github.jknack:handlebars-markdown:4.2.1'

    implementation group: 'com.google.code.gson', name: 'gson', version: '2.9.0'
    implementation 'io.reactivex.rxjava3:rxjava:3.1.4'
    implementation 'io.reactivex.rxjava3:rxjava:3.1.5'
    implementation "com.sparkjava:spark-core:2.9.3"
    implementation 'com.opencsv:opencsv:5.6'

    implementation group: 'org.apache.logging.log4j', name: 'log4j-api', version: '2.17.2'
    implementation group: 'org.apache.logging.log4j', name: 'log4j-core', version: '2.17.2'
    implementation group: 'org.apache.logging.log4j', name: 'log4j-slf4j-impl', version: '2.17.2'

    implementation 'org.slf4j:slf4j-api:1.7.36'
    testImplementation 'org.slf4j:slf4j-jdk14:2.0.3'

    implementation 'com.google.guava:guava:31.1-jre'
    implementation 'com.google.inject:guice:5.1.0'
@@ -89,19 +89,19 @@ dependencies {

    implementation group: 'com.h2database', name: 'h2', version: '2.1.210'

    implementation 'org.jsoup:jsoup:1.14.3'
    implementation 'org.jsoup:jsoup:1.15.3'
    implementation group: 'com.github.crawler-commons', name: 'crawler-commons', version: '1.2'

    implementation 'org.mariadb.jdbc:mariadb-java-client:3.0.4'
    implementation 'org.mariadb.jdbc:mariadb-java-client:3.0.6'
    implementation group: 'net.sf.trove4j', name: 'trove4j', version: '3.0.3'

    implementation 'com.zaxxer:HikariCP:5.0.1'

    implementation 'org.apache.opennlp:opennlp-tools:1.9.4'
    implementation 'io.prometheus:simpleclient:0.15.0'
    implementation 'io.prometheus:simpleclient_servlet:0.15.0'
    implementation 'io.prometheus:simpleclient_httpserver:0.15.0'
    implementation 'io.prometheus:simpleclient_hotspot:0.15.0'
    implementation 'io.prometheus:simpleclient:0.16.0'
    implementation 'io.prometheus:simpleclient_servlet:0.16.0'
    implementation 'io.prometheus:simpleclient_httpserver:0.16.0'
    implementation 'io.prometheus:simpleclient_hotspot:0.16.0'
    implementation 'com.fasterxml.jackson.core:jackson-databind:2.13.3'

    implementation group: 'org.yaml', name: 'snakeyaml', version: '1.30'
@@ -114,7 +114,7 @@ dependencies {
    implementation 'org.imgscalr:imgscalr-lib:4.2'
    implementation 'org.jclarion:image4j:0.7'

    implementation 'commons-net:commons-net:3.6'
    implementation 'commons-net:commons-net:3.8.0'
    implementation 'org.eclipse.jgit:org.eclipse.jgit:5.12.0.202106070339-r'
    implementation 'org.eclipse.jgit:org.eclipse.jgit.ssh.jsch:5.12.0.202106070339-r'
    implementation 'com.jcraft:jsch:0.1.55'
@@ -123,12 +123,14 @@ dependencies {
    implementation 'edu.stanford.nlp:stanford-corenlp:4.4.0'

    implementation group: 'it.unimi.dsi', name: 'fastutil', version: '8.5.8'
    implementation 'org.roaringbitmap:RoaringBitmap:0.9.27'
    implementation 'org.roaringbitmap:RoaringBitmap:0.9.32'

    implementation group: 'mysql', name: 'mysql-connector-java', version: '8.0.29'

    implementation 'com.github.Marcono1234:gson-record-type-adapter-factory:0.2.0'

    testImplementation 'org.junit.jupiter:junit-jupiter-api:5.8.2'
    testImplementation 'org.mockito:mockito-junit-jupiter:4.5.1'
    testRuntimeOnly 'org.junit.jupiter:junit-jupiter-engine'
    testCompileOnly 'org.projectlombok:lombok:1.18.24'
    testImplementation 'org.projectlombok:lombok:1.18.24'
@@ -136,23 +138,23 @@ dependencies {

    testImplementation group: 'org.mockito', name: 'mockito-core', version: '4.5.1'

    testImplementation platform('org.testcontainers:testcontainers-bom:1.17.2')
    testImplementation 'org.testcontainers:mariadb:1.17.2'
    testImplementation "org.testcontainers:junit-jupiter:1.17.2"
    testImplementation platform('org.testcontainers:testcontainers-bom:1.17.4')
    testImplementation 'org.testcontainers:mariadb:1.17.4'
    testImplementation 'org.testcontainers:junit-jupiter:1.17.4'

    e2eTestImplementation 'org.junit.jupiter:junit-jupiter-api:5.8.2'
    e2eTestImplementation 'org.junit.jupiter:junit-jupiter-api:5.9.0'
    e2eTestRuntimeOnly 'org.junit.jupiter:junit-jupiter-engine'
    e2eTestImplementation 'org.projectlombok:lombok:1.18.24'
    e2eTestAnnotationProcessor 'org.projectlombok:lombok:1.18.24'
    e2eTestImplementation 'org.testcontainers:nginx:1.17.3'
    e2eTestImplementation 'org.testcontainers:nginx:1.17.4'
    e2eTestImplementation "org.testcontainers:junit-jupiter:1.17.2"
    e2eTestImplementation 'org.testcontainers:selenium:1.17.3'
    e2eTestImplementation 'org.seleniumhq.selenium:selenium-remote-driver:4.2.1'
    e2eTestImplementation 'org.seleniumhq.selenium:selenium-chrome-driver:4.2.1'
    e2eTestImplementation 'org.testcontainers:selenium:1.17.4'
    e2eTestImplementation 'org.seleniumhq.selenium:selenium-remote-driver:4.5.3'
    e2eTestImplementation 'org.seleniumhq.selenium:selenium-chrome-driver:4.5.3'


    implementation 'org.seleniumhq.selenium:selenium-chrome-driver:4.1.4'
    implementation 'org.seleniumhq.selenium:selenium-java:4.3.0'
    implementation 'org.seleniumhq.selenium:selenium-chrome-driver:4.5.3'
    implementation 'org.seleniumhq.selenium:selenium-java:4.5.3'
    implementation 'org.sejda.imageio:webp-imageio:0.1.6'

    jmh 'org.openjdk.jmh:jmh-core:1.35'
@@ -167,23 +169,17 @@ configurations {

}

test {
    maxParallelForks = 16
    forkEvery = 1
    maxParallelForks = Runtime.runtime.availableProcessors().intdiv(2) ?: 1
    maxHeapSize = "8G"
    useJUnitPlatform {
        excludeTags "db"
    }
    useJUnitPlatform()
}

task dbTest(type: Test) {
    maxParallelForks = 1
    forkEvery = 1
task fastTests(type: Test) {
    maxParallelForks = Runtime.runtime.availableProcessors().intdiv(2) ?: 1
    maxHeapSize = "8G"

    useJUnitPlatform {
        includeTags "db"
        excludeTags "slow"
    }
}
@@ -243,9 +239,9 @@ task IP2LocationFile(type: Copy) {
    into outputDir
}

task downloadTermFreqData(type: Copy) {
    // TODO: Need hosting for this file
    from '/var/lib/wmsa/model/tfreq-new-algo3.bin'
    into 'data/models/'
task downloadTermFreqData(type: Download) {
    src 'https://downloads.marginalia.nu/model/tfreq-new-algo3.bin'
    dest file('data/models/tfreq-new-algo3.bin')
    overwrite false
}
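The reworked test configuration above selects tests by JUnit 5 tag: the new fastTests task excludes anything tagged "slow", and the removed dbTest task used to pick up tests tagged "db". A minimal sketch of how a test opts into such a tag follows; the class and method names are hypothetical, not from the repository.

```java
import org.junit.jupiter.api.Tag;
import org.junit.jupiter.api.Test;

class ExampleTaggedTest {

    @Test
    @Tag("slow")   // skipped by the fastTests task above, still run by the plain test task
    void longRunningScenario() {
    }

    @Test
    @Tag("db")     // the tag the removed dbTest task used to include
    void databaseScenario() {
    }
}
```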
@@ -70,4 +70,4 @@ dating dating
EOF

echo "*** Starting $1"
WMSA_HOME=${HOME} java -server -Xmx2G -Dsmall-ram=TRUE -Dservice-host=0.0.0.0 -jar /WMSA.jar start $1
WMSA_HOME=${HOME} java -server -ea -Xmx2G -Dsmall-ram=TRUE -Dservice-host=0.0.0.0 -jar /WMSA.jar start $1
@@ -27,6 +27,7 @@ public class AndCardIntSet {
    public static AndCardIntSet of(RoaringBitmap bmap) {

        TIntArrayList lst = new TIntArrayList(bmap.getCardinality());

        lst.addAll(bmap.toArray());

        return new AndCardIntSet(lst);
@@ -37,7 +38,7 @@ public class AndCardIntSet {
        backingList = list;
        hash = 0;

        if (list.size() < 128) {
        if (list.size() < 32) {
            for (int v : list.toArray()) {
                int bit = hasher.hashInt(v).asInt() % 64;
                hash |= (1L << bit);
@@ -56,7 +57,7 @@ public class AndCardIntSet {
            return false;
        }

        if (backingList.size() < 128) {
        if (backingList.size() < 32) {
            int bit = hasher.hashInt(val).asInt() % 64;
            hash |= (1L << bit);
        }
@@ -81,10 +82,10 @@ public class AndCardIntSet {
        if (!testHash(a,b)) {
            return 0;
        }

        if (a.getCardinality() + b.getCardinality() < 10) {
            return andLinearSmall(a, b);
        }
//
//        if (a.getCardinality() + b.getCardinality() < 10) {
//            return andLinearSmall(a, b);
//        }

        return andLinear(a,b);
    }
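The hunk above tightens the size threshold (128 down to 32) under which AndCardIntSet maintains a 64-bit signature of its elements. As far as can be read from the diff, each element sets one bit of the signature, and testHash can then rule out an intersection cheaply when two signatures share no bits. A self-contained sketch of that reading follows; the hash function is a stand-in, since the actual hasher field is not shown in the hunk.

```java
import com.google.common.hash.HashFunction;
import com.google.common.hash.Hashing;

class SignatureSketch {
    // Stand-in hash function; the hasher AndCardIntSet actually uses is not visible in this diff.
    static final HashFunction hasher = Hashing.murmur3_128();

    static long signatureOf(int[] elements) {
        long hash = 0;
        for (int v : elements) {
            int bit = hasher.hashInt(v).asInt() % 64;  // may be negative; Java masks the shift count to 0..63
            hash |= (1L << bit);
        }
        return hash;
    }

    // Assumed semantics of testHash: if no signature bit is shared, the sets cannot intersect.
    static boolean mayIntersect(long sigA, long sigB) {
        return (sigA & sigB) != 0;
    }
}
```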
@@ -1,80 +0,0 @@
package nu.marginalia.util;

public class ByteFolder {

    public byte[] foldBytes(int p, int q) {

        int pw = bitWidth(p);
        int qw = bitWidth(q);
        int qpw = qw + pw;

        long qp = Integer.toUnsignedLong(q) << pw | Integer.toUnsignedLong(p);

        int qpwBytes = ((qpw - 1) / Byte.SIZE) + 1;

        byte[] bytes = new byte[qpwBytes + 1];
        bytes[0] = (byte) pw;
        for (int i = 1; i < bytes.length; i++) {
            bytes[i] = (byte) (qp >>> (qpwBytes - i) * Byte.SIZE & 0xff);
        }

        return bytes;
    }

    // Function such that (decodeBytes o foldBytes) = identity
    public static int[] decodeBytes(byte[] data) {
        int[] dest = new int[2];
        decodeBytes(data, data.length, dest);
        return dest;
    }

    public static void decodeBytes(byte[] data, int length, int[] dest) {
        long val = 0;

        for (int i = 1; i < length; i++) {
            val = (val << 8) | ((0xFF)&data[i]);
        }

        dest[1] = (int)(val >>> data[0]);
        dest[0] = (int)(val & ~(dest[1]<<data[0]));
    }

    private static int bitWidth(int q) {
        int v = Integer.numberOfLeadingZeros(q);
        if (v == 32) return 1;
        return 32-v;
    }

    public static String byteBits(byte[] b) {
        return byteBits(b, b.length);
    }

    public static String byteBits(byte[] b, int n) {
        StringBuilder s = new StringBuilder();
        for (int j = 0; j < n;j++) {
            if (!s.toString().isBlank()) {
                s.append(":");
            }
            for (int i = 7; i >= 0; i--) {
                s.append((b[j] & (1L << i)) > 0 ? 1 : 0);
            }
        }
        return s.toString();
    }
    public static String intBits(int v) {
        StringBuilder s = new StringBuilder();
        for (int i = 32; i >=0; i--) {
            s.append((v & (1L << i)) > 0 ? 1 : 0);
        }
        return s.toString();
    }
    public static String longBits(long v) {
        StringBuilder s = new StringBuilder();
        for (int i = 64; i >=0; i--) {
            s.append((v & (1L << i)) > 0 ? 1 : 0);
        }
        return s.toString();
    }

}
@@ -6,37 +6,32 @@ import org.slf4j.LoggerFactory;

import java.io.File;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.nio.ByteBuffer;
import java.nio.channels.ByteChannel;
import java.nio.channels.FileChannel;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
import java.util.ArrayList;

/** For managing random writes on SSDs
 *
 * See https://en.wikipedia.org/wiki/Write_amplification
/** For managing random writes on SSDs.
 * Because SSDs do not deal well with random small writes,
 * see https://en.wikipedia.org/wiki/Write_amplification,
 * it is beneficial to pigeonhole the writes first
 * within the same general region
 * */
public class RandomWriteFunnel implements AutoCloseable {

    private static final Logger logger = LoggerFactory.getLogger(RandomWriteFunnel.class);
    private final DataBin[] bins;

    private final ArrayList<DataBin> bins;
    private final Path tempDir;
    private final int binSize;

    public RandomWriteFunnel(Path tempDir, long size, int binSize) throws IOException {
    public RandomWriteFunnel(Path tempDir, int binSize) throws IOException {
        this.binSize = binSize;
        this.tempDir = tempDir;

        if (size > 0) {
            int binCount = (int) (size / binSize + ((size % binSize) != 0L ? 1 : 0));
            bins = new DataBin[binCount];
            for (int i = 0; i < binCount; i++) {
                bins[i] = new DataBin(tempDir, (int)
                        Math.min((size - (long)binSize * i), binSize));
            }
        }
        else {
            bins = new DataBin[0];
        }
        bins = new ArrayList<>();
    }

    @SneakyThrows
@@ -44,10 +39,21 @@ public class RandomWriteFunnel implements AutoCloseable {
        int bin = (int)(address / binSize);
        int offset = (int)(address%binSize);

        bins[bin].put(offset, data);
        if (bin >= bins.size()) {
            grow(bin);
        }

        bins.get(bin).put(offset, data);
    }

    public void write(FileChannel o) throws IOException {
    @SneakyThrows
    private void grow(int bin) {
        while (bins.size() <= bin) {
            bins.add(new DataBin(tempDir, binSize));
        }
    }

    public void write(ByteChannel o) throws IOException {
        ByteBuffer buffer = ByteBuffer.allocateDirect(binSize*8);

        for (var bin : bins) {
@@ -67,7 +73,7 @@ public class RandomWriteFunnel implements AutoCloseable {
            }
        }

    static class DataBin implements AutoCloseable {
    static class DataBin {
        private final ByteBuffer buffer;
        private final int size;
        private final FileChannel channel;
@@ -77,7 +83,7 @@ public class RandomWriteFunnel implements AutoCloseable {
            buffer = ByteBuffer.allocateDirect(360_000);
            this.size = size;
            file = Files.createTempFile(tempDir, "scatter-writer", ".dat").toFile();
            channel = new RandomAccessFile(file, "rw").getChannel();
            channel = (FileChannel) Files.newByteChannel(file.toPath(), StandardOpenOption.WRITE, StandardOpenOption.READ);
        }

        void put(int address, long data) throws IOException {
@@ -133,7 +139,6 @@ public class RandomWriteFunnel implements AutoCloseable {
            }
        }

        @Override
        public void close() throws IOException {
            channel.close();
            file.delete();
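The rewritten comment above describes the pigeonholing strategy: a random write at a large logical address is routed to a bin covering its general region, and the bins are later drained in order so the destination is filled mostly sequentially. A small illustration of the address arithmetic the hunk uses, with made-up values:

```java
long address = 123_456_789L;   // logical position of a single entry
int binSize = 10_000_000;      // how many entries one bin covers

int bin = (int) (address / binSize);      // which region buffers this write
int offset = (int) (address % binSize);   // position of the write within that region
// Each bin accumulates its writes in a temp file; write(ByteChannel) then flushes
// the bins one after another, turning scattered writes into sequential ones.
```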
@@ -0,0 +1,28 @@
package nu.marginalia.util;

import java.util.HashMap;

public class StringPool {
    private final HashMap<String, String> words;

    public StringPool() {
        this.words = new HashMap<>(1000);
    }

    public StringPool(int capacity) {
        words = new HashMap<>(capacity);
    }

    public String internalize(String str) {
        final String ret = words.putIfAbsent(str, str);

        if (null == ret)
            return str;

        return ret;
    }

    public void flush() {
        words.clear();
    }
}
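A usage sketch for the new StringPool (not taken from the commit): repeated strings are folded onto a single instance, which matters when millions of parsed tokens would otherwise each hold their own copy.

```java
StringPool pool = new StringPool(10_000);

String a = pool.internalize(new String("example.com"));
String b = pool.internalize(new String("example.com"));
// a == b: the second call returns the instance stored by the first

pool.flush();   // drop the pooled references, e.g. between processing batches
```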
@@ -0,0 +1,111 @@
package nu.marginalia.util;

import java.util.List;
import java.util.function.BiConsumer;
import java.util.function.Consumer;
import java.util.function.Predicate;

public class TransformList<T> {
    private final List<T> backingList;

    public TransformList(List<T> backingList) {
        this.backingList = backingList;
    }

    public void transformEach(Consumer<Entity> consumer) {
        for (var iter = backingList.listIterator(); iter.hasNext(); ) {
            var entity = new Entity(iter.next());
            consumer.accept(entity);
            if (entity.action == Action.REPLACE) {
                iter.set(entity.value);
            }
            else if (entity.action == Action.REMOVE) {
                iter.remove();
            }
        }
    }

    public void transformEachPair(BiConsumer<Entity, Entity> consumer) {
        for (var iter = backingList.listIterator(); iter.hasNext(); ) {
            var firstEntity = new Entity(iter.next());
            if (!iter.hasNext()) break;
            var secondEntry = new Entity(backingList.get(iter.nextIndex()));

            consumer.accept(firstEntity, secondEntry);
            if (firstEntity.action == Action.REPLACE) {
                iter.set(firstEntity.value);

                if (secondEntry.action == Action.REPLACE) {
                    backingList.set(iter.nextIndex(), secondEntry.value);
                }
                else if (secondEntry.action == Action.REMOVE) {
                    iter.next();
                    iter.remove();
                }
            }
            else if (firstEntity.action == Action.REMOVE) {
                if (secondEntry.action == Action.REPLACE) {
                    backingList.set(iter.nextIndex(), secondEntry.value);
                }

                iter.remove();

                if (secondEntry.action == Action.REMOVE) {
                    iter.next();
                    iter.remove();
                }
            }

        }
    }

    public void scan(Predicate<T> start, Predicate<T> end, Consumer<TransformList<T>> inbetween) {
        for (int i = 0; i < backingList.size(); i++) {
            if (start.test(backingList.get(i))) {
                for (int j = i + 1; j < backingList.size(); j++) {
                    if (end.test(backingList.get(j))) {
                        inbetween.accept(new TransformList<>(backingList.subList(i, j+1)));
                        break;
                    }
                }
            }
        }
    }

    public void scanAndTransform(Predicate<T> start, Predicate<T> end, Consumer<Entity> inbetweenConsumer) {
        scan(start, end, range -> range.transformEach(inbetweenConsumer));
    }

    public int size() {
        return backingList.size();
    }

    public List<T> getBackingList() {
        return backingList;
    }

    public class Entity {
        public T value;
        private Action action;

        Entity(T value) {
            this.value = value;
        }

        public void replace(T newValue) {
            action = Action.REPLACE;
            value = newValue;
        }

        public void remove() {
            action = Action.REMOVE;
        }
    }

    enum Action {
        NO_OP,
        REPLACE,
        REMOVE
    }
}
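A short usage sketch for TransformList, with hypothetical data that is not from the commit: the consumer marks each element for in-place replacement or removal, and transformEach applies the decision through the list iterator.

```java
import java.util.ArrayList;
import java.util.List;

class TransformListExample {
    static List<String> cleanUp(List<String> lines) {
        var transform = new TransformList<>(lines);
        transform.transformEach(entity -> {
            if (entity.value.isBlank() || entity.value.startsWith("drop")) {
                entity.remove();                             // deleted through the list iterator
            } else {
                entity.replace(entity.value.toUpperCase());  // swapped in place
            }
        });
        return transform.getBackingList();
    }
    // cleanUp(new ArrayList<>(List.of("keep", "", "drop-me", "fix me"))) -> ["KEEP", "FIX ME"]
}
```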
@@ -0,0 +1,64 @@
package nu.marginalia.util.array;

import com.upserve.uppend.blobs.NativeIO;
import nu.marginalia.util.array.algo.IntArrayBase;
import nu.marginalia.util.array.algo.IntArraySearch;
import nu.marginalia.util.array.algo.IntArraySort;
import nu.marginalia.util.array.algo.IntArrayTransformations;
import nu.marginalia.util.array.delegate.ShiftedIntArray;
import nu.marginalia.util.array.page.IntArrayPage;
import nu.marginalia.util.array.page.PagingIntArray;
import nu.marginalia.util.array.scheme.ArrayPartitioningScheme;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;

public interface IntArray extends IntArrayBase, IntArrayTransformations, IntArraySearch, IntArraySort {
    int WORD_SIZE = 4;

    ArrayPartitioningScheme DEFAULT_PARTITIONING_SCHEME
            = ArrayPartitioningScheme.forPartitionSize(Integer.getInteger("wmsa.page-size",1<<30) / WORD_SIZE);

    int MAX_CONTINUOUS_SIZE = Integer.MAX_VALUE/WORD_SIZE - 16;

    static IntArray allocate(long size) {
        if (size < MAX_CONTINUOUS_SIZE) {
            return IntArrayPage.onHeap((int) size);
        }

        return PagingIntArray.newOnHeap(DEFAULT_PARTITIONING_SCHEME, size);
    }

    static IntArray mmapRead(Path path) throws IOException {
        long sizeBytes = Files.size(path);

        if (sizeBytes < MAX_CONTINUOUS_SIZE) {
            return IntArrayPage.fromMmapReadOnly(path, 0, (int) sizeBytes / 4);
        }

        return PagingIntArray.mapFileReadOnly(DEFAULT_PARTITIONING_SCHEME, path);
    }

    static IntArray mmapForWriting(Path path) throws IOException {
        return PagingIntArray.mapFileReadWrite(DEFAULT_PARTITIONING_SCHEME, path);
    }

    static IntArray mmapForWriting(Path path, long size) throws IOException {
        return PagingIntArray.mapFileReadWrite(DEFAULT_PARTITIONING_SCHEME, path, size);
    }

    default ShiftedIntArray shifted(long offset) {
        return new ShiftedIntArray(offset, this);
    }
    default ShiftedIntArray range(long start, long end) {
        return new ShiftedIntArray(start, end, this);
    }

    void force();

    void advice(NativeIO.Advice advice) throws IOException;
    void advice(NativeIO.Advice advice, long start, long end) throws IOException;

}
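A brief sketch of how the new IntArray abstraction is meant to be used (illustrative only; the file path and sizes are made up): allocation transparently switches to the paged implementation past the single-page limit, and the same interface fronts both heap and memory-mapped storage.

```java
import java.nio.file.Path;

class IntArrayExample {
    static void demo() throws Exception {
        IntArray counts = IntArray.allocate(1L << 20);  // paged automatically if the size exceeds one page
        counts.fill(0, counts.size(), 0);
        counts.increment(1234);
        counts.set(0, 42);
        int first = counts.get(0);

        counts.write(Path.of("/tmp/counts.dat"));       // hypothetical output path
    }
}
```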
@@ -0,0 +1,63 @@
package nu.marginalia.util.array;

import com.upserve.uppend.blobs.NativeIO;
import nu.marginalia.util.array.algo.LongArrayBase;
import nu.marginalia.util.array.algo.LongArraySearch;
import nu.marginalia.util.array.algo.LongArraySort;
import nu.marginalia.util.array.algo.LongArrayTransformations;
import nu.marginalia.util.array.delegate.ShiftedLongArray;
import nu.marginalia.util.array.page.LongArrayPage;
import nu.marginalia.util.array.page.PagingLongArray;
import nu.marginalia.util.array.scheme.ArrayPartitioningScheme;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;

public interface LongArray extends LongArrayBase, LongArrayTransformations, LongArraySearch, LongArraySort {
    int WORD_SIZE = 8;

    ArrayPartitioningScheme DEFAULT_PARTITIONING_SCHEME
            = ArrayPartitioningScheme.forPartitionSize(Integer.getInteger("wmsa.page-size",1<<30) / WORD_SIZE);

    int MAX_CONTINUOUS_SIZE = Integer.MAX_VALUE/WORD_SIZE - 8;

    static LongArray allocate(long size) {
        if (size < MAX_CONTINUOUS_SIZE) {
            return LongArrayPage.onHeap((int) size);
        }

        return PagingLongArray.newOnHeap(DEFAULT_PARTITIONING_SCHEME, size);
    }

    static LongArray mmapRead(Path path) throws IOException {
        long sizeBytes = Files.size(path);

        if (sizeBytes < MAX_CONTINUOUS_SIZE) {
            return LongArrayPage.fromMmapReadOnly(path, 0, (int) sizeBytes / 8);
        }

        return PagingLongArray.mapFileReadOnly(DEFAULT_PARTITIONING_SCHEME, path);
    }

    static LongArray mmapForWriting(Path path) throws IOException {
        return PagingLongArray.mapFileReadWrite(DEFAULT_PARTITIONING_SCHEME, path);
    }

    static LongArray mmapForWriting(Path path, long size) throws IOException {
        return PagingLongArray.mapFileReadWrite(DEFAULT_PARTITIONING_SCHEME, path, size);
    }

    default ShiftedLongArray shifted(long offset) {
        return new ShiftedLongArray(offset, this);
    }
    default ShiftedLongArray range(long start, long end) {
        return new ShiftedLongArray(start, end, this);
    }

    void force();

    void advice(NativeIO.Advice advice) throws IOException;
    void advice(NativeIO.Advice advice, long start, long end) throws IOException;
}
@@ -0,0 +1,6 @@
package nu.marginalia.util.array.algo;

public interface BulkTransferArray<BufferType> {

    void set(long start, long end, BufferType buffer, int bufferStart);
}
@@ -0,0 +1,69 @@
package nu.marginalia.util.array.algo;

import java.io.IOException;
import java.nio.IntBuffer;
import java.nio.channels.FileChannel;
import java.nio.file.Path;

public interface IntArrayBase extends BulkTransferArray<IntBuffer> {
    int get(long pos);

    void set(long pos, int value);

    long size();

    default void fill(long start, long end, int val) {
        for (long v = start; v < end; v++) {
            set(v, val);
        }
    }

    default void increment(long pos) {
        set(pos, get(pos) + 1);
    }

    default void swap(long pos1, long pos2) {
        int tmp = get(pos1);
        set(pos1, get(pos2));
        set(pos2, tmp);
    }

    default void swapn(int n, long pos1, long pos2) {
        for (int i = 0; i < n; i++) {
            int tmp = get(pos1+i);
            set(pos1+i, get(pos2+i));
            set(pos2+i, tmp);
        }
    }

    default int getAndIncrement(long pos) {
        int val = get(pos);
        set(pos, val + 1);
        return val;
    }

    default void set(long start, long end, IntBuffer buffer, int bufferStart) {
        for (int i = 0; i < (end-start); i++) {
            set(start+i, buffer.get(i + bufferStart));
        }
    }
    default void get(long start, long end, IntBuffer buffer, int bufferStart) {
        for (int i = 0; i < (end-start); i++) {
            buffer.put(i + bufferStart, get(start + i));
        }
    }

    default void get(long start, IntBuffer buffer) {
        get(start, start + buffer.remaining(), buffer, buffer.position());
    }

    default void get(long start, long end, int[] buffer) {
        for (int i = 0; i < (end-start); i++) {
            buffer[i] = get(start + i);
        }
    }

    void write(Path file) throws IOException;

    void transferFrom(FileChannel source, long sourceStart, long arrayStart, long arrayEnd) throws IOException;
}
@ -0,0 +1,126 @@
|
||||
package nu.marginalia.util.array.algo;
|
||||
|
||||
import nu.marginalia.util.array.buffer.IntQueryBuffer;
|
||||
|
||||
import static nu.marginalia.util.array.algo.LongArraySearch.encodeSearchMiss;
|
||||
|
||||
public interface IntArraySearch extends IntArrayBase {
|
||||
|
||||
int LINEAR_SEARCH_CUTOFF = 64;
|
||||
|
||||
default long linearSearch(int key, long fromIndex, long toIndex) {
|
||||
long pos;
|
||||
|
||||
for (pos = fromIndex; pos < toIndex; pos++) {
|
||||
int val = get(pos);
|
||||
|
||||
if (val == key) return pos;
|
||||
if (val > key) break;
|
||||
}
|
||||
|
||||
return encodeSearchMiss(pos - 1);
|
||||
}
|
||||
|
||||
|
||||
|
||||
default long binarySearch(int key, long fromIndex, long toIndex) {
|
||||
long low = 0;
|
||||
long high = (toIndex - fromIndex) - 1;
|
||||
|
||||
while (high - low >= LINEAR_SEARCH_CUTOFF) {
|
||||
long mid = (low + high) >>> 1;
|
||||
long midVal = get(fromIndex + mid);
|
||||
|
||||
if (midVal < key)
|
||||
low = mid + 1;
|
||||
else if (midVal > key)
|
||||
high = mid - 1;
|
||||
else
|
||||
return fromIndex + mid;
|
||||
}
|
||||
return linearSearch(key, fromIndex + low, fromIndex + high + 1);
|
||||
}
|
||||
|
||||
default long binarySearchUpperBound(int key, long fromIndex, long toIndex) {
|
||||
long low = 0;
|
||||
long high = (toIndex - fromIndex) - 1;
|
||||
|
||||
while (high - low >= LINEAR_SEARCH_CUTOFF) {
|
||||
long mid = (low + high) >>> 1;
|
||||
long midVal = get(fromIndex + mid);
|
||||
|
||||
if (midVal < key)
|
||||
low = mid + 1;
|
||||
else if (midVal > key)
|
||||
high = mid - 1;
|
||||
else
|
||||
return fromIndex + mid;
|
||||
}
|
||||
|
||||
for (fromIndex += low; fromIndex < toIndex; fromIndex++) {
|
||||
if (get(fromIndex) >= key) return fromIndex;
|
||||
}
|
||||
|
||||
return toIndex;
|
||||
}
|
||||
|
||||
|
||||
default void retain(IntQueryBuffer buffer, long boundary, long searchStart, long searchEnd) {
|
||||
|
||||
if (searchStart >= searchEnd) return;
|
||||
|
||||
int bv = buffer.currentValue();
|
||||
int av = get(searchStart);
|
||||
long pos = searchStart;
|
||||
|
||||
while (bv <= boundary && buffer.hasMore()) {
|
||||
if (bv < av) {
|
||||
if (!buffer.rejectAndAdvance()) break;
|
||||
bv = buffer.currentValue();
|
||||
continue;
|
||||
}
|
||||
else if (bv == av) {
|
||||
if (!buffer.retainAndAdvance()) break;
|
||||
bv = buffer.currentValue();
|
||||
continue;
|
||||
}
|
||||
|
||||
if (++pos < searchEnd) {
|
||||
av = get(pos);
|
||||
}
|
||||
else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
default void reject(IntQueryBuffer buffer, long boundary, long searchStart, long searchEnd) {
|
||||
|
||||
if (searchStart >= searchEnd) return;
|
||||
|
||||
int bv = buffer.currentValue();
|
||||
int av = get(searchStart);
|
||||
long pos = searchStart;
|
||||
|
||||
while (bv <= boundary && buffer.hasMore()) {
|
||||
if (bv < av) {
|
||||
if (!buffer.retainAndAdvance()) break;
|
||||
bv = buffer.currentValue();
|
||||
continue;
|
||||
}
|
||||
else if (bv == av) {
|
||||
if (!buffer.rejectAndAdvance()) break;
|
||||
bv = buffer.currentValue();
|
||||
continue;
|
||||
}
|
||||
|
||||
if (++pos < searchEnd) {
|
||||
av = get(pos);
|
||||
}
|
||||
else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
}
|
@ -0,0 +1,174 @@
|
||||
package nu.marginalia.util.array.algo;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.IntBuffer;
|
||||
import java.nio.channels.FileChannel;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.StandardOpenOption;
|
||||
|
||||
public interface IntArraySort extends IntArrayBase {
|
||||
|
||||
default boolean isSorted(long start, long end) {
|
||||
if (start == end) return true;
|
||||
|
||||
int val = get(start);
|
||||
for (long i = start + 1; i < end; i++) {
|
||||
int next = get(i);
|
||||
if (next < val)
|
||||
return false;
|
||||
val = next;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
default void sortLargeSpan(SortingContext ctx, long start, long end) throws IOException {
|
||||
long size = end - start;
|
||||
|
||||
if (size < ctx.memorySortLimit()) {
|
||||
quickSort(start, end);
|
||||
}
|
||||
else {
|
||||
mergeSort(start, end, ctx.tempDir());
|
||||
}
|
||||
}
|
||||
|
||||
default boolean isSortedN(int wordSize, long start, long end) {
|
||||
if (start == end) return true;
|
||||
|
||||
int val = get(start);
|
||||
for (long i = start + wordSize; i < end; i+=wordSize) {
|
||||
int next = get(i);
|
||||
if (next < val)
|
||||
return false;
|
||||
val = next;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
|
||||
default void insertionSort(long start, long end) {
|
||||
assert end - start < Integer.MAX_VALUE;
|
||||
|
||||
int n = (int) (end - start);
|
||||
|
||||
if (n <= 1) {
|
||||
return;
|
||||
}
|
||||
|
||||
for (int i = 1; i < n; i++) {
|
||||
int key = get(start + i);
|
||||
|
||||
int j = i - 1;
|
||||
while (j >= 0 && get(start + j) > key) {
|
||||
swap( start + j, start + (long)(j+1));
|
||||
j--;
|
||||
}
|
||||
set(start + j+1, key);
|
||||
}
|
||||
}
|
||||
|
||||
default void quickSort(long start, long end) {
|
||||
if (end - start < 64) {
|
||||
insertionSort(start, end);
|
||||
}
|
||||
else {
|
||||
_quickSortLH(start, end - 1);
|
||||
}
|
||||
}
|
||||
|
||||
default void _quickSortLH(long low, long highInclusive) {
|
||||
|
||||
if (low < 0 || highInclusive < 0 || low >= highInclusive)
|
||||
return;
|
||||
|
||||
if (highInclusive - low < 32) {
|
||||
insertionSort(low, highInclusive + 1);
|
||||
return;
|
||||
}
|
||||
|
||||
long p = _quickSortPartition(low, highInclusive);
|
||||
|
||||
_quickSortLH(low, p);
|
||||
_quickSortLH(p + 1, highInclusive);
|
||||
}
|
||||
|
||||
|
||||
default long _quickSortPartition(long low, long high) {
|
||||
|
||||
long pivotPoint = ((low + high) / (2L));
|
||||
int pivot = get(pivotPoint);
|
||||
|
||||
long i = low - 1;
|
||||
long j = high + 1;
|
||||
|
||||
for (;;) {
|
||||
do {
|
||||
i+=1;
|
||||
} while (get(i) < pivot);
|
||||
|
||||
do {
|
||||
j-=1;
|
||||
}
|
||||
while (get(j) > pivot);
|
||||
|
||||
if (i >= j) return j;
|
||||
else swap(i, j);
|
||||
}
|
||||
}
|
||||
|
||||
default void mergeSort(long start, long end, Path tmpDir) throws IOException {
|
||||
int length = (int) (end - start);
|
||||
|
||||
Path tmpFile = Files.createTempFile(tmpDir,"sort-"+start+"-"+(start+length), ".dat");
|
||||
try (var channel = (FileChannel) Files.newByteChannel(tmpFile, StandardOpenOption.WRITE, StandardOpenOption.READ)) {
|
||||
var workBuffer = channel.map(FileChannel.MapMode.READ_WRITE, 0, 4L * length).asIntBuffer();
|
||||
|
||||
_mergeSort(start, length, workBuffer);
|
||||
}
|
||||
finally {
|
||||
Files.delete(tmpFile);
|
||||
}
|
||||
}
|
||||
|
||||
default void _mergeSort(long start, int length, IntBuffer workBuffer) {
|
||||
int width = Math.min(Integer.highestOneBit(length), 1 << 16);
|
||||
|
||||
// Do in-memory sorting up until internalSortLimit first
|
||||
for (int i = 0; i < length; i += width) {
|
||||
quickSort(start + i, start + i + Math.min(width, length-i));
|
||||
}
|
||||
|
||||
// Then finish with merge sort
|
||||
for (width = 1; width < length; width*=2) {
|
||||
|
||||
for (int i = 0; i < length; i += 2*width) {
|
||||
_merge(start, i, Math.min(i+width, length), Math.min(i+2*width, length), workBuffer);
|
||||
}
|
||||
|
||||
workBuffer.clear();
|
||||
set(start, start + length, workBuffer, 0);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
default void _merge(long offset, int left, int right, int end, IntBuffer workBuffer) {
|
||||
long idxL = left;
|
||||
long idxR = right;
|
||||
|
||||
for (int putPos = left; putPos < end; putPos++) {
|
||||
if (idxL < right && (idxR >= end || get(offset+idxL) < get(offset+idxR))) {
|
||||
workBuffer.put(putPos, get(offset+idxL));
|
||||
idxL++;
|
||||
}
|
||||
else {
|
||||
workBuffer.put(putPos, get(offset+idxR));
|
||||
idxR++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
@ -0,0 +1,40 @@
|
||||
package nu.marginalia.util.array.algo;
|
||||
|
||||
import nu.marginalia.util.array.functional.IntBinaryIOOperation;
|
||||
import nu.marginalia.util.array.functional.IntIOTransformer;
|
||||
import nu.marginalia.util.array.functional.IntTransformer;
|
||||
import nu.marginalia.util.array.functional.LongIntConsumer;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
public interface IntArrayTransformations extends IntArrayBase {
|
||||
|
||||
default void forEach(long start, long end, LongIntConsumer consumer) {
|
||||
for (long i = start; i < end; i++) {
|
||||
consumer.accept(i, get(i));
|
||||
}
|
||||
}
|
||||
|
||||
default void transformEach(long start, long end, IntTransformer transformer) {
|
||||
for (long i = start; i < end; i++) {
|
||||
set(i, transformer.transform(i, get(i)));
|
||||
}
|
||||
}
|
||||
|
||||
default void transformEachIO(long start, long end, IntIOTransformer transformer) throws IOException {
|
||||
for (long i = start; i < end; i++) {
|
||||
set(i, transformer.transform(i, get(i)));
|
||||
}
|
||||
}
|
||||
|
||||
default int foldIO(int zero, long start, long end, IntBinaryIOOperation operator) throws IOException {
|
||||
int accumulator = zero;
|
||||
|
||||
for (long i = start; i < end; i++) {
|
||||
accumulator = operator.apply(accumulator, get(i));
|
||||
}
|
||||
|
||||
return accumulator;
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,69 @@
|
||||
package nu.marginalia.util.array.algo;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.LongBuffer;
|
||||
import java.nio.channels.FileChannel;
|
||||
import java.nio.file.Path;
|
||||
|
||||
public interface LongArrayBase extends BulkTransferArray<LongBuffer> {
|
||||
long get(long pos);
|
||||
|
||||
void set(long pos, long value);
|
||||
|
||||
long size();
|
||||
|
||||
default void fill(long start, long end, long val) {
|
||||
for (long v = start; v < end; v++) {
|
||||
set(v, val);
|
||||
}
|
||||
}
|
||||
|
||||
default void increment(long pos) {
|
||||
set(pos, get(pos) + 1);
|
||||
}
|
||||
|
||||
default void swap(long pos1, long pos2) {
|
||||
long tmp = get(pos1);
|
||||
set(pos1, get(pos2));
|
||||
set(pos2, tmp);
|
||||
}
|
||||
|
||||
default void swapn(int n, long pos1, long pos2) {
|
||||
for (int i = 0; i < n; i++) {
|
||||
long tmp = get(pos1+i);
|
||||
set(pos1+i, get(pos2+i));
|
||||
set(pos2+i, tmp);
|
||||
}
|
||||
}
|
||||
|
||||
default long getAndIncrement(long pos) {
|
||||
long val = get(pos);
|
||||
set(pos, val + 1);
|
||||
return val;
|
||||
}
|
||||
|
||||
default void set(long start, long end, LongBuffer buffer, int bufferStart) {
|
||||
for (int i = 0; i < (end-start); i++) {
|
||||
set(start+i, buffer.get(i + bufferStart));
|
||||
}
|
||||
}
|
||||
default void get(long start, long end, LongBuffer buffer, int bufferStart) {
|
||||
for (int i = 0; i < (end-start); i++) {
|
||||
buffer.put(i + bufferStart, get(start + i));
|
||||
}
|
||||
}
|
||||
|
||||
default void get(long start, LongBuffer buffer) {
|
||||
get(start, start + buffer.remaining(), buffer, buffer.position());
|
||||
}
|
||||
|
||||
default void get(long start, long end, long[] buffer) {
|
||||
for (long i = 0; i < (end-start); i++) {
|
||||
buffer[(int) i] = get(start + i);
|
||||
}
|
||||
}
|
||||
|
||||
void write(Path file) throws IOException;
|
||||
|
||||
void transferFrom(FileChannel source, long sourceStart, long arrayStart, long arrayEnd) throws IOException;
|
||||
}
|
@ -0,0 +1,263 @@
|
||||
package nu.marginalia.util.array.algo;
|
||||
|
||||
import nu.marginalia.util.array.buffer.LongQueryBuffer;
|
||||
|
||||
public interface LongArraySearch extends LongArrayBase {
|
||||
|
||||
int LINEAR_SEARCH_CUTOFF = 32;
|
||||
|
||||
default long linearSearch(long key, long fromIndex, long toIndex) {
|
||||
long pos;
|
||||
|
||||
for (pos = fromIndex; pos < toIndex; pos++) {
|
||||
long val = get(pos);
|
||||
|
||||
if (val == key) return pos;
|
||||
if (val > key) break;
|
||||
}
|
||||
|
||||
return encodeSearchMiss(pos - 1);
|
||||
}
|
||||
|
||||
default long linearSearchUpperBound(long key, long fromIndex, long toIndex) {
|
||||
|
||||
for (long pos = fromIndex; pos < toIndex; pos++) {
|
||||
if (get(pos) >= key) return pos;
|
||||
}
|
||||
|
||||
return toIndex;
|
||||
}
|
||||
|
||||
default long linearSearchN(int sz, long key, long fromIndex, long toIndex) {
|
||||
long pos;
|
||||
|
||||
for (pos = fromIndex; pos < toIndex; pos+=sz) {
|
||||
long val = get(pos);
|
||||
|
||||
if (val == key) return pos;
|
||||
if (val > key) return encodeSearchMiss(pos);
|
||||
}
|
||||
|
||||
return encodeSearchMiss(toIndex - sz);
|
||||
}
|
||||
|
||||
default long binarySearch(long key, long fromIndex, long toIndex) {
|
||||
long low = 0;
|
||||
long high = (toIndex - fromIndex) - 1;
|
||||
|
||||
while (high - low >= LINEAR_SEARCH_CUTOFF) {
|
||||
long mid = (low + high) >>> 1;
|
||||
long midVal = get(fromIndex + mid);
|
||||
|
||||
if (midVal < key)
|
||||
low = mid + 1;
|
||||
else if (midVal > key)
|
||||
high = mid - 1;
|
||||
else
|
||||
return fromIndex + mid;
|
||||
}
|
||||
|
||||
return linearSearch(key, fromIndex + low, fromIndex + high + 1);
|
||||
}
|
||||
|
||||
default long binarySearchN(int sz, long key, long fromIndex, long toIndex) {
|
||||
long low = 0;
|
||||
long high = (toIndex - fromIndex)/sz - 1;
|
||||
|
||||
while (high - low >= LINEAR_SEARCH_CUTOFF) {
|
||||
long mid = (low + high) >>> 1;
|
||||
long midVal = get(fromIndex + sz*mid);
|
||||
|
||||
if (midVal < key)
|
||||
low = mid + 1;
|
||||
else if (midVal > key)
|
||||
high = mid - 1;
|
||||
else
|
||||
return fromIndex + sz*mid;
|
||||
}
|
||||
|
||||
for (fromIndex += low*sz; fromIndex < toIndex; fromIndex+=sz) {
|
||||
long val = get(fromIndex);
|
||||
|
||||
if (val == key) return fromIndex;
|
||||
if (val > key) return encodeSearchMiss(fromIndex);
|
||||
}
|
||||
|
||||
return encodeSearchMiss(toIndex - sz);
|
||||
}
|
||||
|
||||
|
||||
default long binarySearchUpperBound(long key, long fromIndex, long toIndex) {
|
||||
long low = 0;
|
||||
long high = (toIndex - fromIndex) - 1;
|
||||
|
||||
while (high - low >= LINEAR_SEARCH_CUTOFF) {
|
||||
long mid = (low + high) >>> 1;
|
||||
long midVal = get(fromIndex + mid);
|
||||
|
||||
if (midVal < key)
|
||||
low = mid + 1;
|
||||
else if (midVal > key)
|
||||
high = mid - 1;
|
||||
else
|
||||
return fromIndex + mid;
|
||||
}
|
||||
|
||||
for (fromIndex += low; fromIndex < toIndex; fromIndex++) {
|
||||
if (get(fromIndex) >= key) return fromIndex;
|
||||
}
|
||||
|
||||
return toIndex;
|
||||
}
|
||||
|
||||
default long binarySearchUpperBoundN(int sz, long key, long fromIndex, long toIndex) {
|
||||
long low = 0;
|
||||
long high = (toIndex - fromIndex)/sz - 1;
|
||||
|
||||
while (high - low >= LINEAR_SEARCH_CUTOFF) {
|
||||
long mid = (low + high) >>> 1;
|
||||
long midVal = get(fromIndex + sz*mid);
|
||||
|
||||
if (midVal < key)
|
||||
low = mid + 1;
|
||||
else if (midVal > key)
|
||||
high = mid - 1;
|
||||
else
|
||||
return fromIndex + sz*mid;
|
||||
}
|
||||
|
||||
for (fromIndex += low; fromIndex < toIndex; fromIndex+=sz) {
|
||||
if (get(fromIndex) >= key) return fromIndex;
|
||||
}
|
||||
|
||||
return toIndex;
|
||||
}
|
||||
|
||||
default void retain(LongQueryBuffer buffer, long boundary, long searchStart, long searchEnd) {
|
||||
|
||||
if (searchStart >= searchEnd) return;
|
||||
|
||||
long bv = buffer.currentValue();
|
||||
long av = get(searchStart);
|
||||
long pos = searchStart;
|
||||
|
||||
while (bv <= boundary && buffer.hasMore()) {
|
||||
if (bv < av) {
|
||||
if (!buffer.rejectAndAdvance()) break;
|
||||
bv = buffer.currentValue();
|
||||
continue;
|
||||
}
|
||||
else if (bv == av) {
|
||||
if (!buffer.retainAndAdvance()) break;
|
||||
bv = buffer.currentValue();
|
||||
continue;
|
||||
}
|
||||
|
||||
if (++pos < searchEnd) {
|
||||
av = get(pos);
|
||||
}
|
||||
else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
default void retainN(LongQueryBuffer buffer, int sz, long boundary, long searchStart, long searchEnd) {
|
||||
|
||||
if (searchStart >= searchEnd) return;
|
||||
|
||||
long bv = buffer.currentValue();
|
||||
long av = get(searchStart);
|
||||
long pos = searchStart;
|
||||
|
||||
while (bv <= boundary && buffer.hasMore()) {
|
||||
if (bv < av) {
|
||||
if (!buffer.rejectAndAdvance()) break;
|
||||
bv = buffer.currentValue();
|
||||
continue;
|
||||
}
|
||||
else if (bv == av) {
|
||||
if (!buffer.retainAndAdvance()) break;
|
||||
bv = buffer.currentValue();
|
||||
continue;
|
||||
}
|
||||
|
||||
pos += sz;
|
||||
|
||||
if (pos < searchEnd) {
|
||||
av = get(pos);
|
||||
}
|
||||
else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
default void reject(LongQueryBuffer buffer, long boundary, long searchStart, long searchEnd) {
|
||||
|
||||
if (searchStart >= searchEnd) return;
|
||||
|
||||
long bv = buffer.currentValue();
|
||||
long av = get(searchStart);
|
||||
long pos = searchStart;
|
||||
|
||||
while (bv <= boundary && buffer.hasMore()) {
|
||||
if (bv < av) {
|
||||
if (!buffer.retainAndAdvance()) break;
|
||||
bv = buffer.currentValue();
|
||||
continue;
|
||||
}
|
||||
else if (bv == av) {
|
||||
if (!buffer.rejectAndAdvance()) break;
|
||||
bv = buffer.currentValue();
|
||||
continue;
|
||||
}
|
||||
|
||||
if (++pos < searchEnd) {
|
||||
av = get(pos);
|
||||
}
|
||||
else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
default void rejectN(LongQueryBuffer buffer, int sz, long boundary, long searchStart, long searchEnd) {
|
||||
|
||||
if (searchStart >= searchEnd) return;
|
||||
|
||||
long bv = buffer.currentValue();
|
||||
long av = get(searchStart);
|
||||
long pos = searchStart;
|
||||
|
||||
while (bv <= boundary && buffer.hasMore()) {
|
||||
if (bv < av) {
|
||||
if (!buffer.retainAndAdvance()) break;
|
||||
bv = buffer.currentValue();
|
||||
continue;
|
||||
}
|
||||
else if (bv == av) {
|
||||
if (!buffer.rejectAndAdvance()) break;
|
||||
bv = buffer.currentValue();
|
||||
continue;
|
||||
}
|
||||
|
||||
pos += sz;
|
||||
if (pos < searchEnd) {
|
||||
av = get(pos);
|
||||
}
|
||||
else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
static long encodeSearchMiss(long value) {
|
||||
return -1 - value;
|
||||
}
|
||||
|
||||
static long decodeSearchMiss(long value) {
|
||||
return -value - 1;
|
||||
}
|
||||
}
|
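The search methods above return the matching position on a hit and an encoded negative value on a miss, in the same spirit as Arrays.binarySearch returning -(insertionPoint) - 1; encodeSearchMiss and decodeSearchMiss are inverses of each other. A small illustration, with assumed array contents:

```java
// `array` is assumed to be a sorted LongArray; the key 57 is arbitrary.
long result = array.binarySearch(57, 0, array.size());
if (result < 0) {
    // 57 was not present; recover the position of the last entry below the key
    long nearby = LongArraySearch.decodeSearchMiss(result);
}
// decodeSearchMiss(encodeSearchMiss(x)) == x, since both map v to -1 - v
```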
@ -0,0 +1,325 @@
|
||||
package nu.marginalia.util.array.algo;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.LongBuffer;
import java.nio.channels.FileChannel;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;

public interface LongArraySort extends LongArrayBase {

    default boolean isSorted(long start, long end) {
        if (start == end) return true;

        long val = get(start);
        for (long i = start + 1; i < end; i++) {
            long next = get(i);
            if (next < val)
                return false;
            val = next;
        }

        return true;
    }

    default void sortLargeSpan(SortingContext ctx, long start, long end) throws IOException {
        long size = end - start;

        if (size < ctx.memorySortLimit()) {
            quickSort(start, end);
        }
        else {
            mergeSort(start, end, ctx.tempDir());
        }
    }

    default void sortLargeSpanN(SortingContext ctx, int sz, long start, long end) throws IOException {
        if (sz == 1) {
            sortLargeSpan(ctx, start, end);
            return;
        }

        long size = end - start;

        if (size < ctx.memorySortLimit()) {
            quickSortN(sz, start, end);
        }
        else {
            mergeSortN(sz, start, end, ctx.tempDir());
        }
    }

    default boolean isSortedN(int wordSize, long start, long end) {
        if (start == end) return true;

        long val = get(start);
        for (long i = start + wordSize; i < end; i += wordSize) {
            long next = get(i);
            if (next < val)
                return false;
            val = next;
        }

        return true;
    }

    default void insertionSort(long start, long end) {
        assert end - start < Integer.MAX_VALUE;

        int n = (int) (end - start);

        if (n <= 1) {
            return;
        }

        for (int i = 1; i < n; i++) {
            long key = get(start + i);

            int j = i - 1;
            while (j >= 0 && get(start + j) > key) {
                swap(start + j, start + (long)(j+1));
                j--;
            }
            set(start + j+1, key);
        }
    }

    default void insertionSortN(int sz, long start, long end) {
        assert end - start < Integer.MAX_VALUE;

        int span = (int) (end - start);

        assert (span % sz) == 0;

        if (span <= sz) {
            return;
        }

        for (int i = 1; i < span / sz; i++) {
            long key = get(start + (long) i * sz);

            int j = i - 1;
            while (j >= 0 && get(start + (long)sz*j) > key) {
                swapn(sz, start + (long)sz*j, start + (long)sz*(j+1));
                j--;
            }
            set(start + (long) (j+1) * sz, key);
        }
    }

    default void quickSort(long start, long end) {
        if (end - start < 64) {
            insertionSort(start, end);
        }
        else {
            _quickSortLH(start, end - 1);
        }
    }

    default void quickSortN(int wordSize, long start, long end) {
        assert ((end - start) % wordSize) == 0;

        if (end == start)
            return;

        _quickSortLHN(wordSize, start, end - wordSize);
    }

    default void _quickSortLHN(int wordSize, long low, long highInclusive) {
        if (low < 0 || highInclusive < 0 || low >= highInclusive)
            return;

        if (highInclusive - low < 32L*wordSize) {
            insertionSortN(wordSize, low, highInclusive + wordSize);
            return;
        }

        long p = _quickSortPartitionN(wordSize, low, highInclusive);

        _quickSortLHN(wordSize, low, p);
        _quickSortLHN(wordSize, p + wordSize, highInclusive);
    }

    default void _quickSortLH(long low, long highInclusive) {

        if (low < 0 || highInclusive < 0 || low >= highInclusive)
            return;

        if (highInclusive - low < 32) {
            insertionSort(low, highInclusive + 1);
            return;
        }

        long p = _quickSortPartition(low, highInclusive);

        _quickSortLH(low, p);
        _quickSortLH(p + 1, highInclusive);
    }

    default long _quickSortPartition(long low, long high) {

        long pivotPoint = ((low + high) / (2L));
        long pivot = get(pivotPoint);

        long i = low - 1;
        long j = high + 1;

        for (;;) {
            do {
                i += 1;
            } while (get(i) < pivot);

            do {
                j -= 1;
            }
            while (get(j) > pivot);

            if (i >= j) return j;
            else swap(i, j);
        }
    }

    default long _quickSortPartitionN(int wordSize, long low, long high) {

        long pivotPoint = ((low + high) / (2L*wordSize)) * wordSize;
        long pivot = get(pivotPoint);

        long i = low - wordSize;
        long j = high + wordSize;

        for (;;) {
            do {
                i += wordSize;
            }
            while (get(i) < pivot);

            do {
                j -= wordSize;
            }
            while (get(j) > pivot);

            if (i >= j) return j;
            else swapn(wordSize, i, j);
        }
    }

    default void _mergeSortN(int wordSize, long start, int length, LongBuffer workBuffer) throws IOException {
        int width = Math.min(Integer.highestOneBit(length), Integer.highestOneBit(workBuffer.capacity()));

        // Do in-memory sorting up until internalSortLimit first
        for (int i = 0; i < length; i += width) {
            quickSortN(wordSize, start + i, start + i + Math.min(width, length-i));
        }

        // Then finish with merge sort
        for (; width < length; width*=2) {

            for (int i = 0; i < length; i += 2*width) {
                _mergeN(wordSize, start, i, Math.min(i+width, length), Math.min(i+2*width, length), workBuffer);
            }

            workBuffer.clear();
            set(start, start + length, workBuffer, 0);
        }

    }

    default void mergeSortN(int wordSize, long start, long end, Path tmpDir) throws IOException {
        int length = (int) (end - start);
        assert (length % wordSize) == 0;

        Path tmpFile = Files.createTempFile(tmpDir,"sort-"+start+"-"+(start+length), ".dat");
        try (var channel = (FileChannel) Files.newByteChannel(tmpFile, StandardOpenOption.WRITE, StandardOpenOption.READ)) {
            var workBuffer = channel.map(FileChannel.MapMode.READ_WRITE, 0, 8L * length).asLongBuffer();

            _mergeSortN(wordSize, start, length, workBuffer);
        }
        finally {
            Files.delete(tmpFile);
        }
    }

    default void mergeSort(long start, long end, Path tmpDir) throws IOException {
        int length = (int) (end - start);

        Path tmpFile = Files.createTempFile(tmpDir,"sort-"+start+"-"+(start+length), ".dat");
        try (var channel = (FileChannel) Files.newByteChannel(tmpFile, StandardOpenOption.WRITE, StandardOpenOption.READ)) {
            var workBuffer = channel.map(FileChannel.MapMode.READ_WRITE, 0, 8L * length).asLongBuffer();

            _mergeSort(start, length, workBuffer);
        }
        finally {
            Files.delete(tmpFile);
        }

    }

    default void _mergeSort(long start, int length, LongBuffer workBuffer) {
        int width = Math.min(Integer.highestOneBit(length), 1 << 16);

        // Do in-memory sorting up until internalSortLimit first
        for (int i = 0; i < length; i += width) {
            quickSort(start + i, start + i + Math.min(width, length-i));
        }

        // Then finish with merge sort
        for (width = 1; width < length; width*=2) {

            for (int i = 0; i < length; i += 2*width) {
                _merge(start, i, Math.min(i+width, length), Math.min(i+2*width, length), workBuffer);
            }

            workBuffer.clear();
            set(start, start + length, workBuffer, 0);
        }

    }

    default void _mergeN(int wordSize, long offset, int left, int right, int end, LongBuffer workBuffer) {
        long idxL = left;
        long idxR = right;

        for (int putPos = left; putPos < end; putPos += wordSize) {

            if (idxL < right && (idxR >= end || get(offset+idxL) < get(offset+idxR))) {
                workBuffer.put(putPos, get(offset+idxL));
                for (int s = 1; s < wordSize; s++) {
                    workBuffer.put(putPos + s, get(offset + idxL + s));
                }
                idxL += wordSize;
            }
            else {
                workBuffer.put(putPos, get(offset+idxR));
                for (int s = 1; s < wordSize; s++) {
                    workBuffer.put(putPos + s, get(offset + idxR + s));
                }
                idxR += wordSize;
            }
        }
    }

    default void _merge(long offset, int left, int right, int end, LongBuffer workBuffer) {
        long idxL = left;
        long idxR = right;

        for (int putPos = left; putPos < end; putPos++) {
            if (idxL < right && (idxR >= end || get(offset+idxL) < get(offset+idxR))) {
                workBuffer.put(putPos, get(offset+idxL));
                idxL++;
            }
            else {
                workBuffer.put(putPos, get(offset+idxR));
                idxR++;
            }
        }
    }
}
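The sort methods above follow a two-phase strategy: spans below the in-memory limit are quicksorted (falling back to insertion sort for very short ranges), while larger spans are merge sorted against a temporary memory-mapped work file, first quicksorting fixed-width blocks and then merging them bottom-up with doubling run width. A minimal standalone sketch of that block-quicksort-then-merge idea, written against a plain long[] rather than the LongArray interface (all names here are illustrative, not part of the commit):

    import java.util.Arrays;

    public class BlockMergeSortSketch {
        public static void sort(long[] data, int blockSize) {
            int n = data.length;
            long[] work = new long[n];

            // Phase 1: sort each block independently in memory
            for (int i = 0; i < n; i += blockSize) {
                Arrays.sort(data, i, Math.min(i + blockSize, n));
            }

            // Phase 2: merge adjacent sorted runs, doubling the run width each pass
            for (int width = blockSize; width < n; width *= 2) {
                for (int i = 0; i < n; i += 2 * width) {
                    merge(data, i, Math.min(i + width, n), Math.min(i + 2 * width, n), work);
                }
                System.arraycopy(work, 0, data, 0, n);
            }
        }

        private static void merge(long[] data, int left, int right, int end, long[] work) {
            int l = left, r = right;
            for (int put = left; put < end; put++) {
                if (l < right && (r >= end || data[l] <= data[r])) work[put] = data[l++];
                else work[put] = data[r++];
            }
        }
    }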
@@ -0,0 +1,40 @@
package nu.marginalia.util.array.algo;

import nu.marginalia.util.array.functional.LongBinaryIOOperation;
import nu.marginalia.util.array.functional.LongIOTransformer;
import nu.marginalia.util.array.functional.LongLongConsumer;
import nu.marginalia.util.array.functional.LongTransformer;

import java.io.IOException;

public interface LongArrayTransformations extends LongArrayBase {

    default void forEach(long start, long end, LongLongConsumer consumer) {
        for (long i = start; i < end; i++) {
            consumer.accept(i, get(i));
        }
    }

    default void transformEach(long start, long end, LongTransformer transformer) {
        for (long i = start; i < end; i++) {
            set(i, transformer.transform(i, get(i)));
        }
    }

    default void transformEachIO(long start, long end, LongIOTransformer transformer) throws IOException {
        for (long i = start; i < end; i++) {
            set(i, transformer.transform(i, get(i)));
        }
    }

    default long foldIO(long zero, long start, long end, LongBinaryIOOperation operator) throws IOException {
        long accumulator = zero;

        for (long i = start; i < end; i++) {
            accumulator = operator.apply(accumulator, get(i));
        }

        return accumulator;
    }

}
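A minimal usage sketch of the transformations above, assuming `array` is some implementation of this interface that also exposes size() through the base interface (variable names are illustrative):

    // Sum every element in [0, size); foldIO may throw IOException for file-backed arrays.
    long sum = array.foldIO(0L, 0, array.size(), (acc, value) -> acc + value);

    // Negate each element in place; transformEach has no checked exceptions.
    array.transformEach(0, array.size(), (pos, old) -> -old);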
@@ -0,0 +1,6 @@
package nu.marginalia.util.array.algo;

import java.nio.file.Path;

public record SortingContext(Path tempDir, int memorySortLimit) {
}
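SortingContext bundles the two knobs the sort routines need: where to put the temporary merge file, and how large a span may get before the in-memory quicksort gives way to the external merge sort. A hedged usage sketch (the directory and limit values are illustrative, not project defaults, and `array` stands for any LongArraySort implementation):

    SortingContext ctx = new SortingContext(Path.of("/tmp/sort-work"), 1 << 26);

    // Spans shorter than memorySortLimit() are quicksorted in memory;
    // longer spans use the file-backed merge sort in tempDir().
    array.sortLargeSpan(ctx, 0, array.size());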
@@ -0,0 +1,112 @@
package nu.marginalia.util.array.buffer;

import java.util.Arrays;

public class IntQueryBuffer {
    public final int[] data;
    public int end;

    private int read = 0;
    private int write = 0;

    public IntQueryBuffer(int size) {
        this.data = new int[size];
        this.end = size;
    }

    public IntQueryBuffer(int [] data, int size) {
        this.data = data;
        this.end = size;
    }

    public int[] copyData() {
        return Arrays.copyOf(data, end);
    }

    public boolean isEmpty() {
        return end == 0;
    }

    public int size() {
        return end;
    }

    public int currentValue() {
        return data[read];
    }

    public boolean rejectAndAdvance() {
        return ++read < end;
    }

    public boolean retainAndAdvance() {
        if (read != write) {
            int tmp = data[write];
            data[write] = data[read];
            data[read] = tmp;
        }

        write++;

        return ++read < end;
    }

    public boolean hasMore() {
        return read < end;
    }

    public void finalizeFiltering() {
        end = write;
        read = 0;
        write = 0;
    }

    public void startFilterForRange(int pos, int end) {
        read = write = pos;
        this.end = end;
    }

    public void reset() {
        end = data.length;
        read = 0;
        write = 0;
    }

    public void zero() {
        end = 0;
        read = 0;
        write = 0;
        Arrays.fill(data, 0);
    }

    public void uniq() {
        if (end <= 1) return;

        int prev = currentValue();
        retainAndAdvance();

        while (hasMore()) {

            int val = currentValue();

            if (prev == val) {
                rejectAndAdvance();
            } else {
                retainAndAdvance();
                prev = val;
            }

        }

        finalizeFiltering();
    }

    public String toString() {
        return getClass().getSimpleName() + "[" +
                "read = " + read +
                ",write = " + write +
                ",end = " + end +
                ",data = [" + Arrays.toString(Arrays.copyOf(data, end)) + "]]";
    }

}
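IntQueryBuffer implements an in-place filtering protocol: a read cursor walks the data, retained values are swapped down to a write cursor, and finalizeFiltering() truncates the buffer to the retained prefix. An illustrative pass that keeps only even values (the input data is made up, not from the commit):

    IntQueryBuffer buffer = new IntQueryBuffer(new int[] { 3, 4, 7, 8, 10 }, 5);

    while (buffer.hasMore()) {
        if (buffer.currentValue() % 2 == 0) {
            buffer.retainAndAdvance();   // keep: swap into the retained prefix
        } else {
            buffer.rejectAndAdvance();   // drop: just move the read cursor
        }
    }
    buffer.finalizeFiltering();

    // buffer.copyData() now returns { 4, 8, 10 }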
@ -1,62 +1,32 @@
|
||||
package nu.marginalia.util.btree;
|
||||
package nu.marginalia.util.array.buffer;
|
||||
|
||||
import java.util.Arrays;
|
||||
|
||||
public class BTreeQueryBuffer {
|
||||
public class LongQueryBuffer {
|
||||
public final long[] data;
|
||||
public int end;
|
||||
|
||||
private int read = 0;
|
||||
private int write = 0;
|
||||
|
||||
public BTreeQueryBuffer(int size) {
|
||||
public LongQueryBuffer(int size) {
|
||||
this.data = new long[size];
|
||||
this.end = size;
|
||||
}
|
||||
|
||||
public BTreeQueryBuffer(long [] data, int size) {
|
||||
public LongQueryBuffer(long [] data, int size) {
|
||||
this.data = data;
|
||||
this.end = size;
|
||||
}
|
||||
|
||||
private BTreeQueryBuffer(long [] data) {
|
||||
this.data = data;
|
||||
this.end = data.length;
|
||||
}
|
||||
|
||||
public BTreeQueryBuffer[] split(int... splitPoints) {
|
||||
BTreeQueryBuffer[] ret = new BTreeQueryBuffer[splitPoints.length+1];
|
||||
|
||||
ret[0] = new BTreeQueryBuffer(Arrays.copyOfRange(data, 0, splitPoints[0]));
|
||||
for (int i = 1; i < splitPoints.length; i++) {
|
||||
ret[i] = new BTreeQueryBuffer(Arrays.copyOfRange(data, splitPoints[i-1], splitPoints[i]));
|
||||
}
|
||||
ret[ret.length-1] = new BTreeQueryBuffer(Arrays.copyOfRange(data, splitPoints[splitPoints.length-1], end));
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
public void gather(BTreeQueryBuffer... buffers) {
|
||||
int start = 0;
|
||||
|
||||
for (var buffer : buffers) {
|
||||
System.arraycopy(buffer.data, 0, data, start, buffer.end);
|
||||
start += buffer.end;
|
||||
}
|
||||
|
||||
this.read = 0;
|
||||
this.write = 0;
|
||||
this.end = start;
|
||||
public boolean hasRetainedData() {
|
||||
return write > 0;
|
||||
}
|
||||
|
||||
public long[] copyData() {
|
||||
return Arrays.copyOf(data, end);
|
||||
}
|
||||
|
||||
public void retainAll() {
|
||||
read = write = end;
|
||||
}
|
||||
|
||||
public boolean isEmpty() {
|
||||
return end == 0;
|
||||
}
|
@ -0,0 +1,58 @@
|
||||
package nu.marginalia.util.array.delegate;
|
||||
|
||||
import com.upserve.uppend.blobs.NativeIO;
|
||||
import nu.marginalia.util.array.IntArray;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.channels.FileChannel;
|
||||
import java.nio.file.Path;
|
||||
|
||||
public class ReferenceImplIntArrayDelegate implements IntArray {
|
||||
|
||||
private final IntArray delegate;
|
||||
|
||||
public ReferenceImplIntArrayDelegate(IntArray delegate) {
|
||||
this.delegate = delegate;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int get(long pos) {
|
||||
return delegate.get(pos);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void set(long pos, int value) {
|
||||
delegate.set(pos, value);
|
||||
}
|
||||
|
||||
@Override
|
||||
public long size() {
|
||||
return delegate.size();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void write(Path file) throws IOException {
|
||||
delegate.write(file);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void transferFrom(FileChannel source, long sourceStart, long arrayStart, long arrayEnd) throws IOException {
|
||||
delegate.transferFrom(source, sourceStart, arrayStart, arrayEnd);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void force() {
|
||||
delegate.force();
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public void advice(NativeIO.Advice advice) throws IOException {
|
||||
delegate.advice(advice);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void advice(NativeIO.Advice advice, long start, long end) throws IOException {
|
||||
delegate.advice(advice, start, end);
|
||||
}
|
||||
}
|
@ -0,0 +1,58 @@
|
||||
package nu.marginalia.util.array.delegate;
|
||||
|
||||
import com.upserve.uppend.blobs.NativeIO;
|
||||
import nu.marginalia.util.array.LongArray;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.channels.FileChannel;
|
||||
import java.nio.file.Path;
|
||||
|
||||
public class ReferenceImplLongArrayDelegate implements LongArray {
|
||||
|
||||
private final LongArray delegate;
|
||||
|
||||
public ReferenceImplLongArrayDelegate(LongArray delegate) {
|
||||
this.delegate = delegate;
|
||||
}
|
||||
|
||||
@Override
|
||||
public long get(long pos) {
|
||||
return delegate.get(pos);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void set(long pos, long value) {
|
||||
delegate.set(pos, value);
|
||||
}
|
||||
|
||||
@Override
|
||||
public long size() {
|
||||
return delegate.size();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void write(Path file) throws IOException {
|
||||
delegate.write(file);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void transferFrom(FileChannel source, long sourceStart, long arrayStart, long arrayEnd) throws IOException {
|
||||
delegate.transferFrom(source, sourceStart, arrayStart, arrayEnd);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void force() {
|
||||
delegate.force();
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public void advice(NativeIO.Advice advice) throws IOException {
|
||||
delegate.advice(advice);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void advice(NativeIO.Advice advice, long start, long end) throws IOException {
|
||||
delegate.advice(advice, start, end);
|
||||
}
|
||||
}
|
@ -0,0 +1,199 @@
|
||||
package nu.marginalia.util.array.delegate;
|
||||
|
||||
import com.upserve.uppend.blobs.NativeIO;
|
||||
import nu.marginalia.util.array.IntArray;
|
||||
import nu.marginalia.util.array.buffer.IntQueryBuffer;
|
||||
import nu.marginalia.util.array.functional.IntBinaryIOOperation;
|
||||
import nu.marginalia.util.array.functional.IntIOTransformer;
|
||||
import nu.marginalia.util.array.functional.IntTransformer;
|
||||
import nu.marginalia.util.array.functional.LongIntConsumer;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.IntBuffer;
|
||||
import java.nio.channels.FileChannel;
|
||||
import java.nio.file.Path;
|
||||
|
||||
public class ShiftedIntArray implements IntArray {
|
||||
public final long shift;
|
||||
public final long size;
|
||||
|
||||
private final IntArray delegate;
|
||||
|
||||
public ShiftedIntArray(long shift, IntArray delegate) {
|
||||
this.shift = shift;
|
||||
this.size = delegate.size() - shift;
|
||||
this.delegate = delegate;
|
||||
}
|
||||
|
||||
public ShiftedIntArray(long start, long end, IntArray delegate) {
|
||||
this.shift = start;
|
||||
this.size = end - start;
|
||||
this.delegate = delegate;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int get(long pos) {
|
||||
return delegate.get(pos+shift);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void set(long pos, int value) {
|
||||
delegate.set(pos+shift, value);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void set(long start, long end, IntBuffer buffer, int bufferStart) {
|
||||
delegate.set(shift + start, shift + end, buffer, bufferStart);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void get(long start, long end, IntBuffer buffer, int bufferStart) {
|
||||
delegate.get(shift + start, shift + end, buffer, bufferStart);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void get(long start, IntBuffer buffer) {
|
||||
delegate.get(shift + start, buffer);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void get(long start, long end, int[] buffer) {
|
||||
delegate.get(shift+start, shift+end, buffer);
|
||||
}
|
||||
|
||||
@Override
|
||||
public long size() {
|
||||
return size;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void write(Path file) throws IOException {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
@Override
|
||||
public ShiftedIntArray shifted(long offset) {
|
||||
return new ShiftedIntArray(shift+offset, delegate);
|
||||
}
|
||||
|
||||
@Override
|
||||
public ShiftedIntArray range(long start, long end) {
|
||||
return new ShiftedIntArray(shift + start, shift+end, delegate);
|
||||
}
|
||||
|
||||
public int[] toArray() {
|
||||
int[] ret = new int[(int) size];
|
||||
for (int i = 0; i < size; i++) {
|
||||
ret[i] = delegate.get(shift + i);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
public boolean isSorted() {
|
||||
return isSorted(0, size);
|
||||
}
|
||||
|
||||
public boolean isSorted(long start, long end) {
|
||||
return delegate.isSorted(shift + start, shift + end);
|
||||
}
|
||||
|
||||
public long search(int key) {
|
||||
if (size < 128) {
|
||||
return linearSearch(key);
|
||||
}
|
||||
else {
|
||||
return binarySearch(key);
|
||||
}
|
||||
}
|
||||
|
||||
public long linearSearch(int key) {
|
||||
return linearSearch(key, 0, size);
|
||||
}
|
||||
|
||||
public long binarySearch(int key) {
|
||||
return binarySearch(key, 0, size);
|
||||
}
|
||||
|
||||
public long binarySearchUpperbound(int key) {
|
||||
return binarySearchUpperBound(key, 0, size);
|
||||
}
|
||||
|
||||
public void retain(IntQueryBuffer buffer, long boundary) {
|
||||
retain(buffer, boundary, 0, size);
|
||||
}
|
||||
|
||||
public void reject(IntQueryBuffer buffer, long boundary) {
|
||||
reject(buffer, boundary, 0, size);
|
||||
}
|
||||
|
||||
@Override
|
||||
public long linearSearch(int key, long fromIndex, long toIndex) {
|
||||
return translateSearchResult(delegate.linearSearch(key, fromIndex + shift, toIndex+shift));
|
||||
}
|
||||
|
||||
@Override
|
||||
public long binarySearch(int key, long fromIndex, long toIndex) {
|
||||
return translateSearchResult(delegate.binarySearch(key, fromIndex + shift, toIndex+shift));
|
||||
}
|
||||
|
||||
@Override
|
||||
public long binarySearchUpperBound(int key, long fromIndex, long toIndex) {
|
||||
return translateSearchResult(delegate.binarySearchUpperBound(key, fromIndex + shift, toIndex+shift));
|
||||
}
|
||||
|
||||
private long translateSearchResult(long ret) {
|
||||
if (ret > 0) return ret - shift;
|
||||
return ret + shift;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void retain(IntQueryBuffer buffer, long boundary, long searchStart, long searchEnd) {
|
||||
delegate.retain(buffer, boundary, searchStart + shift, searchEnd + shift);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void reject(IntQueryBuffer buffer, long boundary, long searchStart, long searchEnd) {
|
||||
delegate.reject(buffer, boundary, searchStart + shift, searchEnd + shift);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void forEach(long start, long end, LongIntConsumer consumer) {
|
||||
delegate.forEach(start + shift, end+shift, (pos, old) -> consumer.accept(pos-shift, old));
|
||||
}
|
||||
|
||||
@Override
|
||||
public void transformEach(long start, long end, IntTransformer transformer) {
|
||||
delegate.transformEach(start + shift, end+shift, (pos, old) -> transformer.transform(pos-shift, old));
|
||||
}
|
||||
|
||||
@Override
|
||||
public void transformEachIO(long start, long end, IntIOTransformer transformer) throws IOException {
|
||||
delegate.transformEachIO(start + shift, end+shift, (pos, old) -> transformer.transform(pos-shift, old));
|
||||
}
|
||||
|
||||
@Override
|
||||
public int foldIO(int zero, long start, long end, IntBinaryIOOperation operator) throws IOException {
|
||||
return delegate.foldIO(zero, start + shift, end+shift, operator);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void transferFrom(FileChannel source, long sourceStart, long arrayStart, long arrayEnd) throws IOException {
|
||||
delegate.transferFrom(source, sourceStart, shift + arrayStart, shift + arrayEnd);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void force() {
|
||||
delegate.force();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void advice(NativeIO.Advice advice) throws IOException {
|
||||
delegate.advice(advice, shift, shift + size());
|
||||
}
|
||||
|
||||
@Override
|
||||
public void advice(NativeIO.Advice advice, long start, long end) throws IOException {
|
||||
delegate.advice(advice, start + shift, end + shift);
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,255 @@
|
||||
package nu.marginalia.util.array.delegate;
|
||||
|
||||
import com.upserve.uppend.blobs.NativeIO;
|
||||
import nu.marginalia.util.array.LongArray;
|
||||
import nu.marginalia.util.array.algo.LongArraySearch;
|
||||
import nu.marginalia.util.array.buffer.LongQueryBuffer;
|
||||
import nu.marginalia.util.array.functional.LongBinaryIOOperation;
|
||||
import nu.marginalia.util.array.functional.LongIOTransformer;
|
||||
import nu.marginalia.util.array.functional.LongLongConsumer;
|
||||
import nu.marginalia.util.array.functional.LongTransformer;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.LongBuffer;
|
||||
import java.nio.channels.FileChannel;
|
||||
import java.nio.file.Path;
|
||||
|
||||
public class ShiftedLongArray implements LongArray {
|
||||
public final long shift;
|
||||
public final long size;
|
||||
private final LongArray delegate;
|
||||
|
||||
public ShiftedLongArray(long shift, LongArray delegate) {
|
||||
this.shift = shift;
|
||||
this.size = delegate.size() - shift;
|
||||
this.delegate = delegate;
|
||||
}
|
||||
|
||||
public ShiftedLongArray(long start, long end, LongArray delegate) {
|
||||
this.shift = start;
|
||||
this.size = end - start;
|
||||
this.delegate = delegate;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public long get(long pos) {
|
||||
return delegate.get(pos+shift);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void set(long pos, long value) {
|
||||
delegate.set(pos+shift, value);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void set(long start, long end, LongBuffer buffer, int bufferStart) {
|
||||
delegate.set(shift + start, shift + end, buffer, bufferStart);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void get(long start, long end, LongBuffer buffer, int bufferStart) {
|
||||
delegate.get(shift + start, shift + end, buffer, bufferStart);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void get(long start, LongBuffer buffer) {
|
||||
delegate.get(shift + start, buffer);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void get(long start, long end, long[] buffer) {
|
||||
delegate.get(shift+start, shift+end, buffer);
|
||||
}
|
||||
|
||||
@Override
|
||||
public long size() {
|
||||
return size;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void write(Path file) throws IOException {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
@Override
|
||||
public ShiftedLongArray shifted(long offset) {
|
||||
return new ShiftedLongArray(shift+offset, delegate);
|
||||
}
|
||||
|
||||
@Override
|
||||
public ShiftedLongArray range(long start, long end) {
|
||||
return new ShiftedLongArray(shift + start, shift+end, delegate);
|
||||
}
|
||||
|
||||
public long[] toArray() {
|
||||
long[] ret = new long[(int) size];
|
||||
for (int i = 0; i < size; i++) {
|
||||
ret[i] = delegate.get(shift + i);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
public boolean isSorted() {
|
||||
return isSorted(0, size);
|
||||
}
|
||||
|
||||
public boolean isSortedN(int sz) {
|
||||
return isSortedN(sz, 0, size);
|
||||
}
|
||||
|
||||
public boolean isSorted(long start, long end) {
|
||||
return delegate.isSorted(shift + start, shift + end);
|
||||
}
|
||||
|
||||
public boolean isSortedN(int sz, long start, long end) {
|
||||
return delegate.isSortedN(sz, shift + start, shift + end);
|
||||
}
|
||||
|
||||
public long searchN(int sz, long key) {
|
||||
if (size < 128) {
|
||||
return linearSearchN(sz, key);
|
||||
}
|
||||
else {
|
||||
return binarySearchN(sz, key);
|
||||
}
|
||||
}
|
||||
|
||||
public long search(long key) {
|
||||
if (size < 128) {
|
||||
return linearSearch(key);
|
||||
}
|
||||
else {
|
||||
return binarySearch(key);
|
||||
}
|
||||
}
|
||||
|
||||
public long linearSearch(long key) {
|
||||
return linearSearch(key, 0, size);
|
||||
}
|
||||
|
||||
public long binarySearch(long key) {
|
||||
return binarySearch(key, 0, size);
|
||||
}
|
||||
|
||||
public long binarySearchN(int sz, long key) {
|
||||
return binarySearchN(sz, key, 0, size);
|
||||
}
|
||||
|
||||
public long linearSearchN(int sz, long key) {
|
||||
return linearSearchN(sz, key, 0, size);
|
||||
}
|
||||
|
||||
public void retain(LongQueryBuffer buffer, long boundary) {
|
||||
retain(buffer, boundary, 0, size);
|
||||
}
|
||||
public void retainN(LongQueryBuffer buffer, int sz, long boundary) {
|
||||
if (sz == 1)
|
||||
retain(buffer, boundary, 0, size);
|
||||
else
|
||||
retainN(buffer, sz, boundary, 0, size);
|
||||
}
|
||||
|
||||
public void reject(LongQueryBuffer buffer, long boundary) {
|
||||
reject(buffer, boundary, 0, size);
|
||||
}
|
||||
|
||||
public void rejectN(LongQueryBuffer buffer, int sz, long boundary) {
|
||||
if (sz == 1)
|
||||
reject(buffer, boundary, 0, size);
|
||||
else
|
||||
rejectN(buffer, sz, boundary, 0, size);
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public long linearSearch(long key, long fromIndex, long toIndex) {
|
||||
return translateSearchResult(delegate.linearSearch(key, fromIndex + shift, toIndex+shift));
|
||||
}
|
||||
@Override
|
||||
public long linearSearchN(int sz, long key, long fromIndex, long toIndex) {
|
||||
return translateSearchResult(delegate.linearSearch(key, fromIndex + shift, toIndex+shift));
|
||||
}
|
||||
@Override
|
||||
public long binarySearch(long key, long fromIndex, long toIndex) {
|
||||
return translateSearchResult(delegate.binarySearch(key, fromIndex + shift, toIndex+shift));
|
||||
}
|
||||
@Override
|
||||
public long binarySearchN(int sz, long key, long fromIndex, long toIndex) {
|
||||
return translateSearchResult(delegate.binarySearchN(sz, key, fromIndex + shift, toIndex+shift));
|
||||
}
|
||||
@Override
|
||||
public long binarySearchUpperBound(long key, long fromIndex, long toIndex) {
|
||||
return translateSearchResult(delegate.binarySearchUpperBound(key, fromIndex + shift, toIndex+shift));
|
||||
}
|
||||
@Override
|
||||
public long linearSearchUpperBound(long key, long fromIndex, long toIndex) {
|
||||
return translateSearchResult(delegate.linearSearchUpperBound(key, fromIndex + shift, toIndex+shift));
|
||||
}
|
||||
@Override
|
||||
public long binarySearchUpperBoundN(int sz, long key, long fromIndex, long toIndex) {
|
||||
return translateSearchResult(delegate.binarySearchUpperBoundN(sz, key, fromIndex + shift, toIndex+shift));
|
||||
}
|
||||
private long translateSearchResult(long delegatedIdx) {
|
||||
long ret;
|
||||
|
||||
if (delegatedIdx >= 0) ret = delegatedIdx - shift;
|
||||
else ret = LongArraySearch.encodeSearchMiss(Math.max(0, LongArraySearch.decodeSearchMiss(delegatedIdx) - shift));
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
public void retain(LongQueryBuffer buffer, long boundary, long searchStart, long searchEnd) {
|
||||
delegate.retain(buffer, boundary, searchStart + shift, searchEnd + shift);
|
||||
}
|
||||
public void retainN(LongQueryBuffer buffer, int sz, long boundary, long searchStart, long searchEnd) {
|
||||
delegate.retainN(buffer, sz, boundary, searchStart + shift, searchEnd + shift);
|
||||
}
|
||||
public void reject(LongQueryBuffer buffer, long boundary, long searchStart, long searchEnd) {
|
||||
delegate.reject(buffer, boundary, searchStart + shift, searchEnd + shift);
|
||||
}
|
||||
public void rejectN(LongQueryBuffer buffer, int sz, long boundary, long searchStart, long searchEnd) {
|
||||
delegate.rejectN(buffer, sz, boundary, searchStart + shift, searchEnd + shift);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void forEach(long start, long end, LongLongConsumer consumer) {
|
||||
delegate.forEach(start + shift, end+shift, (pos, old) -> consumer.accept(pos-shift, old));
|
||||
}
|
||||
|
||||
@Override
|
||||
public void transformEach(long start, long end, LongTransformer transformer) {
|
||||
delegate.transformEach(start + shift, end+shift, (pos, old) -> transformer.transform(pos-shift, old));
|
||||
}
|
||||
|
||||
@Override
|
||||
public void transformEachIO(long start, long end, LongIOTransformer transformer) throws IOException {
|
||||
delegate.transformEachIO(start + shift, end+shift, (pos, old) -> transformer.transform(pos-shift, old));
|
||||
}
|
||||
|
||||
@Override
|
||||
public long foldIO(long zero, long start, long end, LongBinaryIOOperation operator) throws IOException {
|
||||
return delegate.foldIO(zero, start + shift, end+shift, operator);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void transferFrom(FileChannel source, long sourceStart, long arrayStart, long arrayEnd) throws IOException {
|
||||
delegate.transferFrom(source, sourceStart, shift + arrayStart, shift + arrayEnd);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void force() {
|
||||
delegate.force();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void advice(NativeIO.Advice advice) throws IOException {
|
||||
delegate.advice(advice, shift, shift + size());
|
||||
}
|
||||
|
||||
@Override
|
||||
public void advice(NativeIO.Advice advice, long start, long end) throws IOException {
|
||||
delegate.advice(advice, start + shift, end + shift);
|
||||
}
|
||||
|
||||
}
|
@@ -0,0 +1,5 @@
package nu.marginalia.util.array.functional;

public interface AddressRangeCall<T> {
    void apply(T array, int start, int end);
}
@@ -0,0 +1,7 @@
package nu.marginalia.util.array.functional;

import java.io.IOException;

public interface AddressRangeCallIO<T> {
    void apply(T array, int start, int end) throws IOException;
}
@@ -0,0 +1,5 @@
package nu.marginalia.util.array.functional;

public interface AddressRangeIntFunction<T> {
    int apply(T array, int start, int end);
}
@@ -0,0 +1,5 @@
package nu.marginalia.util.array.functional;

public interface AddressRangeLongFunction<T> {
    long apply(T array, int start, int end);
}
@@ -0,0 +1,7 @@
package nu.marginalia.util.array.functional;

import java.io.IOException;

public interface IntBinaryIOOperation {
    int apply(int left, int right) throws IOException;
}
@@ -0,0 +1,7 @@
package nu.marginalia.util.array.functional;

import java.io.IOException;

public interface IntIOTransformer {
    int transform(long pos, int old) throws IOException;
}
@@ -0,0 +1,5 @@
package nu.marginalia.util.array.functional;

public interface IntTransformer {
    int transform(long pos, int old);
}
@@ -0,0 +1,7 @@
package nu.marginalia.util.array.functional;

import java.io.IOException;

public interface LongBinaryIOOperation {
    long apply(long left, long right) throws IOException;
}
@@ -0,0 +1,7 @@
package nu.marginalia.util.array.functional;

import java.io.IOException;

public interface LongIOTransformer {
    long transform(long pos, long old) throws IOException;
}
@@ -0,0 +1,5 @@
package nu.marginalia.util.array.functional;

public interface LongIntConsumer {
    void accept(long pos, int val);
}
@@ -0,0 +1,5 @@
package nu.marginalia.util.array.functional;

public interface LongLongConsumer {
    void accept(long pos, long val);
}
@@ -0,0 +1,5 @@
package nu.marginalia.util.array.functional;

public interface LongTransformer {
    long transform(long pos, long old);
}
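These single-method interfaces are lambda targets; the *IO variants exist only so a callback may throw the checked IOException that file-backed arrays can raise. An illustrative sketch (both `array` and `loadFromDisk` are hypothetical, not identifiers from the commit):

    LongTransformer negate = (pos, old) -> -old;
    LongIOTransformer readThrough = (pos, old) -> old == 0 ? loadFromDisk(pos) : old;  // loadFromDisk is hypothetical

    array.transformEach(0, array.size(), negate);          // no checked exceptions
    array.transformEachIO(0, array.size(), readThrough);   // may throw IOException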
@@ -0,0 +1,21 @@
package nu.marginalia.util.array.functor;

import nu.marginalia.util.array.functional.AddressRangeCallIO;
import nu.marginalia.util.array.functional.IntBinaryIOOperation;
import nu.marginalia.util.array.page.IntArrayPage;

import java.io.IOException;

public class IntIOFolder implements AddressRangeCallIO<IntArrayPage> {
    public int acc;
    private final IntBinaryIOOperation operator;

    public IntIOFolder(int zero, IntBinaryIOOperation operator) {
        this.acc = zero;
        this.operator = operator;
    }

    public void apply(IntArrayPage array, int start, int end) throws IOException {
        acc = array.foldIO(acc, start, end, operator);
    }
}
@@ -0,0 +1,21 @@
package nu.marginalia.util.array.functor;

import nu.marginalia.util.array.functional.AddressRangeCallIO;
import nu.marginalia.util.array.functional.LongBinaryIOOperation;
import nu.marginalia.util.array.page.LongArrayPage;

import java.io.IOException;

public class LongIOFolder implements AddressRangeCallIO<LongArrayPage> {
    public long acc;
    private final LongBinaryIOOperation operator;

    public LongIOFolder(long zero, LongBinaryIOOperation operator) {
        this.acc = zero;
        this.operator = operator;
    }

    public void apply(LongArrayPage array, int start, int end) throws IOException {
        acc = array.foldIO(acc, start, end, operator);
    }
}
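The folders carry their accumulator across successive page-range calls, which is what lets a fold be delegated one page at a time (cf. delegateToEachPageIO in AbstractPagingArray below). A speculative usage sketch, with hypothetical page variables:

    LongIOFolder folder = new LongIOFolder(0L, Long::sum);

    folder.apply(pageA, 0, pageASize);   // pageA/pageB and their sizes are hypothetical LongArrayPage instances
    folder.apply(pageB, 0, pageBSize);

    long total = folder.acc;             // sum over both ranges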
@ -0,0 +1,88 @@
|
||||
package nu.marginalia.util.array.page;
|
||||
|
||||
import nu.marginalia.util.array.algo.BulkTransferArray;
|
||||
import nu.marginalia.util.array.functional.AddressRangeCall;
|
||||
import nu.marginalia.util.array.functional.AddressRangeCallIO;
|
||||
import nu.marginalia.util.array.scheme.ArrayPartitioningScheme;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import static nu.marginalia.util.array.algo.LongArraySearch.decodeSearchMiss;
|
||||
import static nu.marginalia.util.array.algo.LongArraySearch.encodeSearchMiss;
|
||||
|
||||
public class AbstractPagingArray<T extends BulkTransferArray<B>, B> {
|
||||
final T[] pages;
|
||||
final long size;
|
||||
final ArrayPartitioningScheme partitioningScheme;
|
||||
|
||||
public AbstractPagingArray(ArrayPartitioningScheme partitioningScheme, T[] pages, long size) {
|
||||
this.partitioningScheme = partitioningScheme;
|
||||
this.pages = pages;
|
||||
this.size = size;
|
||||
}
|
||||
|
||||
void delegateToEachPage(long start, long end, AddressRangeCall<T> fn) {
|
||||
assert end >= start;
|
||||
|
||||
int page = partitioningScheme.getPage(start);
|
||||
|
||||
long endPos;
|
||||
|
||||
for (long pos = start; pos < end; pos = endPos) {
|
||||
endPos = partitioningScheme.getPageEnd(pos, end);
|
||||
|
||||
int sOff = partitioningScheme.getOffset(pos);
|
||||
int eOff = partitioningScheme.getEndOffset(start, endPos);
|
||||
|
||||
fn.apply(pages[page++], sOff, eOff);
|
||||
}
|
||||
}
|
||||
|
||||
void delegateToEachPageIO(long start, long end, AddressRangeCallIO<T> fn) throws IOException {
|
||||
assert end >= start;
|
||||
|
||||
int page = partitioningScheme.getPage(start);
|
||||
|
||||
long endPos;
|
||||
|
||||
for (long pos = start; pos < end; pos = endPos) {
|
||||
endPos = partitioningScheme.getPageEnd(pos, end);
|
||||
|
||||
int sOff = partitioningScheme.getOffset(pos);
|
||||
int eOff = partitioningScheme.getEndOffset(start, endPos);
|
||||
|
||||
fn.apply(pages[page++], sOff, eOff);
|
||||
}
|
||||
}
|
||||
|
||||
long translateSearchResultsFromPage(long fromIndex, long ret) {
|
||||
int page = partitioningScheme.getPage(fromIndex);
|
||||
|
||||
if (ret >= 0) {
|
||||
return partitioningScheme.toRealIndex(page, (int) ret);
|
||||
} else {
|
||||
ret = decodeSearchMiss(ret);
|
||||
ret = partitioningScheme.toRealIndex(page, (int) ret);
|
||||
return encodeSearchMiss(ret);
|
||||
}
|
||||
}
|
||||
|
||||
public void set(long start, long end, B buffer, int bufferStart) {
|
||||
assert end >= start;
|
||||
|
||||
int page = partitioningScheme.getPage(start);
|
||||
|
||||
long endPos;
|
||||
|
||||
for (long pos = start; pos < end; pos = endPos) {
|
||||
endPos = partitioningScheme.getPageEnd(pos, end);
|
||||
|
||||
int sOff = partitioningScheme.getOffset(pos);
|
||||
int eOff = partitioningScheme.getEndOffset(start, endPos);
|
||||
|
||||
pages[page++].set(sOff, eOff, buffer, bufferStart);
|
||||
|
||||
bufferStart += eOff - sOff;
|
||||
}
|
||||
}
|
||||
}
|
@ -0,0 +1,120 @@
|
||||
package nu.marginalia.util.array.page;
|
||||
|
||||
import com.upserve.uppend.blobs.NativeIO;
|
||||
import nu.marginalia.util.array.IntArray;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.IntBuffer;
|
||||
import java.nio.MappedByteBuffer;
|
||||
import java.nio.channels.FileChannel;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.OpenOption;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.StandardOpenOption;
|
||||
|
||||
public class IntArrayPage implements PartitionPage, IntArray {
|
||||
|
||||
final IntBuffer intBuffer;
|
||||
final ByteBuffer byteBuffer;
|
||||
|
||||
private IntArrayPage(ByteBuffer byteBuffer) {
|
||||
this.byteBuffer = byteBuffer;
|
||||
this.intBuffer = byteBuffer.asIntBuffer();
|
||||
}
|
||||
|
||||
public static IntArrayPage onHeap(int size) {
|
||||
return new IntArrayPage(ByteBuffer.allocateDirect(WORD_SIZE*size));
|
||||
}
|
||||
|
||||
public static IntArrayPage fromMmapReadOnly(Path file, long offset, int size) throws IOException {
|
||||
return new IntArrayPage(mmapFile(file, offset, size, FileChannel.MapMode.READ_ONLY, StandardOpenOption.READ));
|
||||
}
|
||||
|
||||
public static IntArrayPage fromMmapReadWrite(Path file, long offset, int size) throws IOException {
|
||||
return new IntArrayPage(mmapFile(file, offset, size, FileChannel.MapMode.READ_WRITE, StandardOpenOption.READ, StandardOpenOption.WRITE, StandardOpenOption.CREATE));
|
||||
}
|
||||
|
||||
private static ByteBuffer mmapFile(Path file, long offset, int size, FileChannel.MapMode mode, OpenOption... openOptions) throws IOException {
|
||||
try (var channel = (FileChannel) Files.newByteChannel(file, openOptions)) {
|
||||
return channel.map(mode, WORD_SIZE*offset, (long) size*WORD_SIZE);
|
||||
}
|
||||
catch (IOException ex) {
|
||||
throw new IOException("Failed to map file " + file + " (" + offset + ":" + size + ")", ex);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public int get(long at) {
|
||||
return intBuffer.get((int) at);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void get(long start, long end, int[] buffer) {
|
||||
intBuffer.get((int) start, buffer, 0, (int) (end - start));
|
||||
}
|
||||
|
||||
@Override
|
||||
public void set(long at, int val) {
|
||||
intBuffer.put((int) at, val);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void set(long start, long end, IntBuffer buffer, int bufferStart) {
|
||||
intBuffer.put((int) start, buffer, bufferStart, (int) (end-start));
|
||||
}
|
||||
|
||||
@Override
|
||||
public long size() {
|
||||
return intBuffer.capacity();
|
||||
}
|
||||
|
||||
public void increment(int at) {
|
||||
set(at, get(at) + 1);
|
||||
}
|
||||
|
||||
@Override
|
||||
public ByteBuffer getByteBuffer() {
|
||||
return byteBuffer;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void write(Path filename) throws IOException {
|
||||
try (var channel = (FileChannel) Files.newByteChannel(filename, StandardOpenOption.WRITE, StandardOpenOption.CREATE)) {
|
||||
write(channel);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void force() {
|
||||
if (byteBuffer instanceof MappedByteBuffer mb) {
|
||||
mb.force();
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void transferFrom(FileChannel source, long sourceStart, long arrayStart, long arrayEnd) throws IOException {
|
||||
|
||||
int index = (int) (arrayStart * WORD_SIZE);
|
||||
int length = (int) ((arrayEnd - arrayStart) * WORD_SIZE);
|
||||
|
||||
var slice = byteBuffer.slice(index, length);
|
||||
|
||||
long startPos = sourceStart * WORD_SIZE;
|
||||
while (slice.position() < slice.capacity()) {
|
||||
source.read(slice, startPos + slice.position());
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void advice(NativeIO.Advice advice) throws IOException {
|
||||
NativeIO.madvise((MappedByteBuffer) byteBuffer, advice);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void advice(NativeIO.Advice advice, long start, long end) throws IOException {
|
||||
NativeIO.madviseRange((MappedByteBuffer) byteBuffer, advice, (int) start, (int) (end-start));
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,135 @@
|
||||
package nu.marginalia.util.array.page;
|
||||
|
||||
import com.upserve.uppend.blobs.NativeIO;
|
||||
import nu.marginalia.util.array.LongArray;
|
||||
import nu.marginalia.util.array.trace.ArrayTrace;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.LongBuffer;
|
||||
import java.nio.MappedByteBuffer;
|
||||
import java.nio.channels.FileChannel;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.OpenOption;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.StandardOpenOption;
|
||||
|
||||
public class LongArrayPage implements PartitionPage, LongArray {
|
||||
|
||||
final ArrayTrace trace = ArrayTrace.get(this);
|
||||
|
||||
final LongBuffer longBuffer;
|
||||
final ByteBuffer byteBuffer;
|
||||
|
||||
private LongArrayPage(ByteBuffer byteBuffer) {
|
||||
this.byteBuffer = byteBuffer;
|
||||
this.longBuffer = byteBuffer.asLongBuffer();
|
||||
}
|
||||
|
||||
public static LongArrayPage onHeap(int size) {
|
||||
return new LongArrayPage(ByteBuffer.allocateDirect(WORD_SIZE*size));
|
||||
}
|
||||
|
||||
public static LongArrayPage fromMmapReadOnly(Path file, long offset, int size) throws IOException {
|
||||
return new LongArrayPage(mmapFile(file, offset, size, FileChannel.MapMode.READ_ONLY, StandardOpenOption.READ));
|
||||
}
|
||||
|
||||
public static LongArrayPage fromMmapReadWrite(Path file, long offset, int size) throws IOException {
|
||||
return new LongArrayPage(mmapFile(file, offset, size, FileChannel.MapMode.READ_WRITE, StandardOpenOption.READ, StandardOpenOption.WRITE, StandardOpenOption.CREATE));
|
||||
}
|
||||
|
||||
private static ByteBuffer mmapFile(Path file, long offset, int size, FileChannel.MapMode mode, OpenOption... openOptions) throws IOException {
|
||||
try (var channel = (FileChannel) Files.newByteChannel(file, openOptions)) {
|
||||
return channel.map(mode, WORD_SIZE*offset, (long) size*WORD_SIZE);
|
||||
}
|
||||
catch (IOException ex) {
|
||||
throw new IOException("Failed to map file " + file + " (" + offset + ":" + size + ")", ex);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public long get(long at) {
|
||||
try {
|
||||
trace.touch(at);
|
||||
|
||||
return longBuffer.get((int) at);
|
||||
}
|
||||
catch (IndexOutOfBoundsException ex) {
|
||||
throw new IndexOutOfBoundsException("@" + at + "(" + 0 + ":" + longBuffer.capacity() + ")");
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void get(long start, long end, long[] buffer) {
|
||||
trace.touch(start, end);
|
||||
|
||||
longBuffer.get((int) start, buffer, 0, (int) (end - start));
|
||||
}
|
||||
|
||||
@Override
|
||||
public void set(long at, long val) {
|
||||
trace.touch(at);
|
||||
|
||||
longBuffer.put((int) at, val);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void set(long start, long end, LongBuffer buffer, int bufferStart) {
|
||||
longBuffer.put((int) start, buffer, bufferStart, (int) (end-start));
|
||||
}
|
||||
|
||||
@Override
|
||||
public long size() {
|
||||
return longBuffer.capacity();
|
||||
}
|
||||
|
||||
public void increment(int at) {
|
||||
set(at, get(at) + 1);
|
||||
}
|
||||
|
||||
@Override
|
||||
public ByteBuffer getByteBuffer() {
|
||||
return byteBuffer;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void write(Path filename) throws IOException {
|
||||
try (var channel = (FileChannel) Files.newByteChannel(filename, StandardOpenOption.WRITE, StandardOpenOption.CREATE)) {
|
||||
write(channel);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void force() {
|
||||
if (byteBuffer instanceof MappedByteBuffer mb) {
|
||||
mb.force();
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void transferFrom(FileChannel source, long sourceStart, long arrayStart, long arrayEnd) throws IOException {
|
||||
|
||||
trace.touch(arrayStart, arrayEnd);
|
||||
|
||||
int index = (int) (arrayStart * WORD_SIZE);
|
||||
int length = (int) ((arrayEnd - arrayStart) * WORD_SIZE);
|
||||
|
||||
var slice = byteBuffer.slice(index, length);
|
||||
|
||||
long startPos = sourceStart * WORD_SIZE;
|
||||
while (slice.position() < slice.capacity()) {
|
||||
source.read(slice, startPos + slice.position());
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void advice(NativeIO.Advice advice) throws IOException {
|
||||
NativeIO.madvise((MappedByteBuffer) byteBuffer, advice);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void advice(NativeIO.Advice advice, long start, long end) throws IOException {
|
||||
NativeIO.madviseRange((MappedByteBuffer) byteBuffer, advice, (int) start, (int) (end-start));
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,330 @@
|
||||
package nu.marginalia.util.array.page;
|
||||
|
||||
import com.upserve.uppend.blobs.NativeIO;
|
||||
import nu.marginalia.util.array.IntArray;
|
||||
import nu.marginalia.util.array.buffer.IntQueryBuffer;
|
||||
import nu.marginalia.util.array.delegate.ReferenceImplIntArrayDelegate;
|
||||
import nu.marginalia.util.array.functional.IntBinaryIOOperation;
|
||||
import nu.marginalia.util.array.functional.IntIOTransformer;
|
||||
import nu.marginalia.util.array.functional.IntTransformer;
|
||||
import nu.marginalia.util.array.functional.LongIntConsumer;
|
||||
import nu.marginalia.util.array.functor.IntIOFolder;
|
||||
import nu.marginalia.util.array.scheme.ArrayPartitioningScheme;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.IntBuffer;
|
||||
import java.nio.channels.FileChannel;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.StandardOpenOption;
|
||||
|
||||
public class PagingIntArray extends AbstractPagingArray<IntArrayPage, IntBuffer> implements IntArray {
|
||||
private final ReferenceImplIntArrayDelegate defaults;
|
||||
|
||||
private PagingIntArray(ArrayPartitioningScheme partitioningScheme,
|
||||
IntArrayPage[] pages,
|
||||
long size) {
|
||||
super(partitioningScheme, pages, size);
|
||||
|
||||
defaults = new ReferenceImplIntArrayDelegate(this);
|
||||
}
|
||||
|
||||
public static IntArray newOnHeap(ArrayPartitioningScheme partitioningScheme, long cardinality) {
|
||||
if (cardinality < MAX_CONTINUOUS_SIZE) {
|
||||
return IntArrayPage.onHeap((int) cardinality);
|
||||
}
|
||||
|
||||
return newPartitionedOnHeap(partitioningScheme, cardinality);
|
||||
}
|
||||
|
||||
public static IntArray newPartitionedOnHeap(ArrayPartitioningScheme partitioningScheme, long cardinality) {
|
||||
|
||||
IntArrayPage[] pages = new IntArrayPage[partitioningScheme.getPartitions(cardinality)];
|
||||
|
||||
for (int i = 0; i < pages.length; i++) {
|
||||
pages[i] = IntArrayPage.onHeap(partitioningScheme.getRequiredPageSize(i, cardinality));
|
||||
}
|
||||
|
||||
return new PagingIntArray(partitioningScheme, pages, cardinality);
|
||||
}
|
||||
|
||||
public static PagingIntArray mapFileReadOnly(ArrayPartitioningScheme partitioningScheme, Path file)
|
||||
throws IOException
|
||||
{
|
||||
long sizeBytes = Files.size(file);
|
||||
assert sizeBytes % WORD_SIZE == 0;
|
||||
|
||||
long size = sizeBytes / WORD_SIZE;
|
||||
|
||||
IntArrayPage[] pages = new IntArrayPage[partitioningScheme.getPartitions(size)];
|
||||
long offset = 0;
|
||||
for (int i = 0; i < pages.length; i++) {
|
||||
int partitionSize = partitioningScheme.getRequiredPageSize(i, size);
|
||||
pages[i] = IntArrayPage.fromMmapReadOnly(file, offset, partitionSize);
|
||||
offset += partitionSize;
|
||||
}
|
||||
|
||||
return new PagingIntArray(partitioningScheme, pages, size);
|
||||
}
|
||||
|
||||
|
||||
public static PagingIntArray mapFileReadWrite(ArrayPartitioningScheme partitioningScheme, Path file)
|
||||
throws IOException
|
||||
{
|
||||
long sizeBytes = Files.size(file);
|
||||
assert sizeBytes % LongArrayPage.WORD_SIZE == 0;
|
||||
|
||||
long size = sizeBytes / LongArrayPage.WORD_SIZE;
|
||||
|
||||
IntArrayPage[] pages = new IntArrayPage[partitioningScheme.getPartitions(size)];
|
||||
long offset = 0;
|
||||
for (int i = 0; i < pages.length; i++) {
|
||||
int partitionSize = partitioningScheme.getRequiredPageSize(i, size);
|
||||
pages[i] = IntArrayPage.fromMmapReadWrite(file, offset, partitionSize);
|
||||
offset += partitionSize;
|
||||
}
|
||||
|
||||
return new PagingIntArray(partitioningScheme, pages, size);
|
||||
}
|
||||
|
||||
public static PagingIntArray mapFileReadWrite(ArrayPartitioningScheme partitioningScheme, Path file, long size)
|
||||
throws IOException
|
||||
{
|
||||
IntArrayPage[] pages = new IntArrayPage[partitioningScheme.getPartitions(size)];
|
||||
long offset = 0;
|
||||
for (int i = 0; i < pages.length; i++) {
|
||||
int partitionSize = partitioningScheme.getRequiredPageSize(i, size);
|
||||
pages[i] = IntArrayPage.fromMmapReadWrite(file, offset, partitionSize);
|
||||
offset += partitionSize;
|
||||
}
|
||||
|
||||
return new PagingIntArray(partitioningScheme, pages, size);
|
||||
}
|
||||
|
||||
public int get(long pos) {
|
||||
int page = partitioningScheme.getPage(pos);
|
||||
int offset = partitioningScheme.getOffset(pos);
|
||||
|
||||
try {
|
||||
return pages[page].get(partitioningScheme.getOffset(pos));
|
||||
}
|
||||
catch (IndexOutOfBoundsException ex) {
|
||||
throw new IndexOutOfBoundsException("Index out of bounds for " + pos + " => (" + page + ":" + offset + ")");
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void get(long start, long end, int[] buffer) {
|
||||
if (partitioningScheme.isSamePage(start, end)) {
|
||||
int sOff = partitioningScheme.getOffset(start);
|
||||
int eOff = partitioningScheme.getEndOffset(start, end);
|
||||
|
||||
pages[partitioningScheme.getPage(start)].get(sOff, eOff, buffer);
|
||||
}
|
||||
else {
|
||||
defaults.get(start, end, buffer);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void set(long pos, int value) {
|
||||
int page = partitioningScheme.getPage(pos);
|
||||
int offset = partitioningScheme.getOffset(pos);
|
||||
try {
|
||||
pages[page].set(offset, value);
|
||||
}
|
||||
catch (IndexOutOfBoundsException ex) {
|
||||
throw new IndexOutOfBoundsException("Index out of bounds for " + pos + " => (" + page + ":" + offset + ")");
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public long size() {
|
||||
return size;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void increment(long pos) {
|
||||
int page = partitioningScheme.getPage(pos);
|
||||
int offset = partitioningScheme.getOffset(pos);
|
||||
|
||||
try {
|
||||
pages[page].increment(partitioningScheme.getOffset(pos));
|
||||
}
|
||||
catch (IndexOutOfBoundsException ex) {
|
||||
throw new IndexOutOfBoundsException("Index out of bounds for " + pos + " => (" + page + ":" + offset + ")");
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void forEach(long start, long end, LongIntConsumer consumer) {
|
||||
delegateToEachPage(start, end, (page, s, e) -> page.forEach(s, e, consumer));
|
||||
}
|
||||
|
||||
@Override
|
||||
public void fill(long fromIndex, long toIndex, int value) {
|
||||
if (partitioningScheme.isSamePage(fromIndex, toIndex)) {
|
||||
|
||||
int sOff = partitioningScheme.getOffset(fromIndex);
|
||||
int eOff = partitioningScheme.getEndOffset(fromIndex, toIndex);
|
||||
|
||||
pages[partitioningScheme.getPage(fromIndex)].fill(sOff, eOff, value);
|
||||
}
|
||||
else if (toIndex >= fromIndex) {
|
||||
delegateToEachPage(fromIndex, toIndex, (page, s, e) -> page.fill(s, e, value));
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void transformEach(long start, long end, IntTransformer transformer) {
|
||||
delegateToEachPage(start, end, (page, s, e) -> page.transformEach(s, e, transformer));
|
||||
}
|
||||
|
||||
@Override
|
||||
public void transformEachIO(long start, long end, IntIOTransformer transformer) throws IOException {
|
||||
delegateToEachPageIO(start, end, (page, s, e) -> page.transformEachIO(s, e, transformer));
|
||||
}
|
||||
|
||||
@Override
|
||||
public int foldIO(int zero, long start, long end, IntBinaryIOOperation operator) throws IOException {
|
||||
var folder = new IntIOFolder(zero, operator);
|
||||
|
||||
delegateToEachPageIO(start, end, folder);
|
||||
|
||||
return folder.acc;
|
||||
}
|
||||
|
||||
@Override
|
||||
public long linearSearch(int key, long fromIndex, long toIndex) {
|
||||
if (partitioningScheme.isSamePage(fromIndex, toIndex)) {
|
||||
|
||||
int sOff = partitioningScheme.getOffset(fromIndex);
|
||||
int eOff = partitioningScheme.getEndOffset(fromIndex, toIndex);
|
||||
|
||||
long ret = pages[partitioningScheme.getPage(fromIndex)].linearSearch(key, sOff, eOff);
|
||||
|
||||
return translateSearchResultsFromPage(fromIndex, ret);
|
||||
}
|
||||
else {
|
||||
return defaults.linearSearch(key, fromIndex, toIndex);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public long binarySearch(int key, long fromIndex, long toIndex) {
|
||||
if (partitioningScheme.isSamePage(fromIndex, toIndex)) {
|
||||
|
||||
int sOff = partitioningScheme.getOffset(fromIndex);
|
||||
int eOff = partitioningScheme.getEndOffset(fromIndex, toIndex);
|
||||
|
||||
long ret = pages[partitioningScheme.getPage(fromIndex)].binarySearch(key, sOff, eOff);
|
||||
|
||||
return translateSearchResultsFromPage(fromIndex, ret);
|
||||
}
|
||||
else {
|
||||
return defaults.binarySearch(key, fromIndex, toIndex);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public long binarySearchUpperBound(int key, long fromIndex, long toIndex) {
|
||||
if (partitioningScheme.isSamePage(fromIndex, toIndex)) {
|
||||
int sOff = partitioningScheme.getOffset(fromIndex);
|
||||
int eOff = partitioningScheme.getEndOffset(fromIndex, toIndex);
|
||||
|
||||
long ret = pages[partitioningScheme.getPage(fromIndex)].binarySearchUpperBound(key, sOff, eOff);
|
||||
|
||||
return translateSearchResultsFromPage(fromIndex, ret);
|
||||
}
|
||||
else {
|
||||
return defaults.binarySearchUpperBound(key, fromIndex, toIndex);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void retain(IntQueryBuffer buffer, long boundary, long searchStart, long searchEnd) {
|
||||
if (partitioningScheme.isSamePage(searchStart, searchEnd)) {
|
||||
int sOff = partitioningScheme.getOffset(searchStart);
|
||||
int eOff = partitioningScheme.getEndOffset(searchStart, searchEnd);
|
||||
|
||||
if (eOff > sOff) {
|
||||
pages[partitioningScheme.getPage(searchStart)].retain(buffer, boundary, sOff, eOff);
|
||||
}
|
||||
}
|
||||
else {
|
||||
defaults.retain(buffer, boundary, searchStart, searchEnd);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public void reject(IntQueryBuffer buffer, long boundary, long searchStart, long searchEnd) {
|
||||
if (partitioningScheme.isSamePage(searchStart, searchEnd)) {
|
||||
int sOff = partitioningScheme.getOffset(searchStart);
|
||||
int eOff = partitioningScheme.getEndOffset(searchStart, searchEnd);
|
||||
|
||||
if (eOff > sOff) {
|
||||
pages[partitioningScheme.getPage(searchStart)].reject(buffer, boundary, sOff, eOff);
|
||||
}
|
||||
}
|
||||
else {
|
||||
defaults.reject(buffer, boundary, searchStart, searchEnd);
|
||||
}
|
||||
}
|
||||
|
||||
public void write(Path fileName) throws IOException {
|
||||
try (var channel = (FileChannel) Files.newByteChannel(fileName, StandardOpenOption.CREATE, StandardOpenOption.WRITE)) {
|
||||
for (int i = 0; i < pages.length; i++) {
|
||||
pages[i].write(channel);
|
||||
}
|
||||
channel.force(false);
|
||||
}
|
||||
}
|
||||
|
||||
public long getSize() {
|
||||
if (size < 0) {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
return size;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void force() {
|
||||
for (var page : pages) {
|
||||
page.force();
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void advice(NativeIO.Advice advice) throws IOException {
|
||||
for (var page : pages) {
|
||||
page.advice(advice);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void advice(NativeIO.Advice advice, long start, long end) throws IOException {
|
||||
delegateToEachPageIO(start, end, (a,s,e) -> a.advice(advice, s, e));
|
||||
}
|
||||
|
||||
|
||||
public void transferFrom(FileChannel source, long sourceStart, long arrayStart, long arrayEnd) throws IOException {
|
||||
assert arrayEnd >= arrayStart;
|
||||
|
||||
int page = partitioningScheme.getPage(arrayStart);
|
||||
|
||||
long endPos;
|
||||
|
||||
for (long pos = arrayStart; pos < arrayEnd; pos = endPos) {
|
||||
endPos = partitioningScheme.getPageEnd(pos, arrayEnd);
|
||||
|
||||
int sOff = partitioningScheme.getOffset(pos);
|
||||
int eOff = partitioningScheme.getEndOffset(pos, endPos);
|
||||
|
||||
pages[page++].transferFrom(source, sourceStart, sOff, eOff);
|
||||
|
||||
sourceStart+=(endPos - pos);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,498 @@
|
||||
package nu.marginalia.util.array.page;
|
||||
|
||||
import com.upserve.uppend.blobs.NativeIO;
|
||||
import nu.marginalia.util.array.LongArray;
|
||||
import nu.marginalia.util.array.buffer.LongQueryBuffer;
|
||||
import nu.marginalia.util.array.delegate.ReferenceImplLongArrayDelegate;
|
||||
import nu.marginalia.util.array.functional.LongBinaryIOOperation;
|
||||
import nu.marginalia.util.array.functional.LongIOTransformer;
|
||||
import nu.marginalia.util.array.functional.LongLongConsumer;
|
||||
import nu.marginalia.util.array.functional.LongTransformer;
|
||||
import nu.marginalia.util.array.functor.LongIOFolder;
|
||||
import nu.marginalia.util.array.scheme.ArrayPartitioningScheme;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.LongBuffer;
|
||||
import java.nio.channels.FileChannel;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.StandardOpenOption;
|
||||
|
||||
public class PagingLongArray extends AbstractPagingArray<LongArrayPage, LongBuffer> implements LongArray {
|
||||
private final ReferenceImplLongArrayDelegate defaults;
|
||||
|
||||
private PagingLongArray(ArrayPartitioningScheme partitioningScheme, LongArrayPage[] pages, long size) {
|
||||
super(partitioningScheme, pages, size);
|
||||
defaults = new ReferenceImplLongArrayDelegate(this);
|
||||
}
|
||||
|
||||
public static LongArray newOnHeap(ArrayPartitioningScheme partitioningScheme, long cardinality) {
|
||||
return newPartitionedOnHeap(partitioningScheme, cardinality);
|
||||
}
|
||||
|
||||
public static LongArray newPartitionedOnHeap(ArrayPartitioningScheme partitioningScheme, long cardinality) {
|
||||
LongArrayPage[] pages = new LongArrayPage[partitioningScheme.getPartitions(cardinality)];
|
||||
|
||||
for (int i = 0; i < pages.length; i++) {
|
||||
pages[i] = LongArrayPage.onHeap(partitioningScheme.getRequiredPageSize(i, cardinality));
|
||||
}
|
||||
|
||||
return new PagingLongArray(partitioningScheme, pages, cardinality);
|
||||
}
|
||||
|
||||
public static PagingLongArray mapFileReadOnly(ArrayPartitioningScheme partitioningScheme, Path file)
|
||||
throws IOException
|
||||
{
|
||||
long sizeBytes = Files.size(file);
|
||||
assert sizeBytes % WORD_SIZE == 0;
|
||||
|
||||
long size = sizeBytes / WORD_SIZE;
|
||||
|
||||
LongArrayPage[] pages = new LongArrayPage[partitioningScheme.getPartitions(size)];
|
||||
long offset = 0;
|
||||
for (int i = 0; i < pages.length; i++) {
|
||||
int partitionSize = partitioningScheme.getRequiredPageSize(i, size);
|
||||
pages[i] = LongArrayPage.fromMmapReadOnly(file, offset, partitionSize);
|
||||
offset += partitionSize;
|
||||
}
|
||||
|
||||
return new PagingLongArray(partitioningScheme, pages, size);
|
||||
}
|
||||
|
||||
public static PagingLongArray mapFileReadWrite(ArrayPartitioningScheme partitioningScheme, Path file)
|
||||
throws IOException
|
||||
{
|
||||
long sizeBytes = Files.size(file);
|
||||
assert sizeBytes % WORD_SIZE == 0;
|
||||
|
||||
long size = sizeBytes / WORD_SIZE;
|
||||
|
||||
LongArrayPage[] pages = new LongArrayPage[partitioningScheme.getPartitions(size)];
|
||||
long offset = 0;
|
||||
for (int i = 0; i < pages.length; i++) {
|
||||
int partitionSize = partitioningScheme.getRequiredPageSize(i, size);
|
||||
pages[i] = LongArrayPage.fromMmapReadWrite(file, offset, partitionSize);
|
||||
offset += partitionSize;
|
||||
}
|
||||
|
||||
return new PagingLongArray(partitioningScheme, pages, size);
|
||||
}
|
||||
|
||||
public static PagingLongArray mapFileReadWrite(ArrayPartitioningScheme partitioningScheme, Path file, long size)
|
||||
throws IOException
|
||||
{
|
||||
LongArrayPage[] pages = new LongArrayPage[partitioningScheme.getPartitions(size)];
|
||||
long offset = 0;
|
||||
for (int i = 0; i < pages.length; i++) {
|
||||
int partitionSize = partitioningScheme.getRequiredPageSize(i, size);
|
||||
pages[i] = LongArrayPage.fromMmapReadWrite(file, offset, partitionSize);
|
||||
offset += partitionSize;
|
||||
}
|
||||
|
||||
return new PagingLongArray(partitioningScheme, pages, size);
|
||||
}
|
||||
|
||||
@Override
|
||||
public long get(long pos) {
|
||||
int page = partitioningScheme.getPage(pos);
|
||||
int offset = partitioningScheme.getOffset(pos);
|
||||
|
||||
try {
|
||||
return pages[page].get(partitioningScheme.getOffset(pos));
|
||||
}
|
||||
catch (IndexOutOfBoundsException ex) {
|
||||
throw new IndexOutOfBoundsException("Index out of bounds for " + pos + " => (" + page + ":" + offset + ")");
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void get(long start, long end, long[] buffer) {
|
||||
if (partitioningScheme.isSamePage(start, end)) {
|
||||
int sOff = partitioningScheme.getOffset(start);
|
||||
int eOff = partitioningScheme.getEndOffset(start, end);
|
||||
|
||||
pages[partitioningScheme.getPage(start)].get(sOff, eOff, buffer);
|
||||
}
|
||||
else {
|
||||
defaults.get(start, end, buffer);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void set(long pos, long value) {
|
||||
int page = partitioningScheme.getPage(pos);
|
||||
int offset = partitioningScheme.getOffset(pos);
|
||||
try {
|
||||
pages[page].set(offset, value);
|
||||
}
|
||||
catch (IndexOutOfBoundsException ex) {
|
||||
throw new IndexOutOfBoundsException("Index out of bounds for " + pos + " => (" + page + ":" + offset + ")");
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public long size() {
|
||||
return size;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void increment(long pos) {
|
||||
int page = partitioningScheme.getPage(pos);
|
||||
int offset = partitioningScheme.getOffset(pos);
|
||||
|
||||
try {
|
||||
pages[page].increment(partitioningScheme.getOffset(pos));
|
||||
}
|
||||
catch (IndexOutOfBoundsException ex) {
|
||||
throw new IndexOutOfBoundsException("Index out of bounds for " + pos + " => (" + page + ":" + offset + ")");
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void forEach(long start, long end, LongLongConsumer transformer) {
|
||||
delegateToEachPage(start, end, (page, s, e) -> page.forEach(s, e, transformer));
|
||||
}
|
||||
|
||||
@Override
|
||||
public void fill(long fromIndex, long toIndex, long value) {
|
||||
if (partitioningScheme.isSamePage(fromIndex, toIndex)) {
|
||||
|
||||
int sOff = partitioningScheme.getOffset(fromIndex);
|
||||
int eOff = partitioningScheme.getEndOffset(fromIndex, toIndex);
|
||||
|
||||
pages[partitioningScheme.getPage(fromIndex)].fill(sOff, eOff, value);
|
||||
}
|
||||
else {
|
||||
delegateToEachPage(fromIndex, toIndex, (page, s, e) -> page.fill(s, e, value));
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void transformEach(long start, long end, LongTransformer transformer) {
|
||||
delegateToEachPage(start, end, (page, s, e) -> page.transformEach(s, e, transformer));
|
||||
}
|
||||
|
||||
@Override
|
||||
public void transformEachIO(long start, long end, LongIOTransformer transformer) throws IOException {
|
||||
delegateToEachPageIO(start, end, (page, s, e) -> page.transformEachIO(s, e, transformer));
|
||||
}
|
||||
|
||||
@Override
|
||||
public long foldIO(long zero, long start, long end, LongBinaryIOOperation operator) throws IOException {
|
||||
var folder = new LongIOFolder(zero, operator);
|
||||
|
||||
delegateToEachPageIO(start, end, folder);
|
||||
|
||||
return folder.acc;
|
||||
}
|
||||
|
||||
@Override
|
||||
public long linearSearch(long key, long fromIndex, long toIndex) {
|
||||
if (partitioningScheme.isSamePage(fromIndex, toIndex)) {
|
||||
|
||||
int sOff = partitioningScheme.getOffset(fromIndex);
|
||||
int eOff = partitioningScheme.getEndOffset(fromIndex, toIndex);
|
||||
|
||||
long ret = pages[partitioningScheme.getPage(fromIndex)].linearSearch(key, sOff, eOff);
|
||||
|
||||
return translateSearchResultsFromPage(fromIndex, ret);
|
||||
}
|
||||
else {
|
||||
return defaults.linearSearch(key, fromIndex, toIndex);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public long linearSearchN(int sz, long key, long fromIndex, long toIndex) {
|
||||
if (partitioningScheme.isSamePage(fromIndex, toIndex)) {
|
||||
int sOff = partitioningScheme.getOffset(fromIndex);
|
||||
int eOff = partitioningScheme.getEndOffset(fromIndex, toIndex);
|
||||
|
||||
long ret = pages[partitioningScheme.getPage(fromIndex)].linearSearchN(sz, key, sOff, eOff);
|
||||
|
||||
return translateSearchResultsFromPage(fromIndex, ret);
|
||||
}
|
||||
else {
|
||||
return defaults.linearSearchN(sz, key, fromIndex, toIndex);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public long binarySearch(long key, long fromIndex, long toIndex) {
|
||||
if (partitioningScheme.isSamePage(fromIndex, toIndex)) {
|
||||
int sOff = partitioningScheme.getOffset(fromIndex);
|
||||
int eOff = partitioningScheme.getEndOffset(fromIndex, toIndex);
|
||||
|
||||
long ret = pages[partitioningScheme.getPage(fromIndex)].binarySearch(key, sOff, eOff);
|
||||
|
||||
return translateSearchResultsFromPage(fromIndex, ret);
|
||||
}
|
||||
else {
|
||||
return defaults.binarySearch(key, fromIndex, toIndex);
|
||||
}
|
||||
}
|
||||
@Override
|
||||
public long binarySearchN(int sz, long key, long fromIndex, long toIndex) {
|
||||
if (partitioningScheme.isSamePage(fromIndex, toIndex)) {
|
||||
int sOff = partitioningScheme.getOffset(fromIndex);
|
||||
int eOff = partitioningScheme.getEndOffset(fromIndex, toIndex);
|
||||
|
||||
long ret = pages[partitioningScheme.getPage(fromIndex)].binarySearchN(sz, key, sOff, eOff);
|
||||
|
||||
return translateSearchResultsFromPage(fromIndex, ret);
|
||||
}
|
||||
else {
|
||||
return defaults.binarySearchN(sz, key, fromIndex, toIndex);
|
||||
}
|
||||
}
|
||||
@Override
|
||||
public long binarySearchUpperBound(long key, long fromIndex, long toIndex) {
|
||||
if (partitioningScheme.isSamePage(fromIndex, toIndex)) {
|
||||
int sOff = partitioningScheme.getOffset(fromIndex);
|
||||
int eOff = partitioningScheme.getEndOffset(fromIndex, toIndex);
|
||||
|
||||
long ret = pages[partitioningScheme.getPage(fromIndex)].binarySearchUpperBound(key, sOff, eOff);
|
||||
|
||||
return translateSearchResultsFromPage(fromIndex, ret);
|
||||
}
|
||||
else {
|
||||
return defaults.binarySearchUpperBound(key, fromIndex, toIndex);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public long linearSearchUpperBound(long key, long fromIndex, long toIndex) {
|
||||
if (partitioningScheme.isSamePage(fromIndex, toIndex)) {
|
||||
int sOff = partitioningScheme.getOffset(fromIndex);
|
||||
int eOff = partitioningScheme.getEndOffset(fromIndex, toIndex);
|
||||
|
||||
long ret = pages[partitioningScheme.getPage(fromIndex)].linearSearchUpperBound(key, sOff, eOff);
|
||||
|
||||
return translateSearchResultsFromPage(fromIndex, ret);
|
||||
}
|
||||
else {
|
||||
return defaults.linearSearchUpperBound(key, fromIndex, toIndex);
|
||||
}
|
||||
}
|
||||
@Override
|
||||
public long binarySearchUpperBoundN(int sz, long key, long fromIndex, long toIndex) {
|
||||
if (partitioningScheme.isSamePage(fromIndex, toIndex)) {
|
||||
int sOff = partitioningScheme.getOffset(fromIndex);
|
||||
int eOff = partitioningScheme.getEndOffset(fromIndex, toIndex);
|
||||
|
||||
long ret = pages[partitioningScheme.getPage(fromIndex)].binarySearchUpperBoundN(sz, key, sOff, eOff);
|
||||
|
||||
return translateSearchResultsFromPage(fromIndex, ret);
|
||||
}
|
||||
else {
|
||||
return defaults.binarySearchUpperBoundN(sz, key, fromIndex, toIndex);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void retain(LongQueryBuffer buffer, long boundary, long searchStart, long searchEnd) {
|
||||
if (partitioningScheme.isSamePage(searchStart, searchEnd)) {
|
||||
int sOff = partitioningScheme.getOffset(searchStart);
|
||||
int eOff = partitioningScheme.getEndOffset(searchStart, searchEnd);
|
||||
|
||||
if (eOff > sOff) {
|
||||
pages[partitioningScheme.getPage(searchStart)].retain(buffer, boundary, sOff, eOff);
|
||||
}
|
||||
}
|
||||
else {
|
||||
defaults.retain(buffer, boundary, searchStart, searchEnd);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void retainN(LongQueryBuffer buffer, int sz, long boundary, long searchStart, long searchEnd) {
|
||||
if (partitioningScheme.isSamePage(searchStart, searchEnd)) {
|
||||
int sOff = partitioningScheme.getOffset(searchStart);
|
||||
int eOff = partitioningScheme.getEndOffset(searchStart, searchEnd);
|
||||
|
||||
if (eOff > sOff) {
|
||||
pages[partitioningScheme.getPage(searchStart)].retainN(buffer, sz, boundary, sOff, eOff);
|
||||
}
|
||||
}
|
||||
else {
|
||||
defaults.retainN(buffer, sz, boundary, searchStart, searchEnd);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void reject(LongQueryBuffer buffer, long boundary, long searchStart, long searchEnd) {
|
||||
if (partitioningScheme.isSamePage(searchStart, searchEnd)) {
|
||||
int sOff = partitioningScheme.getOffset(searchStart);
|
||||
int eOff = partitioningScheme.getEndOffset(searchStart, searchEnd);
|
||||
|
||||
if (eOff > sOff) {
|
||||
pages[partitioningScheme.getPage(searchStart)].reject(buffer, boundary, sOff, eOff);
|
||||
}
|
||||
}
|
||||
else {
|
||||
defaults.reject(buffer, boundary, searchStart, searchEnd);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void rejectN(LongQueryBuffer buffer, int sz, long boundary, long searchStart, long searchEnd) {
|
||||
if (partitioningScheme.isSamePage(searchStart, searchEnd)) {
|
||||
int sOff = partitioningScheme.getOffset(searchStart);
|
||||
int eOff = partitioningScheme.getEndOffset(searchStart, searchEnd);
|
||||
|
||||
if (eOff > sOff) {
|
||||
pages[partitioningScheme.getPage(searchStart)].rejectN(buffer, sz, boundary, sOff, eOff);
|
||||
}
|
||||
}
|
||||
else {
|
||||
defaults.rejectN(buffer, sz, boundary, searchStart, searchEnd);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void insertionSort(long start, long end) {
|
||||
if (partitioningScheme.isSamePage(start, end)) {
|
||||
int sOff = partitioningScheme.getOffset(start);
|
||||
int eOff = partitioningScheme.getEndOffset(start, end);
|
||||
|
||||
if (eOff > sOff) {
|
||||
pages[partitioningScheme.getPage(start)].insertionSort(sOff, eOff);
|
||||
}
|
||||
}
|
||||
else {
|
||||
defaults.insertionSort(start, end);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void insertionSortN(int sz, long start, long end) {
|
||||
if (partitioningScheme.isSamePage(start, end)) {
|
||||
int sOff = partitioningScheme.getOffset(start);
|
||||
int eOff = partitioningScheme.getEndOffset(start, end);
|
||||
|
||||
if (eOff > sOff) {
|
||||
pages[partitioningScheme.getPage(start)].insertionSortN(sz, sOff, eOff);
|
||||
}
|
||||
}
|
||||
else {
|
||||
defaults.insertionSortN(sz, start, end);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void quickSort(long start, long end) {
|
||||
if (partitioningScheme.isSamePage(start, end)) {
|
||||
int sOff = partitioningScheme.getOffset(start);
|
||||
int eOff = partitioningScheme.getEndOffset(start, end);
|
||||
|
||||
if (eOff > sOff) {
|
||||
pages[partitioningScheme.getPage(start)].quickSort(sOff, eOff);
|
||||
}
|
||||
}
|
||||
else {
|
||||
defaults.quickSort(start, end);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void quickSortN(int sz, long start, long end) {
|
||||
if (partitioningScheme.isSamePage(start, end)) {
|
||||
int sOff = partitioningScheme.getOffset(start);
|
||||
int eOff = partitioningScheme.getEndOffset(start, end);
|
||||
|
||||
if (eOff > sOff) {
|
||||
pages[partitioningScheme.getPage(start)].quickSortN(sz, sOff, eOff);
|
||||
}
|
||||
}
|
||||
else {
|
||||
defaults.quickSortN(sz, start, end);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void mergeSort(long start, long end, Path tempDir) throws IOException {
|
||||
if (partitioningScheme.isSamePage(start, end)) {
|
||||
int sOff = partitioningScheme.getOffset(start);
|
||||
int eOff = partitioningScheme.getEndOffset(start, end);
|
||||
|
||||
if (eOff > sOff) {
|
||||
pages[partitioningScheme.getPage(start)].mergeSort(sOff, eOff, tempDir);
|
||||
}
|
||||
}
|
||||
else {
|
||||
defaults.mergeSort(start, end, tempDir);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public void mergeSortN(int sz, long start, long end, Path tempDir) throws IOException {
|
||||
if (partitioningScheme.isSamePage(start, end)) {
|
||||
int sOff = partitioningScheme.getOffset(start);
|
||||
int eOff = partitioningScheme.getEndOffset(start, end);
|
||||
|
||||
if (eOff > sOff) {
|
||||
pages[partitioningScheme.getPage(start)].mergeSortN(sz, sOff, eOff, tempDir);
|
||||
}
|
||||
}
|
||||
else {
|
||||
defaults.mergeSortN(sz, start, end, tempDir);
|
||||
}
|
||||
}
|
||||
|
||||
public void write(Path fileName) throws IOException {
|
||||
try (var channel = (FileChannel) Files.newByteChannel(fileName, StandardOpenOption.CREATE, StandardOpenOption.WRITE)) {
|
||||
for (int i = 0; i < pages.length; i++) {
|
||||
pages[i].write(channel);
|
||||
}
|
||||
channel.force(false);
|
||||
}
|
||||
}
|
||||
|
||||
public long getSize() {
|
||||
if (size < 0) {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
return size;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void force() {
|
||||
for (var page : pages) {
|
||||
page.force();
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void advice(NativeIO.Advice advice) throws IOException {
|
||||
for (var page : pages) {
|
||||
page.advice(advice);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void advice(NativeIO.Advice advice, long start, long end) throws IOException {
|
||||
delegateToEachPageIO(start, end, (a,s,e) -> a.advice(advice, s, e));
|
||||
}
|
||||
|
||||
|
||||
public void transferFrom(FileChannel source, long sourceStart, long arrayStart, long arrayEnd) throws IOException {
|
||||
assert arrayEnd >= arrayStart;
|
||||
|
||||
int page = partitioningScheme.getPage(arrayStart);
|
||||
|
||||
long endPos;
|
||||
|
||||
for (long pos = arrayStart; pos < arrayEnd; pos = endPos) {
|
||||
endPos = partitioningScheme.getPageEnd(pos, arrayEnd);
|
||||
|
||||
int sOff = partitioningScheme.getOffset(pos);
|
||||
int eOff = partitioningScheme.getEndOffset(pos, endPos);
|
||||
|
||||
pages[page++].transferFrom(source, sourceStart, sOff, eOff);
|
||||
|
||||
sourceStart+=(endPos - pos);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,22 @@
package nu.marginalia.util.array.page;

import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;

public interface PartitionPage {

    default void write(FileChannel channel) throws IOException {
        var byteBuffer = getByteBuffer();

        byteBuffer.clear();

        while (byteBuffer.position() < byteBuffer.limit()) {
            channel.write(byteBuffer);
        }

        byteBuffer.clear();
    }

    ByteBuffer getByteBuffer();
}
@ -0,0 +1,51 @@
package nu.marginalia.util.array.scheme;

public interface ArrayPartitioningScheme {

    static ArrayPartitioningScheme forPartitionSize(int size) {
        if (Integer.highestOneBit(size) == size) {
            return new PowerOf2PartitioningScheme(size);
        }
        else {
            return new SequentialPartitioningScheme(size);
        }
    }

    static int getRequiredPartitions(long cardinality, int partitionSize) {
        return (int) (cardinality / partitionSize + Long.signum(cardinality % partitionSize));
    }


    int getPartitions(long cardinality);

    int getPage(long at);

    boolean isSamePage(long a, long b);

    /** Get the page offset corresponding to at */
    int getOffset(long at);

    /** Variant of getOffset that doesn't wrap around the page boundary, necessary when
     * translating an exclusive end offset that getOffset(...) would translate to 0 and consider
     * part of the next page.
     *
     * The start offset is also needed to recognize an empty range (end <= start), in which
     * case no such adjustment is made.
     */
    default int getEndOffset(long start, long end) {
        if (end == 0 || end <= start)
            return getOffset(end);

        return 1 + getOffset(end - 1);
    }

    /** Get the end of the buffer containing at, or endTotal, whichever is smaller
     */
    long getPageEnd(long at, long endTotal);

    /**
     * toRealIndex(getBuffer(val), getOffset(val)) = val
     */
    long toRealIndex(int buffer, int offset);

    int getRequiredPageSize(int buffer, long cardinality);
}
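// A minimal usage sketch (illustration only, not part of this commit) of how a partitioning
// scheme splits a global array index into a page number and an in-page offset; the partition
// size and index below are arbitrary example values:
//
//     var scheme = ArrayPartitioningScheme.forPartitionSize(1 << 16); // power of two
//     long globalIndex = 70_000;
//     int page   = scheme.getPage(globalIndex);      // -> 1
//     int offset = scheme.getOffset(globalIndex);    // -> 4464
//     assert scheme.toRealIndex(page, offset) == globalIndex;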
@ -0,0 +1,60 @@
package nu.marginalia.util.array.scheme;

public class PowerOf2PartitioningScheme implements ArrayPartitioningScheme {
    final int partitionSize;
    final long offsetMask;
    final long bufferMask;
    final int pageShift;

    public PowerOf2PartitioningScheme(int partitionSize) {
        assert partitionSize == Integer.highestOneBit(partitionSize);

        this.partitionSize = partitionSize;

        offsetMask = partitionSize - 1;
        bufferMask = ~offsetMask;
        pageShift = Integer.numberOfTrailingZeros(partitionSize);
    }

    @Override
    public int getPartitions(long cardinality) {
        return ArrayPartitioningScheme.getRequiredPartitions(cardinality, partitionSize);
    }

    @Override
    public int getPage(long at) { // very hot code
        return (int) (at >>> pageShift);
    }

    @Override
    public int getOffset(long at) { // very hot code
        return (int) (at & offsetMask);
    }

    @Override
    public boolean isSamePage(long a, long b) { // hot code
        return 0 == ((a ^ b) & bufferMask);
    }

    @Override
    public long getPageEnd(long at, long endTotal) {
        return Math.min(endTotal, partitionSize * (1L + getPage(at)));
    }

    @Override
    public long toRealIndex(int buffer, int offset) {
        return offset + (long) buffer * partitionSize;
    }

    @Override
    public int getRequiredPageSize(int buffer, long cardinality) {
        if ((long) (1 + buffer) * partitionSize <= cardinality) {
            return partitionSize;
        }

        return (int) (cardinality % partitionSize);
    }

}
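// Illustration (assumed values, not from the commit): for partitionSize = 1 << 16 the masks
// come out to offsetMask = 0xFFFF and bufferMask = ~0xFFFF, so isSamePage(a, b) reduces to
// checking that a and b agree on every bit above the offset bits, i.e. (a >>> 16) == (b >>> 16),
// with no division or branch in the hot path.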
@ -0,0 +1,56 @@
package nu.marginalia.util.array.scheme;

public class SequentialPartitioningScheme implements ArrayPartitioningScheme {

    final int partitionSize;

    public SequentialPartitioningScheme(int partitionSize) {
        this.partitionSize = partitionSize;
    }

    public static int getRequiredPartitions(long cardinality, int partitionSize) {
        return (int) (cardinality / partitionSize + Long.signum(cardinality % partitionSize));
    }

    @Override
    public int getPartitions(long cardinality) {
        return getRequiredPartitions(cardinality, partitionSize);
    }

    @Override
    public int getPage(long at) {
        return (int) (at / partitionSize);
    }

    public long getPageEnd(long at, long endTotal) {
        return Math.min(endTotal, partitionSize * (1L + getPage(at)));
    }


    @Override
    public boolean isSamePage(long a, long b) {
        return (int) (a / partitionSize) == (int) (b / partitionSize);
    }

    @Override
    public int getOffset(long at) {
        return (int) (at % partitionSize);
    }

    public long toRealIndex(int buffer, int offset) {
        return offset + (long) buffer * partitionSize;
    }


    @Override
    public int getRequiredPageSize(int buffer, long cardinality) {
        if ((long) (1 + buffer) * partitionSize <= cardinality) {
            return partitionSize;
        }
        return (int) (cardinality % partitionSize);
    }

}
@ -0,0 +1,22 @@
package nu.marginalia.util.array.trace;

import nu.marginalia.util.array.LongArray;

import java.nio.file.Path;
import java.util.Optional;

public interface ArrayTrace {
    void touch(long address);
    void touch(long start, long end);

    FileTrace fileTrace = Optional.ofNullable(System.clearProperty("nu.marginalia.util.array.trace")).map(Path::of).map(FileTrace::new).orElseGet(FileTrace::new);
    NullTrace nullTrace = new NullTrace();

    static ArrayTrace get(LongArray array) {

        if (fileTrace == null) {
            return nullTrace;
        }

        return fileTrace.forArray(array);
    }
}
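// A minimal sketch (assumption, not part of this commit) of how the tracing facility above is
// switched on: point the system property at a writable file (otherwise /tmp/array-trace.log is
// used), enable the runtime flag, and every touched range is appended for later visualization.
// The array variable below is hypothetical.
//
//     // JVM flag: -Dnu.marginalia.util.array.trace=/tmp/array-trace.log
//     FileTrace.setTrace(true);
//     ArrayTrace trace = ArrayTrace.get(someLongArray);
//     trace.touch(0, 128);   // writes "<nanoTime> <array id> 0 128" to the trace file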
@ -0,0 +1,115 @@
|
||||
package nu.marginalia.util.array.trace;
|
||||
|
||||
import javax.imageio.ImageIO;
|
||||
import java.awt.image.BufferedImage;
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.*;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
|
||||
import static java.awt.image.BufferedImage.TYPE_INT_RGB;
|
||||
|
||||
public class ArrayTraceViz {
|
||||
|
||||
|
||||
private static final int BLOCK_SIZE_WORDS = 512;
|
||||
|
||||
public static void main(String[] args) throws IOException {
|
||||
Path inputFile = Path.of("/home/vlofgren/array-trace.log");
|
||||
Map<Integer, Integer> sizes = new HashMap<>();
|
||||
Map<Integer, Set<Integer>> rows = new HashMap<>();
|
||||
|
||||
try (var lines = Files.lines(inputFile)) {
|
||||
lines.map(line -> line.split("\\s")).forEach(parts -> {
|
||||
int block = Integer.parseInt(parts[1]);
|
||||
int start = Integer.parseInt(parts[2]);
|
||||
int end = Integer.parseInt(parts[3]);
|
||||
|
||||
sizes.merge(block, end, Integer::max);
|
||||
|
||||
var rowSet = rows.computeIfAbsent(block, b -> new HashSet<>());
|
||||
for (int b = start; b < end; b += BLOCK_SIZE_WORDS) {
|
||||
rowSet.add(b/BLOCK_SIZE_WORDS);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
Map<Integer, Map<Integer, Integer>> rowToY = new HashMap<>();
|
||||
|
||||
rows.forEach((row, vals) -> {
|
||||
var map = new HashMap<Integer, Integer>(vals.size());
|
||||
rowToY.put(row, map);
|
||||
var list = new ArrayList<>(vals);
|
||||
|
||||
list.stream().sorted().forEach(val -> map.put(val, map.size()));
|
||||
});
|
||||
|
||||
Map<Integer, Integer> cols = new HashMap<>();
|
||||
sizes.keySet().forEach(key -> cols.put(key, cols.size()));
|
||||
|
||||
int width = cols.size() * (BLOCK_SIZE_WORDS+4);
|
||||
int height = 640;
|
||||
|
||||
var bi = new BufferedImage(width, height, TYPE_INT_RGB);
|
||||
|
||||
AtomicInteger iv = new AtomicInteger();
|
||||
|
||||
try (var lines = Files.lines(inputFile)) {
|
||||
lines.forEach(line -> {
|
||||
String[] parts = line.split("\\s");
|
||||
|
||||
long time = Long.parseLong(parts[0]);
|
||||
int block = Integer.parseInt(parts[1]);
|
||||
int start = Integer.parseInt(parts[2]);
|
||||
int end = Integer.parseInt(parts[3]);
|
||||
|
||||
for (int p = start; p < end; p++) {
|
||||
int x0 = (4+BLOCK_SIZE_WORDS) * cols.get(block);
|
||||
int x = x0 + (p%BLOCK_SIZE_WORDS);
|
||||
int y = rowToY.get(block).get(p/BLOCK_SIZE_WORDS);
|
||||
|
||||
if (y >= 640) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (0 == bi.getRGB(x, y)) {
|
||||
for (int x2 = 0; x2 < BLOCK_SIZE_WORDS; x2++) {
|
||||
if (0 == bi.getRGB(x0 + x2, y)) {
|
||||
bi.setRGB(x0 + x2, y, 0xC0C0C0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
System.out.println(x + "," + y);
|
||||
bi.setRGB(x, y, (int) (0xFFFFFFL));
|
||||
}
|
||||
|
||||
try {
|
||||
if ((iv.incrementAndGet() % 4) == 0) {
|
||||
ImageIO.write(bi, "png", new File("/tmp/test" + (time * Long.signum(time)) + " .png"));
|
||||
for (int x = 0; x < width; x++) {
|
||||
for (int y = 0; y < height; y++) {
|
||||
int val = bi.getRGB(x, y);
|
||||
int nval = (val&0xFF) - 1;
|
||||
if (nval > 64) {
|
||||
bi.setRGB(x, y, nval | (nval<<8) | (nval << 16));
|
||||
}
|
||||
else if ((val&0xFFFFFF) != 0) {
|
||||
bi.setRGB(x, y, 64);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
record ArrayPage(int id, int size) {}
|
||||
}
|
@ -0,0 +1,52 @@
package nu.marginalia.util.array.trace;

import nu.marginalia.util.array.LongArray;

import java.io.IOException;
import java.io.PrintStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;

public class FileTrace {
    PrintStream traceWriter;
    static volatile boolean doTrace = false;

    public FileTrace(Path file) {
        try {
            traceWriter = new PrintStream(Files.newOutputStream(file, StandardOpenOption.WRITE, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING));
        } catch (IOException e) {
            throw new IllegalStateException(e);
        }
    }

    public FileTrace() {
        this(Path.of("/tmp/array-trace.log"));
    }

    public static void setTrace(boolean val) {
        doTrace = val;
    }

    public void trace(int source, long start, long end) {
        if (doTrace) {
            traceWriter.printf("%d %d %d %d\n", System.nanoTime(), source, start, end);
        }
    }

    public ArrayTrace forArray(LongArray array) {
        return new ArrayTrace() {
            final int code = array.hashCode();

            @Override
            public void touch(long address) {
                trace(code, address, address+1);
            }

            @Override
            public void touch(long start, long end) {
                trace(code, start, end);
            }
        };
    }
}
@ -0,0 +1,11 @@
package nu.marginalia.util.array.trace;

public class NullTrace implements ArrayTrace {

    @Override
    public void touch(long address) {}

    @Override
    public void touch(long start, long end) {}

}
@ -1,8 +1,8 @@
package nu.marginalia.util.btree;

import nu.marginalia.util.array.LongArray;
import nu.marginalia.util.btree.model.BTreeContext;
import nu.marginalia.util.btree.model.BTreeHeader;
import nu.marginalia.util.multimap.MultimapFileLongSlice;

/*
 * End-of-page mark that's used as a sentinel to verify that
@ -12,14 +12,16 @@ import nu.marginalia.util.multimap.MultimapFileLongSlice;
 */
public class BTreeDogEar {

    private MultimapFileLongSlice sentinelSlice;
    private LongArray sentinelSlice;

    public BTreeDogEar(BTreeContext ctx, BTreeHeader header, MultimapFileLongSlice base) {
    public BTreeDogEar(BTreeContext ctx, BTreeHeader header, LongArray base) {
        if (header.numEntries() > 3) {
            sentinelSlice = base.atOffset((long) header.numEntries() * ctx.entrySize() - 3);
            sentinelSlice.put(0, 4L);
            sentinelSlice.put(1, 5L);
            sentinelSlice.put(2, 1L);
            sentinelSlice = base.range(
                    (long) header.numEntries() * ctx.entrySize() - 3,
                    (long) header.numEntries() * ctx.entrySize());
            sentinelSlice.set(0, 4L);
            sentinelSlice.set(1, 5L);
            sentinelSlice.set(2, 1L);
        }
    }

@ -1,43 +1,39 @@
|
||||
package nu.marginalia.util.btree;
|
||||
|
||||
import it.unimi.dsi.fastutil.longs.LongLongImmutablePair;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.util.array.LongArray;
|
||||
import nu.marginalia.util.array.algo.LongArraySearch;
|
||||
import nu.marginalia.util.array.buffer.LongQueryBuffer;
|
||||
import nu.marginalia.util.array.delegate.ShiftedLongArray;
|
||||
import nu.marginalia.util.btree.model.BTreeContext;
|
||||
import nu.marginalia.util.btree.model.BTreeHeader;
|
||||
import nu.marginalia.util.multimap.MultimapFileLong;
|
||||
import nu.marginalia.util.multimap.MultimapSearcher;
|
||||
|
||||
import static java.lang.Math.min;
|
||||
|
||||
public class BTreeReader {
|
||||
public class BTreeReader implements BTreeReaderIf {
|
||||
|
||||
private final LongArray index;
|
||||
private final ShiftedLongArray data;
|
||||
|
||||
private final MultimapFileLong file;
|
||||
public final BTreeContext ctx;
|
||||
|
||||
private final MultimapSearcher indexSearcher;
|
||||
private final MultimapSearcher dataSearcher;
|
||||
private final BTreeHeader header;
|
||||
|
||||
public BTreeReader(MultimapFileLong file, BTreeContext ctx, BTreeHeader header) {
|
||||
this.file = file;
|
||||
this.indexSearcher = MultimapSearcher.forContext(file, ~0, 1);
|
||||
this.dataSearcher = MultimapSearcher.forContext(file, ctx.equalityMask(), ctx.entrySize());
|
||||
|
||||
this.ctx = ctx;
|
||||
this.header = header;
|
||||
}
|
||||
|
||||
public BTreeReader(MultimapFileLong file, BTreeContext ctx, long offset) {
|
||||
this.file = file;
|
||||
this.indexSearcher = MultimapSearcher.forContext(file, ~0, 1);
|
||||
this.dataSearcher = MultimapSearcher.forContext(file, ctx.equalityMask(), ctx.entrySize());
|
||||
private final long dataBlockEnd;
|
||||
|
||||
public BTreeReader(LongArray file, BTreeContext ctx, long offset) {
|
||||
this.ctx = ctx;
|
||||
this.header = createHeader(file, offset);
|
||||
|
||||
dataBlockEnd = (long) ctx.entrySize() * header.numEntries();
|
||||
index = file.range(header.indexOffsetLongs(), header.dataOffsetLongs());
|
||||
data = file.range(header.dataOffsetLongs(), header.dataOffsetLongs() + dataBlockEnd);
|
||||
|
||||
}
|
||||
|
||||
public static BTreeHeader createHeader(MultimapFileLong file, long fileOffset) {
|
||||
return new BTreeHeader(file.get(fileOffset), file.get(fileOffset+1), file.get(fileOffset+2));
|
||||
public static BTreeHeader createHeader(LongArray file, long fileOffset) {
|
||||
long[] parts = new long[3];
|
||||
file.get(fileOffset, fileOffset+3, parts);
|
||||
return new BTreeHeader(parts[0], parts[1], parts[2]);
|
||||
}
|
||||
|
||||
public BTreeHeader getHeader() {
|
||||
@ -49,7 +45,7 @@ public class BTreeReader {
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
public void retainEntries(BTreeQueryBuffer buffer) {
|
||||
public void retainEntries(LongQueryBuffer buffer) {
|
||||
if (header.layers() == 0) {
|
||||
BTreePointer pointer = new BTreePointer(header);
|
||||
while (buffer.hasMore()) {
|
||||
@ -60,7 +56,7 @@ public class BTreeReader {
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
public void rejectEntries(BTreeQueryBuffer buffer) {
|
||||
public void rejectEntries(LongQueryBuffer buffer) {
|
||||
if (header.layers() == 0) {
|
||||
BTreePointer pointer = new BTreePointer(header);
|
||||
while (buffer.hasMore()) {
|
||||
@ -70,13 +66,13 @@ public class BTreeReader {
|
||||
rejectSingle(buffer);
|
||||
}
|
||||
|
||||
private void retainSingle(BTreeQueryBuffer buffer) {
|
||||
private void retainSingle(LongQueryBuffer buffer) {
|
||||
|
||||
BTreePointer pointer = new BTreePointer(header);
|
||||
|
||||
for (; buffer.hasMore(); pointer.resetToRoot()) {
|
||||
|
||||
long val = buffer.currentValue() & ctx.equalityMask();
|
||||
long val = buffer.currentValue();
|
||||
|
||||
if (!pointer.walkToData(val)) {
|
||||
buffer.rejectAndAdvance();
|
||||
@ -87,12 +83,12 @@ public class BTreeReader {
|
||||
}
|
||||
}
|
||||
|
||||
private void rejectSingle(BTreeQueryBuffer buffer) {
|
||||
private void rejectSingle(LongQueryBuffer buffer) {
|
||||
BTreePointer pointer = new BTreePointer(header);
|
||||
|
||||
for (; buffer.hasMore(); pointer.resetToRoot()) {
|
||||
|
||||
long val = buffer.currentValue() & ctx.equalityMask();
|
||||
long val = buffer.currentValue();
|
||||
|
||||
if (pointer.walkToData(val) && pointer.containsData(val)) {
|
||||
buffer.rejectAndAdvance();
|
||||
@ -108,31 +104,53 @@ public class BTreeReader {
|
||||
*
|
||||
* @return file offset of entry matching keyRaw, negative if absent
|
||||
*/
|
||||
public long findEntry(final long keyRaw) {
|
||||
final long key = keyRaw & ctx.equalityMask();
|
||||
|
||||
public long findEntry(final long key) {
|
||||
BTreePointer ip = new BTreePointer(header);
|
||||
|
||||
while (!ip.isDataLayer())
|
||||
ip.walkToChild(key);
|
||||
if (!ip.walkToChild(key))
|
||||
return -1;
|
||||
|
||||
return ip.findData(key);
|
||||
}
|
||||
|
||||
public void readData(long[] data, int n, long pos) {
|
||||
file.read(data, n, header.dataOffsetLongs() + pos);
|
||||
public void readData(long[] buf, int n, long pos) {
|
||||
data.get(pos, pos + n, buf);
|
||||
}
|
||||
|
||||
public long[] queryData(long[] urls, int offset) {
|
||||
public long[] queryData(long[] keys, int offset) {
|
||||
BTreePointer pointer = new BTreePointer(header);
|
||||
|
||||
long[] ret = new long[urls.length];
|
||||
long[] ret = new long[keys.length];
|
||||
|
||||
for (int i = 0; i < urls.length; i++, pointer.resetToRoot()) {
|
||||
if (pointer.walkToData(urls[i])) {
|
||||
long dataAddress = pointer.findData(urls[i]);
|
||||
if (dataAddress >= 0) {
|
||||
ret[i] = file.get(dataAddress + offset);
|
||||
// this function could be re-written like retain() and would be
|
||||
// much faster
|
||||
|
||||
if (header.layers() == 0) {
|
||||
long searchStart = 0;
|
||||
for (int i = 0; i < keys.length; i++) {
|
||||
long key = keys[i];
|
||||
searchStart = data.binarySearchN(ctx.entrySize(), key, searchStart, data.size);
|
||||
if (searchStart < 0) {
|
||||
searchStart = LongArraySearch.decodeSearchMiss(searchStart);
|
||||
}
|
||||
else {
|
||||
ret[i] = data.get(searchStart + offset);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
else {
|
||||
for (int i = 0; i < keys.length; i++) {
|
||||
if (i > 0) {
|
||||
pointer.resetToRoot();
|
||||
}
|
||||
|
||||
if (pointer.walkToData(keys[i])) {
|
||||
long dataAddress = pointer.findData(keys[i]);
|
||||
if (dataAddress >= 0) {
|
||||
ret[i] = data.get(dataAddress + offset);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -140,25 +158,6 @@ public class BTreeReader {
|
||||
return ret;
|
||||
}
|
||||
|
||||
/** Find the range of values so that prefixStart <= n < prefixNext */
|
||||
public LongLongImmutablePair getRangeForPrefix(long prefixStart, long prefixNext) {
|
||||
long lowerBoundStart = lowerBound(prefixStart);
|
||||
long lowerBoundEnd = lowerBound(prefixNext);
|
||||
|
||||
return new LongLongImmutablePair(lowerBoundStart, lowerBoundEnd);
|
||||
}
|
||||
|
||||
private long lowerBound(long key) {
|
||||
key &= ctx.equalityMask();
|
||||
|
||||
BTreePointer ip = new BTreePointer(header);
|
||||
|
||||
while (!ip.isDataLayer())
|
||||
ip.walkToChild(key);
|
||||
|
||||
return ip.findDataLower(key);
|
||||
}
|
||||
|
||||
private class BTreePointer {
|
||||
private final long[] layerOffsets;
|
||||
|
||||
@ -190,18 +189,13 @@ public class BTreeReader {
|
||||
}
|
||||
|
||||
public boolean walkToChild(long key) {
|
||||
final long indexAddress = header.indexOffsetLongs();
|
||||
|
||||
final long indexLayerBlockOffset = layerOffsets[layer] + offset;
|
||||
final long searchStart = layerOffsets[layer] + offset;
|
||||
|
||||
final long searchStart = indexAddress + indexLayerBlockOffset;
|
||||
final long nextLayerOffset = (int)(indexSearcher.binarySearchLower(key, searchStart, ctx.BLOCK_SIZE_WORDS()) - searchStart);
|
||||
|
||||
if (nextLayerOffset < 0)
|
||||
return false;
|
||||
final long nextLayerOffset = (int) index.binarySearchUpperBound(key, searchStart, searchStart + ctx.BLOCK_SIZE_WORDS()) - searchStart;
|
||||
|
||||
layer --;
|
||||
boundary = file.get(searchStart + offset);
|
||||
boundary = index.get(searchStart + nextLayerOffset);
|
||||
offset = ctx.BLOCK_SIZE_WORDS() * (offset + nextLayerOffset);
|
||||
|
||||
return true;
|
||||
@ -225,41 +219,39 @@ public class BTreeReader {
|
||||
}
|
||||
|
||||
public long findData(long key) {
|
||||
if (layer > 0) {
|
||||
if (layer >= 0) {
|
||||
throw new IllegalStateException("Looking for data in an index layer");
|
||||
}
|
||||
|
||||
long searchStart = header.dataOffsetLongs() + offset * ctx.entrySize();
|
||||
int numEntries = min((int)(header.numEntries() - offset), ctx.BLOCK_SIZE_WORDS());
|
||||
long searchStart = offset * ctx.entrySize();
|
||||
long remainingTotal = dataBlockEnd - offset * ctx.entrySize();
|
||||
long remainingBlock;
|
||||
|
||||
return dataSearcher.binarySearch(key, searchStart, numEntries);
|
||||
remainingBlock = (layerOffsets.length == 0)
|
||||
? remainingTotal
|
||||
: (long) ctx.BLOCK_SIZE_WORDS() * ctx.entrySize();
|
||||
|
||||
long searchEnd = searchStart + (int) min(remainingTotal, remainingBlock);
|
||||
|
||||
return data.binarySearchN(ctx.entrySize(), key, searchStart, searchEnd);
|
||||
}
|
||||
|
||||
public long findDataLower(long key) {
|
||||
if (layer > 0) {
|
||||
throw new IllegalStateException("Looking for data in an index layer");
|
||||
}
|
||||
|
||||
long searchStart = header.dataOffsetLongs() + offset * ctx.entrySize();
|
||||
int numEntries = min((int)(header.numEntries() - offset), ctx.BLOCK_SIZE_WORDS());
|
||||
|
||||
return dataSearcher.binarySearchLower(key, searchStart, numEntries);
|
||||
}
|
||||
|
||||
public void retainData(BTreeQueryBuffer buffer) {
|
||||
public void retainData(LongQueryBuffer buffer) {
|
||||
|
||||
long dataOffset = findData(buffer.currentValue());
|
||||
if (dataOffset >= 0) {
|
||||
buffer.retainAndAdvance();
|
||||
|
||||
long blockBase = header.dataOffsetLongs() + offset * ctx.entrySize();
|
||||
long relOffset = dataOffset - blockBase;
|
||||
if (buffer.hasMore() && buffer.currentValue() <= boundary) {
|
||||
long blockBase = offset * ctx.entrySize();
|
||||
long relOffset = dataOffset - blockBase;
|
||||
|
||||
int numEntries =
|
||||
min((int) (header.numEntries() - relOffset), ctx.BLOCK_SIZE_WORDS()) / ctx.entrySize();
|
||||
long remainingTotal = dataBlockEnd - dataOffset;
|
||||
long remainingBlock = ctx.BLOCK_SIZE_WORDS() - relOffset;
|
||||
|
||||
if (buffer.currentValue() <= boundary) {
|
||||
file.retain(buffer, boundary, dataOffset, numEntries, ctx.equalityMask(), ctx.entrySize());
|
||||
long searchEnd = dataOffset + (int) min(remainingTotal, remainingBlock);
|
||||
|
||||
data.range(dataOffset, searchEnd).retainN(buffer, ctx.entrySize(), boundary);
|
||||
}
|
||||
}
|
||||
else {
|
||||
@ -268,20 +260,22 @@ public class BTreeReader {
|
||||
|
||||
}
|
||||
|
||||
public void rejectData(BTreeQueryBuffer buffer) {
|
||||
public void rejectData(LongQueryBuffer buffer) {
|
||||
|
||||
long dataOffset = findData(buffer.currentValue());
|
||||
if (dataOffset >= 0) {
|
||||
buffer.rejectAndAdvance();
|
||||
|
||||
long blockBase = header.dataOffsetLongs() + offset * ctx.entrySize();
|
||||
long relOffset = dataOffset - blockBase;
|
||||
if (buffer.hasMore() && buffer.currentValue() <= boundary) {
|
||||
long blockBase = offset * ctx.entrySize();
|
||||
long relOffset = dataOffset - blockBase;
|
||||
|
||||
int numEntries =
|
||||
min((int) (header.numEntries() - relOffset), ctx.BLOCK_SIZE_WORDS()) / ctx.entrySize();
|
||||
long remainingTotal = dataBlockEnd - dataOffset;
|
||||
long remainingBlock = ctx.BLOCK_SIZE_WORDS() - relOffset;
|
||||
|
||||
if (buffer.currentValue() <= boundary) {
|
||||
file.reject(buffer, boundary, dataOffset, numEntries, ctx.equalityMask(), ctx.entrySize());
|
||||
long searchEnd = dataOffset + (int) min(remainingTotal, remainingBlock);
|
||||
|
||||
data.range(dataOffset, searchEnd).rejectN(buffer, ctx.entrySize(), boundary);
|
||||
}
|
||||
}
|
||||
else {
|
||||
|
@ -0,0 +1,21 @@
package nu.marginalia.util.btree;

import nu.marginalia.util.array.buffer.LongQueryBuffer;
import nu.marginalia.util.btree.model.BTreeHeader;

public interface BTreeReaderIf {
    BTreeHeader getHeader();

    int numEntries();

    void retainEntries(LongQueryBuffer buffer);

    void rejectEntries(LongQueryBuffer buffer);

    long findEntry(long keyRaw);

    void readData(long[] data, int n, long pos);

    long[] queryData(long[] urls, int offset);

}
@ -1,8 +1,8 @@
|
||||
package nu.marginalia.util.btree;
|
||||
|
||||
import nu.marginalia.util.array.LongArray;
|
||||
import nu.marginalia.util.btree.model.BTreeContext;
|
||||
import nu.marginalia.util.btree.model.BTreeHeader;
|
||||
import nu.marginalia.util.multimap.MultimapFileLongSlice;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
@ -11,10 +11,10 @@ import java.io.IOException;
|
||||
|
||||
public class BTreeWriter {
|
||||
private final BTreeContext ctx;
|
||||
private final MultimapFileLongSlice map;
|
||||
private final LongArray map;
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
|
||||
public BTreeWriter(MultimapFileLongSlice map, BTreeContext ctx) {
|
||||
public BTreeWriter(LongArray map, BTreeContext ctx) {
|
||||
this.map = map;
|
||||
this.ctx = ctx;
|
||||
}
|
||||
@ -42,8 +42,10 @@ public class BTreeWriter {
|
||||
|
||||
header.write(map, offset);
|
||||
|
||||
final long startRange = header.dataOffsetLongs();
|
||||
final long endRange = startRange + (long) numEntries * ctx.entrySize();
|
||||
|
||||
var slice = map.atOffset(header.dataOffsetLongs());
|
||||
var slice = map.range(startRange, endRange);
|
||||
|
||||
BTreeDogEar dogEar = new BTreeDogEar(ctx, header, slice);
|
||||
|
||||
@ -53,13 +55,11 @@ public class BTreeWriter {
|
||||
logger.error("Dog ear was not overwritten: {}", header);
|
||||
}
|
||||
|
||||
if (header.layers() < 1) { // The data is too small to benefit from indexing
|
||||
return ctx.calculateSize(numEntries);
|
||||
}
|
||||
else {
|
||||
if (header.layers() >= 1) { // Omit layer if data fits within a single block
|
||||
writeIndex(header);
|
||||
return ctx.calculateSize(numEntries);
|
||||
}
|
||||
|
||||
return ctx.calculateSize(numEntries);
|
||||
}
|
||||
|
||||
public static BTreeHeader makeHeader(BTreeContext ctx, long offset, int numEntries) {
|
||||
@ -96,7 +96,8 @@ public class BTreeWriter {
|
||||
|
||||
}
|
||||
|
||||
private void writeIndexLayer(BTreeHeader header, long[] layerOffsets,
|
||||
private void writeIndexLayer(BTreeHeader header,
|
||||
long[] layerOffsets,
|
||||
final long indexedDataStepSize,
|
||||
final int layer) {
|
||||
|
||||
@ -115,13 +116,20 @@ public class BTreeWriter {
|
||||
dataPtr += indexedDataStepSize)
|
||||
{
|
||||
long dataOffset = dataOffsetBase + (dataPtr + lastDataEntryOffset) * entrySize;
|
||||
map.put(indexOffsetBase + indexWord++, map.get(dataOffset) & ctx.equalityMask());
|
||||
map.set(indexOffsetBase + indexWord++, map.get(dataOffset));
|
||||
}
|
||||
|
||||
// Fill the remaining block with LONG_MAX
|
||||
map.setRange(indexOffsetBase+indexWord,
|
||||
(int) (ctx.BLOCK_SIZE_WORDS() - (indexWord % ctx.BLOCK_SIZE_WORDS())),
|
||||
Long.MAX_VALUE);
|
||||
// If the index block is not completely filled with data,
|
||||
// top up the remaining index block with LONG_MAX
|
||||
|
||||
final long trailerStart = indexOffsetBase + indexWord;
|
||||
final long trailerEnd = trailerStart
|
||||
+ ctx.BLOCK_SIZE_WORDS()
|
||||
- (int) (indexWord % ctx.BLOCK_SIZE_WORDS());
|
||||
|
||||
if (trailerStart < trailerEnd) {
|
||||
map.fill(trailerStart, trailerEnd, Long.MAX_VALUE);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
@ -1,9 +1,9 @@
package nu.marginalia.util.btree;

import nu.marginalia.util.multimap.MultimapFileLongSlice;
import nu.marginalia.util.array.LongArray;

import java.io.IOException;

public interface WriteCallback {
    void write(MultimapFileLongSlice slice) throws IOException;
    void write(LongArray slice) throws IOException;
}

@ -4,22 +4,28 @@ import nu.marginalia.util.btree.BTreeWriter;

public record BTreeContext(int MAX_LAYERS,
                           int entrySize,
                           long equalityMask,
                           int BLOCK_SIZE_BITS,
                           int BLOCK_SIZE_WORDS) {

    public BTreeContext(int MAX_LAYERS, int entrySize, long equalityMask, int BLOCK_SIZE_BITS) {
        this(MAX_LAYERS, entrySize, equalityMask, BLOCK_SIZE_BITS, 1 << BLOCK_SIZE_BITS);
    // 8 pages is the breaking point where using a B-tree is actually advantageous
    // over just binary searching in a sorted list. Above 8 pages, binary search will
    // incur at worst four page faults. A B-tree will incur three page faults up until
    // ~100k-200k entries with typical configurations.

    private static final int MIN_PAGES_FOR_BTREE = 8;

    public BTreeContext(int MAX_LAYERS, int entrySize, int BLOCK_SIZE_BITS) {
        this(MAX_LAYERS, entrySize, BLOCK_SIZE_BITS, 1 << BLOCK_SIZE_BITS);
    }

    public long calculateSize(int numEntries) {
        var header = BTreeWriter.makeHeader(this, 0, numEntries);

        return header.dataOffsetLongs() + (long)numEntries * entrySize;
        return header.dataOffsetLongs() + (long) numEntries * entrySize + 4;
    }

    public int numIndexLayers(int numEntries) {
        if (numEntries <= BLOCK_SIZE_WORDS*2/entrySize) {
        if (numEntries <= BLOCK_SIZE_WORDS*MIN_PAGES_FOR_BTREE/entrySize) {
            return 0;
        }
        for (int i = 1; i < MAX_LAYERS; i++) {
@ -37,12 +43,8 @@ public record BTreeContext(int MAX_LAYERS,

    public long indexLayerSize(int numWords, int level) {
        final long layerSize = 1L<<(BLOCK_SIZE_BITS*(level+1));
        final long numBlocks = numWords / layerSize;

        if (numWords % layerSize != 0) {
            return BLOCK_SIZE_WORDS * (numBlocks + 1);
        }
        return BLOCK_SIZE_WORDS * numBlocks;
        return BLOCK_SIZE_WORDS * (numWords / layerSize + Long.signum(numWords % layerSize));
    }

}
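// Worked illustration (assumed configuration, not from the commit) of the MIN_PAGES_FOR_BTREE
// threshold above: with BLOCK_SIZE_BITS = 12 (so BLOCK_SIZE_WORDS = 4096) and entrySize = 1,
// numIndexLayers() returns 0 for anything up to 4096 * 8 / 1 = 32768 entries, meaning such
// trees are laid out as a plain sorted block and queried with binary search instead of
// going through an index layer.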
@ -1,6 +1,6 @@
package nu.marginalia.util.btree.model;

import nu.marginalia.util.multimap.MultimapFileLongSlice;
import nu.marginalia.util.array.LongArray;

public record BTreeHeader(int layers, int numEntries, long indexOffsetLongs, long dataOffsetLongs) {
    public BTreeHeader {
@ -28,10 +28,10 @@ public record BTreeHeader(int layers, int numEntries, long indexOffsetLongs, lon
        return padding;
    }

    public void write(MultimapFileLongSlice dest, long offset) {
        dest.put(offset, ((long) layers << 32L) | ((long)numEntries & 0xFFFF_FFFFL));
        dest.put(offset+1, indexOffsetLongs);
        dest.put(offset+2, dataOffsetLongs);
    public void write(LongArray dest, long offset) {
        dest.set(offset, ((long) layers << 32L) | ((long)numEntries & 0xFFFF_FFFFL));
        dest.set(offset+1, indexOffsetLongs);
        dest.set(offset+2, dataOffsetLongs);
    }

@ -0,0 +1,6 @@
package nu.marginalia.util.gregex;

import java.util.function.Predicate;

public interface GuardedRegex extends Predicate<String> {
}
@ -0,0 +1,62 @@
package nu.marginalia.util.gregex;

import org.intellij.lang.annotations.Language;

import java.util.regex.Pattern;


public class GuardedRegexFactory {

    // Regular expressions are slow, even compiled ones. Guarding them with startsWith, or even contains,
    // is something like an order of magnitude faster. This matters a lot in hot code.

    public static GuardedRegex startsWith(String prefix, @Language("RegExp") String regex) {
        return new GuardedRegexStartsWith(prefix, regex);
    }
    public static GuardedRegex endsWith(String suffix, @Language("RegExp") String regex) {
        return new GuardedRegexEndsWith(suffix, regex);
    }
    public static GuardedRegex contains(String substring, @Language("RegExp") String regex) {
        return new GuardedRegexContains(substring, regex);
    }
    public static GuardedRegex minLength(int minLength, @Language("RegExp") String regex) {
        return new GuardedRegexMinLength(minLength, regex);
    }

    private record GuardedRegexContains(String contains, Pattern pattern) implements GuardedRegex {
        public GuardedRegexContains(String contains, String pattern) {
            this(contains, Pattern.compile(pattern));
        }

        public boolean test(String s) {
            return s.contains(contains) && pattern.matcher(s).find();
        }
    }
    private record GuardedRegexMinLength(int minLength, Pattern pattern) implements GuardedRegex {
        public GuardedRegexMinLength(int minLength, String pattern) {
            this(minLength, Pattern.compile(pattern));
        }

        public boolean test(String s) {
            return s.length() >= minLength && pattern.matcher(s).find();
        }
    }
    private record GuardedRegexStartsWith(String start, Pattern pattern) implements GuardedRegex {
        public GuardedRegexStartsWith(String start, String pattern) {
            this(start, Pattern.compile(pattern));
        }

        public boolean test(String s) {
            return s.startsWith(start) && pattern.matcher(s).find();
        }
    }
    private record GuardedRegexEndsWith(String end, Pattern pattern) implements GuardedRegex {
        public GuardedRegexEndsWith(String end, String pattern) {
            this(end, Pattern.compile(pattern));
        }

        public boolean test(String s) {
            return s.endsWith(end) && pattern.matcher(s).find();
        }
    }
}
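// A minimal usage sketch (illustration only, not part of this commit): the guard string lets
// hot code skip the regex engine entirely for most inputs. The pattern below is an arbitrary
// example, not one used by the project.
//
//     GuardedRegex versionedName = GuardedRegexFactory.contains("-v", "-v\\d+$");
//
//     versionedName.test("report-v12");   // contains "-v", so the regex runs -> true
//     versionedName.test("plain-name");   // no "-v", the regex is never evaluated -> false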
@ -1,188 +0,0 @@
package nu.marginalia.util.hash;

import lombok.EqualsAndHashCode;
import lombok.Getter;
import nu.marginalia.util.multimap.MultimapFileLong;
import nu.marginalia.util.PrimeUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import static java.lang.Math.round;

/**
 * Spiritually influenced by GNU Trove's hash maps
 * LGPL 2.1
 */
public class LongPairHashMap {
    private static final Logger logger = LoggerFactory.getLogger(LongPairHashMap.class);
    private static final long MAGIC_WORD = 0xE00E00E00E0E0E0EL; // it's the data police

    private final long hashTableSize;
    private final MultimapFileLong data;
    private final long maxProbeLength;
    private int sz = 0;
    private static final int HEADER_SIZE = 2;

    private LongPairHashMap(MultimapFileLong data, long hashTableSize, long maxProbeLength) {
        this.data = data;
        this.hashTableSize = hashTableSize;
        this.maxProbeLength = maxProbeLength;
    }

    public static LongPairHashMap createNew(MultimapFileLong data, long size) {
        var tableSize = PrimeUtil.nextPrime(size, 1);
        var ret = new LongPairHashMap(data, tableSize, tableSize/2);

        data.put(0, MAGIC_WORD);
        data.put(1, tableSize);

        for (int i = 2; i < tableSize; i++) {
            data.put(HEADER_SIZE + 2L*i, 0);
        }

        return ret;
    }

    public static LongPairHashMap loadExisting(MultimapFileLong data) {
        long key = data.get(0);

        if (key != MAGIC_WORD) {
            logger.warn("LongPairHashMap lacks magic word, could this be garbage data?");
        }

        var hashTableSize = data.get(1);
        var maxProbeLength = hashTableSize / 10;

        return new LongPairHashMap(data, hashTableSize, maxProbeLength);
    }

    public int size() {
        return sz;
    }

    private CellData getCell(long idx) {
        long bufferIdx = 2*idx + HEADER_SIZE;
        long a = data.get(bufferIdx);
        long b = data.get(bufferIdx+1);
        return new CellData(a, b);
    }
    private void setCell(long idx, CellData cell) {
        long bufferIdx = 2*idx + HEADER_SIZE;
        data.put(bufferIdx, cell.first);
        data.put(bufferIdx+1, cell.second);
    }

    public CellData put(CellData data) {

        long hash = longHash(data.getKey()) & 0x7FFF_FFFFL;

        long idx = hash % hashTableSize;
        if (!getCell(hash % hashTableSize).isSet()) {
            return setValue(data, hash % hashTableSize);
        }

        return putRehash(data, idx, hash);

    }

    private CellData putRehash(CellData data, long idx, long hash) {
        final long pStride = 1 + (hash % (hashTableSize - 2));

        for (long j = 1; j < maxProbeLength; j++) {
            idx = idx - pStride;

            if (idx < 0) {
                idx += hashTableSize;
            }

            final var val = getCell(idx);

            if (!val.isSet()) {
                return setValue(data, idx);
            }
            else if (val.getKey() == data.getKey()) {
                logger.error("Double write?");
                return val;
            }
        }

        throw new IllegalStateException("DictionaryHashMap full @ size " + size() + "/" + hashTableSize + ", " + round((100.0*size()) / hashTableSize) + "%, key = " + data.getKey() + ",#"+hash);
    }

    private CellData setValue(CellData data, long cell) {
        sz++;

        setCell(cell, data);
        return data;
    }

    public CellData get(int key) {
        if (hashTableSize == 0) {
            return new CellData(0, 0);
        }
        final long hash = longHash(key) & 0x7FFF_FFFFL;

        var val = getCell(hash % hashTableSize);
        if (!val.isSet()) {
            return val;
        }
        else if (val.getKey() == key) {
            return val;
        }

        return getRehash(key, hash % hashTableSize, hash);
    }

    private CellData getRehash(int key, long idx, long hash) {
        final long pStride = 1 + (hash % (hashTableSize - 2));

        for (long j = 1; j < maxProbeLength; j++) {
            idx = idx - pStride;

            if (idx < 0) {
                idx += hashTableSize;
            }

            final var val = getCell(idx);

            if (!val.isSet()) {
                return val;
            }
            else if (val.getKey() == key) {
                return val;
            }
        }

        throw new IllegalStateException("DictionaryHashMap full @ size " + size() + "/" + hashTableSize + ", " + round((100.0*size()) / hashTableSize) + "%");
    }

    private long longHash(long x) {
        return x;
    }

    @Getter @EqualsAndHashCode
    public static class CellData {
        final long first;
        final long second;

        public CellData(long key, long offset) {
            first = key | 0x8000_0000_000_000L;
            second = offset;
        }

        public long getKey() {
            return first & ~0x8000_0000_000_000L;
        }
        public long getOffset() {
            return second;
        }

        public boolean isSet() {
            return first != 0 || second != 0L;
        }
    }

    public void close() throws Exception {
        data.close();
    }
}
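The deleted LongPairHashMap stored (key, offset) cells in an open-addressed table backed by a MultimapFileLong, probing with a stride of 1 + hash % (tableSize - 2) over a prime-sized table. A rough usage sketch of the removed API; the file path and sizes are placeholders (and the java.nio.file.Path import is assumed), not values from this commit:

    // Sketch only: how the removed class was typically driven.
    MultimapFileLong file = MultimapFileLong.forOutput(Path.of("/tmp/pairs.dat"), 1_000_000);
    LongPairHashMap map = LongPairHashMap.createNew(file, 1_000_000);

    // CellData packs a key and an offset; the high bit of 'first' marks the cell as occupied.
    map.put(new LongPairHashMap.CellData(1234L, 5678L));

    LongPairHashMap.CellData cell = map.get(1234);
    if (cell.isSet()) {
        long offset = cell.getOffset(); // 5678
    }

    map.close(); // also closes the underlying MultimapFileLong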
@ -6,8 +6,6 @@ import nu.marginalia.util.language.processing.model.KeywordMetadata;
|
||||
import nu.marginalia.util.language.processing.model.WordRep;
|
||||
import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict;
|
||||
import nu.marginalia.wmsa.edge.index.model.EdgePageWordFlags;
|
||||
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
|
||||
import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet;
|
||||
import nu.marginalia.wmsa.edge.model.crawl.EdgePageWords;
|
||||
|
||||
import javax.inject.Inject;
|
||||
@ -33,7 +31,7 @@ public class DocumentKeywordExtractor {
|
||||
}
|
||||
|
||||
|
||||
public EdgePageWordSet extractKeywordsMinimal(DocumentLanguageData documentLanguageData, KeywordMetadata keywordMetadata) {
|
||||
public EdgePageWords extractKeywordsMinimal(DocumentLanguageData documentLanguageData, KeywordMetadata keywordMetadata) {
|
||||
|
||||
List<WordRep> titleWords = extractTitleWords(documentLanguageData);
|
||||
List<WordRep> wordsNamesAll = nameCounter.count(documentLanguageData, 2);
|
||||
@ -47,15 +45,15 @@ public class DocumentKeywordExtractor {
|
||||
|
||||
List<String> artifacts = getArtifacts(documentLanguageData);
|
||||
|
||||
keywordMetadata.flagsTemplate().add(EdgePageWordFlags.Simple);
|
||||
WordsBuilder wordsBuilder = new WordsBuilder();
|
||||
|
||||
return new EdgePageWordSet(
|
||||
createWords(keywordMetadata, IndexBlock.Title, titleWords),
|
||||
EdgePageWords.withBlankMetadata(IndexBlock.Artifacts, artifacts)
|
||||
);
|
||||
createWords(wordsBuilder, keywordMetadata, titleWords, 0);
|
||||
artifacts.forEach(wordsBuilder::addWithBlankMetadata);
|
||||
|
||||
return wordsBuilder.build();
|
||||
}
|
||||
|
||||
public EdgePageWordSet extractKeywords(DocumentLanguageData documentLanguageData, KeywordMetadata keywordMetadata) {
|
||||
public EdgePageWords extractKeywords(DocumentLanguageData documentLanguageData, KeywordMetadata keywordMetadata) {
|
||||
|
||||
List<WordRep> titleWords = extractTitleWords(documentLanguageData);
|
||||
|
||||
@ -72,26 +70,25 @@ public class DocumentKeywordExtractor {
|
||||
|
||||
List<String> artifacts = getArtifacts(documentLanguageData);
|
||||
|
||||
var wordSet = new EdgePageWordSet(
|
||||
createWords(keywordMetadata, IndexBlock.Title, titleWords),
|
||||
createWords(keywordMetadata, IndexBlock.Tfidf_High, wordsTfIdf),
|
||||
createWords(keywordMetadata, IndexBlock.Subjects, subjects),
|
||||
EdgePageWords.withBlankMetadata(IndexBlock.Artifacts, artifacts)
|
||||
);
|
||||
WordsBuilder wordsBuilder = new WordsBuilder();
|
||||
|
||||
getSimpleWords(keywordMetadata, wordSet, documentLanguageData,
|
||||
IndexBlock.Words_1, IndexBlock.Words_2, IndexBlock.Words_4, IndexBlock.Words_8, IndexBlock.Words_16Plus);
|
||||
createWords(wordsBuilder, keywordMetadata, titleWords, 0);
|
||||
createWords(wordsBuilder, keywordMetadata, wordsTfIdf, EdgePageWordFlags.TfIdfHigh.asBit());
|
||||
createWords(wordsBuilder, keywordMetadata, subjects, 0);
|
||||
|
||||
return wordSet;
|
||||
getSimpleWords(wordsBuilder, keywordMetadata, documentLanguageData);
|
||||
|
||||
artifacts.forEach(wordsBuilder::addWithBlankMetadata);
|
||||
|
||||
return wordsBuilder.build();
|
||||
}
|
||||
|
||||
|
||||
public void getWordPositions(KeywordMetadata keywordMetadata, DocumentLanguageData dld) {
|
||||
Map<String, Integer> ret = keywordMetadata.positionMask();
|
||||
|
||||
int posCtr = 0;
|
||||
for (var sent : dld.titleSentences) {
|
||||
int posBit = (int)((1L << (posCtr/4)) & 0xFFFF_FFFFL);
|
||||
int posBit = 1;
|
||||
|
||||
for (var word : sent) {
|
||||
ret.merge(word.stemmed(), posBit, this::bitwiseOr);
|
||||
@ -101,9 +98,11 @@ public class DocumentKeywordExtractor {
|
||||
ret.merge(sent.constructStemmedWordFromSpan(span), posBit, this::bitwiseOr);
|
||||
}
|
||||
}
|
||||
posCtr+=4;
|
||||
|
||||
int pos = 1;
|
||||
int line = 0;
|
||||
for (var sent : dld.sentences) {
|
||||
int posBit = (int)((1L << (posCtr/4)) & 0xFFFF_FFFFL);
|
||||
int posBit = (int)((1L << pos) & 0xFFFF_FFFFL);
|
||||
|
||||
for (var word : sent) {
|
||||
ret.merge(word.stemmed(), posBit, this::bitwiseOr);
|
||||
@ -113,7 +112,28 @@ public class DocumentKeywordExtractor {
|
||||
ret.merge(sent.constructStemmedWordFromSpan(span), posBit, this::bitwiseOr);
|
||||
}
|
||||
|
||||
posCtr++;
|
||||
if (pos < 4) pos ++;
|
||||
else if (pos < 8) {
|
||||
if (++line >= 2) {
|
||||
pos++;
|
||||
line = 0;
|
||||
}
|
||||
}
|
||||
else if (pos < 24) {
|
||||
if (++line >= 4) {
|
||||
pos++;
|
||||
line = 0;
|
||||
}
|
||||
}
|
||||
else if (pos < 64) {
|
||||
if (++line > 8) {
|
||||
pos++;
|
||||
line = 0;
|
||||
}
|
||||
}
|
||||
else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -122,43 +142,32 @@ public class DocumentKeywordExtractor {
|
||||
}
|
||||
|
||||
|
||||
private void getSimpleWords(KeywordMetadata metadata, EdgePageWordSet wordSet, DocumentLanguageData documentLanguageData, IndexBlock... blocks) {
|
||||
private void getSimpleWords(WordsBuilder wordsBuilder, KeywordMetadata metadata, DocumentLanguageData documentLanguageData) {
|
||||
|
||||
EnumSet<EdgePageWordFlags> flagsTemplate = EnumSet.noneOf(EdgePageWordFlags.class);
|
||||
|
||||
int start = 0;
|
||||
int lengthGoal = 32;
|
||||
for (var sent : documentLanguageData.sentences) {
|
||||
|
||||
for (int blockIdx = 0; blockIdx < blocks.length && start < documentLanguageData.sentences.length; blockIdx++) {
|
||||
IndexBlock block = blocks[blockIdx];
|
||||
Set<EdgePageWords.Entry> words = new HashSet<>(lengthGoal+100);
|
||||
if (wordsBuilder.size() > 1500)
|
||||
break;
|
||||
|
||||
int pos;
|
||||
int length = 0;
|
||||
for (pos = start; pos < documentLanguageData.sentences.length && length < lengthGoal; pos++) {
|
||||
var sent = documentLanguageData.sentences[pos];
|
||||
length += sent.length();
|
||||
|
||||
for (var word : sent) {
|
||||
if (!word.isStopWord()) {
|
||||
String w = AsciiFlattener.flattenUnicode(word.wordLowerCase());
|
||||
if (WordPatterns.singleWordQualitiesPredicate.test(w)) {
|
||||
words.add(new EdgePageWords.Entry(w, metadata.forWord(flagsTemplate, word.stemmed())));
|
||||
}
|
||||
for (var word : sent) {
|
||||
if (!word.isStopWord()) {
|
||||
String w = AsciiFlattener.flattenUnicode(word.wordLowerCase());
|
||||
if (WordPatterns.singleWordQualitiesPredicate.test(w)) {
|
||||
wordsBuilder.add(w, metadata.forWord(flagsTemplate, word.stemmed()));
|
||||
}
|
||||
}
|
||||
|
||||
for (var names : keywordExtractor.getNames(sent)) {
|
||||
var rep = new WordRep(sent, names);
|
||||
String w = AsciiFlattener.flattenUnicode(rep.word);
|
||||
|
||||
words.add(new EdgePageWords.Entry(w, metadata.forWord(flagsTemplate, rep.stemmed)));
|
||||
}
|
||||
}
|
||||
wordSet.append(block, words);
|
||||
start = pos;
|
||||
lengthGoal+=32;
|
||||
|
||||
for (var names : keywordExtractor.getNames(sent)) {
|
||||
var rep = new WordRep(sent, names);
|
||||
String w = AsciiFlattener.flattenUnicode(rep.word);
|
||||
|
||||
wordsBuilder.add(w, metadata.forWord(flagsTemplate, rep.stemmed));
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private static final Pattern mailLikePattern = Pattern.compile("[a-zA-Z0-9._\\-]+@[a-zA-Z0-9]+(\\.[a-zA-Z0-9]+)+");
|
||||
@ -197,11 +206,11 @@ public class DocumentKeywordExtractor {
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
public EdgePageWords createWords(KeywordMetadata metadata,
|
||||
IndexBlock block,
|
||||
Collection<WordRep> words) {
|
||||
public void createWords(WordsBuilder wordsBuilder,
|
||||
KeywordMetadata metadata,
|
||||
Collection<WordRep> words,
|
||||
long additionalMeta) {
|
||||
|
||||
Set<EdgePageWords.Entry> entries = new HashSet<>(words.size());
|
||||
for (var word : words) {
|
||||
|
||||
String flatWord = AsciiFlattener.flattenUnicode(word.word);
|
||||
@ -209,9 +218,31 @@ public class DocumentKeywordExtractor {
|
||||
continue;
|
||||
}
|
||||
|
||||
entries.add(new EdgePageWords.Entry(flatWord, metadata.forWord(metadata.flagsTemplate(), word.stemmed)));
|
||||
wordsBuilder.add(flatWord, metadata.forWord(metadata.wordFlagsTemplate(), word.stemmed) | additionalMeta);
|
||||
}
|
||||
}
|
||||
|
||||
private static class WordsBuilder {
|
||||
private final EdgePageWords words = new EdgePageWords(1600);
|
||||
private final Set<String> seen = new HashSet<>(1600);
|
||||
|
||||
public void add(String word, long meta) {
|
||||
if (seen.add(word)) {
|
||||
words.add(word, meta);
|
||||
}
|
||||
}
|
||||
public void addWithBlankMetadata(String word) {
|
||||
if (seen.add(word)) {
|
||||
words.addJustNoMeta(word);
|
||||
}
|
||||
}
|
||||
|
||||
return new EdgePageWords(block, entries);
|
||||
public EdgePageWords build() {
|
||||
return words;
|
||||
}
|
||||
|
||||
public int size() {
|
||||
return seen.size();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -1,7 +1,7 @@
package nu.marginalia.util.language.processing;

import com.github.jknack.handlebars.internal.lang3.StringUtils;
import gnu.trove.map.hash.TObjectIntHashMap;
import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;
import nu.marginalia.util.language.WordPatterns;
import nu.marginalia.util.language.processing.model.DocumentLanguageData;
import nu.marginalia.util.language.processing.model.KeywordMetadata;
@ -27,7 +27,7 @@ public class KeywordCounter {
    }

    public List<WordRep> countHisto(KeywordMetadata keywordMetadata, DocumentLanguageData dld) {
        TObjectIntHashMap<String> counts = new TObjectIntHashMap<>(10_000, 0.7f);
        Object2IntOpenHashMap<String> counts = new Object2IntOpenHashMap<>(10_000, 0.7f);
        HashMap<String, HashSet<WordRep>> instances = new HashMap<>(15000);

@ -41,7 +41,8 @@

                var rep = new WordRep(sent, span);

                counts.adjustOrPutValue(rep.stemmed, 1, 1);
                counts.mergeInt(rep.stemmed, 1, Integer::sum);

                var instanceSet = instances.computeIfAbsent(rep.stemmed, k -> new HashSet<>(500));
                if (instanceSet.size() < 250) {
                    instanceSet.add(rep);
@ -54,7 +55,8 @@

        int maxVal = maxValue(counts);

        counts.forEachEntry((key, cnt) -> {

        counts.forEach((key, cnt) -> {
            int value = getTermValue(key, cnt, maxVal);

            tfIdf.put(key, new WordFrequencyData(cnt, value));
@ -62,18 +64,18 @@
            if (cnt > 1 && value > 100) {
                tfIdfHigh.addAll(instances.get(key));
            }

            return true;
        });

        return tfIdfHigh;
    }

    private int maxValue(TObjectIntHashMap<?> map) {
    private int maxValue(Object2IntOpenHashMap<?> map) {
        int maxC = 0;

        for (int c : map.values()) {
            maxC = max(c, maxC);
        }

        return maxC;
    }
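The KeywordCounter change swaps GNU Trove's TObjectIntHashMap for fastutil's Object2IntOpenHashMap; the counting call changes from adjustOrPutValue to mergeInt, but both yield the same primitive-int count. A minimal side-by-side sketch of the two idioms, using a made-up word list as input:

    // Both loops produce identical counts; only the map implementation differs.
    String[] words = {"ada", "lovelace", "ada"};  // made-up example input

    TObjectIntHashMap<String> trove = new TObjectIntHashMap<>(10_000, 0.7f);
    for (String w : words) trove.adjustOrPutValue(w, 1, 1);       // add 1 if present, else put 1

    Object2IntOpenHashMap<String> fastutil = new Object2IntOpenHashMap<>(10_000, 0.7f);
    for (String w : words) fastutil.mergeInt(w, 1, Integer::sum); // same effect, unboxed ints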
@ -32,7 +32,9 @@ public class KeywordExtractor {
|
||||
if (sentence.separators[i-2] == WordSeparator.COMMA) { continue; }
|
||||
if (sentence.separators[i-1] == WordSeparator.COMMA) { i++; continue; }
|
||||
|
||||
if (isProperNoun(i, sentence) && (isJoiner(sentence, i-1) || isProperNoun(i-1, sentence)) && isProperNoun(i-2, sentence))
|
||||
if (isProperNoun(i, sentence)
|
||||
&& (isJoiner(sentence, i-1) || isProperNoun(i-1, sentence))
|
||||
&& isProperNoun(i-2, sentence))
|
||||
spans.add(new WordSpan(i-2, i+1));
|
||||
}
|
||||
|
||||
@ -42,59 +44,91 @@ public class KeywordExtractor {
|
||||
if (sentence.separators[i-1] == WordSeparator.COMMA) { i+=2; continue; }
|
||||
|
||||
if (isProperNoun(i, sentence) && isProperNoun(i-3, sentence)) {
|
||||
if (isProperNoun(i - 1, sentence) && isProperNoun(i - 2, sentence)) {
|
||||
if (isProperNoun(i - 1, sentence) && isProperNoun(i - 2, sentence))
|
||||
spans.add(new WordSpan(i-3, i+1));
|
||||
}
|
||||
else if (isJoiner(sentence, i-2) && sentence.posTags[i-1].equals("DT")) {
|
||||
else if (isJoiner(sentence, i-2) && sentence.posTags[i-1].equals("DT"))
|
||||
spans.add(new WordSpan(i-3, i+1));
|
||||
}
|
||||
else if ((isJoiner(sentence, i-1)||isProperNoun(i - 1, sentence)) && (isJoiner(sentence, i-2)||isProperNoun(i - 2, sentence))) {
|
||||
else if ((isJoiner(sentence, i-1) ||isProperNoun(i-1, sentence))
|
||||
&& (isJoiner(sentence, i-2)||isProperNoun(i-2, sentence)))
|
||||
spans.add(new WordSpan(i-3, i+1));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return spans.toArray(WordSpan[]::new);
|
||||
}
|
||||
|
||||
public WordSpan[] getNamesStrict(DocumentSentence sentence) {
|
||||
public WordSpan[] getKeywordsFromSentence(DocumentSentence sentence) {
|
||||
if (sentence.keywords != null) {
|
||||
return sentence.keywords.get();
|
||||
}
|
||||
List<WordSpan> spans = new ArrayList<>(sentence.length());
|
||||
|
||||
Set<String> topWords = Collections.emptySet();
|
||||
|
||||
for (int i = 0; i < sentence.length(); i++) {
|
||||
if (isProperNoun(i, sentence))
|
||||
if (isName(i, sentence, topWords) || isTopAdj(i, sentence, topWords))
|
||||
spans.add(new WordSpan(i, i+1));
|
||||
}
|
||||
|
||||
for (int i = 1; i < sentence.length(); i++) {
|
||||
if (sentence.separators[i-1] == WordSeparator.COMMA) { continue; }
|
||||
if (isProperNoun(i, sentence) && isProperNoun(i-1, sentence))
|
||||
spans.add(new WordSpan(i-1, i+1));
|
||||
|
||||
if (isName(i, sentence, topWords)) {
|
||||
if (isName(i - 1, sentence, topWords) || isTopAdj(i-1, sentence, topWords))
|
||||
spans.add(new WordSpan(i - 1, i + 1));
|
||||
}
|
||||
if (sentence.posTags[i].equals("CD") && isName(i-1, sentence, topWords)) {
|
||||
spans.add(new WordSpan(i - 1, i + 1));
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 2; i < sentence.length(); i++) {
|
||||
if (sentence.separators[i-2] == WordSeparator.COMMA) { continue; }
|
||||
if (sentence.separators[i-1] == WordSeparator.COMMA) { i++; continue; }
|
||||
if (isProperNoun(i, sentence) && (isJoiner(sentence, i-1) || isProperNoun(i-1, sentence)) && isProperNoun(i-2, sentence))
|
||||
spans.add(new WordSpan(i-2, i+1));
|
||||
if (sentence.separators[i-2] == WordSeparator.COMMA) { continue; }
|
||||
|
||||
if (isName(i, sentence, topWords)) {
|
||||
if ((isName(i-1, sentence, topWords) || isTopAdj(i-1, sentence, topWords))
|
||||
&& (isName(i-2, sentence, topWords) || isTopAdj(i-2, sentence, topWords))) {
|
||||
spans.add(new WordSpan(i - 2, i + 1));
|
||||
}
|
||||
else if ((isProperNoun(i-1, sentence) || isJoiner(sentence, i-1)) && isProperNoun(i-2, sentence)) {
|
||||
spans.add(new WordSpan(i - 2, i + 1));
|
||||
}
|
||||
}
|
||||
else if (sentence.posTags[i].equals("CD") && isName(i-1, sentence, topWords) && isName(i-2, sentence, topWords)) {
|
||||
spans.add(new WordSpan(i - 2, i + 1));
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 3; i < sentence.length(); i++) {
|
||||
if (sentence.separators[i-3] == WordSeparator.COMMA) { continue; }
|
||||
if (sentence.separators[i-2] == WordSeparator.COMMA) { i++; continue; }
|
||||
if (sentence.separators[i-1] == WordSeparator.COMMA) { i+=2; continue; }
|
||||
if (sentence.separators[i-2] == WordSeparator.COMMA) { i++; continue; }
|
||||
if (sentence.separators[i-3] == WordSeparator.COMMA) { continue; }
|
||||
|
||||
if (isProperNoun(i, sentence) && isProperNoun(i-3, sentence)) {
|
||||
if (isName(i, sentence, topWords) &&
|
||||
(isName(i-1, sentence, topWords) || isTopAdj(i-1, sentence, topWords)) &&
|
||||
(isName(i-2, sentence, topWords) || isTopAdj(i-2, sentence, topWords)) &&
|
||||
(isName(i-3, sentence, topWords) || isTopAdj(i-3, sentence, topWords))) {
|
||||
spans.add(new WordSpan(i - 3, i + 1));
|
||||
}
|
||||
else if (isProperNoun(i, sentence) && isProperNoun(i-3, sentence)) {
|
||||
if (isProperNoun(i - 1, sentence) && isProperNoun(i - 2, sentence)) {
|
||||
spans.add(new WordSpan(i-3, i+1));
|
||||
}
|
||||
else if (isJoiner(sentence, i-1) && sentence.posTags[i-2].equals("DT")) {
|
||||
spans.add(new WordSpan(i-3, i+1));
|
||||
}
|
||||
else if ((isProperNoun(i-1, sentence) || isJoiner(sentence, i-1)) && (isProperNoun(i-2, sentence)|| isJoiner(sentence, i-2))) {
|
||||
spans.add(new WordSpan(i-3, i + 1));
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
return spans.toArray(WordSpan[]::new);
|
||||
var ret = spans.toArray(WordSpan[]::new);
|
||||
sentence.keywords = new SoftReference<>(ret);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
public boolean isProperNoun(int i, DocumentSentence sent) {
|
||||
@ -149,139 +183,6 @@ public class KeywordExtractor {
|
||||
return true;
|
||||
}
|
||||
|
||||
public WordSpan[] getKeywordsFromSentence(DocumentSentence sentence) {
|
||||
if (sentence.keywords != null) {
|
||||
return sentence.keywords.get();
|
||||
}
|
||||
List<WordSpan> spans = new ArrayList<>(sentence.length());
|
||||
|
||||
Set<String> topWords = Collections.emptySet();
|
||||
|
||||
for (int i = 0; i < sentence.length(); i++) {
|
||||
if (isName(i, sentence, topWords) || isTopAdj(i, sentence, topWords))
|
||||
spans.add(new WordSpan(i, i+1));
|
||||
}
|
||||
|
||||
for (int i = 1; i < sentence.length(); i++) {
|
||||
if (sentence.separators[i-1] == WordSeparator.COMMA) { continue; }
|
||||
|
||||
if (isName(i, sentence, topWords)) {
|
||||
if (isName(i - 1, sentence, topWords) || isTopAdj(i-1, sentence, topWords))
|
||||
spans.add(new WordSpan(i - 1, i + 1));
|
||||
}
|
||||
if (sentence.posTags[i].equals("CD") && isName(i-1, sentence, topWords)) {
|
||||
spans.add(new WordSpan(i - 1, i + 1));
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 2; i < sentence.length(); i++) {
|
||||
if (sentence.separators[i-1] == WordSeparator.COMMA) { i++; continue; }
|
||||
if (sentence.separators[i-2] == WordSeparator.COMMA) { continue; }
|
||||
|
||||
if (isName(i, sentence, topWords)) {
|
||||
if ((isName(i-1, sentence, topWords) || isTopAdj(i-1, sentence, topWords))
|
||||
&& (isName(i-2, sentence, topWords) || isTopAdj(i-2, sentence, topWords))) {
|
||||
spans.add(new WordSpan(i - 2, i + 1));
|
||||
}
|
||||
else if ((isProperNoun(i-1, sentence) || isJoiner(sentence, i-1)) && isProperNoun(i-2, sentence)) {
|
||||
spans.add(new WordSpan(i - 2, i + 1));
|
||||
}
|
||||
}
|
||||
else if (sentence.posTags[i].equals("CD") && isName(i-1, sentence, topWords) && isName(i-2, sentence, topWords)) {
|
||||
spans.add(new WordSpan(i - 2, i + 1));
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 3; i < sentence.length(); i++) {
|
||||
if (sentence.separators[i-1] == WordSeparator.COMMA) { i+=2; continue; }
|
||||
if (sentence.separators[i-2] == WordSeparator.COMMA) { i++; continue; }
|
||||
if (sentence.separators[i-3] == WordSeparator.COMMA) { continue; }
|
||||
|
||||
if (isName(i, sentence, topWords) &&
|
||||
(isName(i-1, sentence, topWords) || isTopAdj(i-1, sentence, topWords)) &&
|
||||
(isName(i-2, sentence, topWords) || isTopAdj(i-2, sentence, topWords)) &&
|
||||
(isName(i-3, sentence, topWords) || isTopAdj(i-3, sentence, topWords))) {
|
||||
spans.add(new WordSpan(i - 3, i + 1));
|
||||
}
|
||||
else if (isProperNoun(i, sentence) && isProperNoun(i-3, sentence)) {
|
||||
if (isProperNoun(i - 1, sentence) && isProperNoun(i - 2, sentence)) {
|
||||
spans.add(new WordSpan(i-3, i+1));
|
||||
}
|
||||
else if (isJoiner(sentence, i-1) && sentence.posTags[i-2].equals("DT")) {
|
||||
spans.add(new WordSpan(i-3, i+1));
|
||||
}
|
||||
else if ((isProperNoun(i-1, sentence) || isJoiner(sentence, i-1)) && (isProperNoun(i-2, sentence)|| isJoiner(sentence, i-2))) {
|
||||
spans.add(new WordSpan(i-3, i + 1));
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
var ret = spans.toArray(WordSpan[]::new);
|
||||
sentence.keywords = new SoftReference<>(ret);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
public WordSpan[] getKeywordsFromSentenceStrict(DocumentSentence sentence, Set<String> topWords, boolean reducePartials) {
|
||||
List<WordSpan> spans = new ArrayList<>(sentence.length());
|
||||
|
||||
if (!reducePartials) {
|
||||
for (int i = 0; i < sentence.length(); i++) {
|
||||
if (topWords.contains(sentence.stemmedWords[i]))
|
||||
spans.add(new WordSpan(i, i + 1));
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 1; i < sentence.length(); i++) {
|
||||
if (sentence.separators[i-1] == WordSeparator.COMMA) { continue; }
|
||||
|
||||
if (topWords.contains(sentence.stemmedWords[i])
|
||||
&& !sentence.words[i].endsWith("'s")
|
||||
&& topWords.contains(sentence.stemmedWords[i-1])) {
|
||||
spans.add(new WordSpan(i-1, i + 1));
|
||||
}
|
||||
}
|
||||
for (int i = 2; i < sentence.length(); i++) {
|
||||
if (sentence.separators[i-2] == WordSeparator.COMMA) { continue; }
|
||||
if (sentence.separators[i-1] == WordSeparator.COMMA) { i++; continue; }
|
||||
|
||||
if (topWords.contains(sentence.stemmedWords[i])
|
||||
&& !sentence.words[i].endsWith("'s")
|
||||
&& (topWords.contains(sentence.stemmedWords[i-1]) || isJoiner(sentence, i-1))
|
||||
&& topWords.contains(sentence.stemmedWords[i-2])
|
||||
) {
|
||||
spans.add(new WordSpan(i-2, i + 1));
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 3; i < sentence.length(); i++) {
|
||||
if (sentence.separators[i-3] == WordSeparator.COMMA) { continue; }
|
||||
if (sentence.separators[i-2] == WordSeparator.COMMA) { i++; continue; }
|
||||
if (sentence.separators[i-1] == WordSeparator.COMMA) { i+=2; continue; }
|
||||
if (!sentence.words[i-2].endsWith("'s")) { continue; }
|
||||
if (!sentence.words[i-3].endsWith("'s")) { continue; }
|
||||
|
||||
if (topWords.contains(sentence.stemmedWords[i])
|
||||
&& !sentence.words[i].endsWith("'s") && topWords.contains(sentence.stemmedWords[i-3])) {
|
||||
if (topWords.contains(sentence.stemmedWords[i-1]) && topWords.contains(sentence.stemmedWords[i-2])) {
|
||||
spans.add(new WordSpan(i-3, i + 1));
|
||||
}
|
||||
else if (topWords.contains(sentence.stemmedWords[i-1]) && isJoiner(sentence, i-2)) {
|
||||
spans.add(new WordSpan(i-3, i + 1));
|
||||
}
|
||||
else if (isJoiner(sentence, i-2) && sentence.posTags[i-1].equals("DT")) {
|
||||
spans.add(new WordSpan(i-3, i + 1));
|
||||
}
|
||||
else if (isJoiner(sentence, i-2) && isJoiner(sentence, i-1)) {
|
||||
spans.add(new WordSpan(i-3, i + 1));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return spans.toArray(WordSpan[]::new);
|
||||
}
|
||||
|
||||
private boolean isName(int i, DocumentSentence sentence, Set<String> topWords) {
|
||||
if (!topWords.isEmpty()) {
|
||||
String posTag = sentence.posTags[i];
|
||||
@ -293,7 +194,6 @@ public class KeywordExtractor {
|
||||
|
||||
String posTag = sentence.posTags[i];
|
||||
|
||||
// if (posTag.startsWith("N") || posTag.startsWith("V") || posTag.startsWith("R") || posTag.startsWith("J"))
|
||||
return (posTag.startsWith("N") || "VBN".equals(posTag)) && !sentence.isStopWord(i);
|
||||
}
|
||||
|
||||
|
@ -98,7 +98,6 @@ public class SentenceExtractor {
            }
        }

        TObjectIntHashMap<String> counts = calculateWordCounts(textSentences);

        return new DocumentLanguageData(textSentences, extractSentencesFromString(title.toLowerCase()), counts);
@ -13,23 +13,22 @@ public record KeywordMetadata(HashSet<String> titleKeywords,
                              HashSet<String> namesKeywords,
                              HashMap<String, KeywordCounter.WordFrequencyData> wordsTfIdf,
                              HashMap<String, Integer> positionMask,
                              EnumSet<EdgePageWordFlags> flagsTemplate,
                              int quality
                              EnumSet<EdgePageWordFlags> wordFlagsTemplate
)
{

    private static final KeywordCounter.WordFrequencyData empty = new KeywordCounter.WordFrequencyData(0, 0);
    private static final int TF_IDF_HIGH_LIMIT = 64;

    public KeywordMetadata(double quality, EnumSet<EdgePageWordFlags> flags) {
    public KeywordMetadata(EnumSet<EdgePageWordFlags> flags) {
        this(new HashSet<>(50), new HashSet<>(10), new HashSet<>(50),
                new HashMap<>(15_000),
                new HashMap<>(10_000),
                flags,
                (int)(-quality));
                flags);
    }

    public KeywordMetadata(double quality) {
        this(quality, EnumSet.noneOf(EdgePageWordFlags.class));
    public KeywordMetadata() {
        this(EnumSet.noneOf(EdgePageWordFlags.class));
    }

    public long forWord(EnumSet<EdgePageWordFlags> flagsTemplate, String stemmed) {
@ -48,11 +47,7 @@ public record KeywordMetadata(HashSet<String> titleKeywords,

        int positions = positionMask.getOrDefault(stemmed, 0);

        return new EdgePageWordMetadata(tfidf.tfIdfNormalized(), positions, quality, tfidf.count(), flags).encode();
    }

    public int quality() {
        return -quality;
        return new EdgePageWordMetadata(tfidf.tfIdfNormalized(), positions, tfidf.count(), flags).encode();
    }

}
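With the quality field dropped from KeywordMetadata, the per-word metadata long is now encoded from tf-idf, position mask, count and flags alone, and callers construct the record from a flag set instead of a document quality score. A sketch of how a call site changes; the stemmed word and the old quality variable are illustrative, not taken from this commit:

    // Before (removed): quality was folded into the record and every encoded word.
    // KeywordMetadata metadata = new KeywordMetadata(documentQuality, EnumSet.noneOf(EdgePageWordFlags.class));

    // After: only the word-flags template is passed.
    KeywordMetadata metadata = new KeywordMetadata(EnumSet.noneOf(EdgePageWordFlags.class));
    long encoded = metadata.forWord(metadata.wordFlagsTemplate(), "exampl");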
@ -1,859 +0,0 @@
|
||||
package nu.marginalia.util.multimap;
|
||||
|
||||
import com.upserve.uppend.blobs.NativeIO;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.util.btree.BTreeQueryBuffer;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.RandomAccessFile;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.LongBuffer;
|
||||
import java.nio.MappedByteBuffer;
|
||||
import java.nio.channels.FileChannel;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.ArrayList;
|
||||
|
||||
import static java.nio.channels.FileChannel.MapMode.READ_ONLY;
|
||||
import static java.nio.channels.FileChannel.MapMode.READ_WRITE;
|
||||
import static nu.marginalia.util.FileSizeUtil.readableSize;
|
||||
|
||||
|
||||
public class MultimapFileLong implements AutoCloseable, MultimapFileLongSlice {
|
||||
|
||||
private final ArrayList<LongBuffer> buffers = new ArrayList<>();
|
||||
private final ArrayList<MappedByteBuffer> mappedByteBuffers = new ArrayList<>();
|
||||
private final FileChannel.MapMode mode;
|
||||
private final int bufferSize;
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
|
||||
private final FileChannel channel;
|
||||
|
||||
private final long mapSize;
|
||||
private final long fileLength;
|
||||
private long mappedSize;
|
||||
final static long WORD_SIZE = 8;
|
||||
|
||||
private NativeIO.Advice defaultAdvice = null;
|
||||
|
||||
public static MultimapFileLong forReading(Path file) throws IOException {
|
||||
long fileSize = Files.size(file);
|
||||
int bufferSize = getBufferSize(fileSize, false);
|
||||
|
||||
return new MultimapFileLong(file.toFile(), READ_ONLY, Files.size(file), bufferSize);
|
||||
}
|
||||
|
||||
public static MultimapFileLong forOutput(Path file, long estimatedSize) throws IOException {
|
||||
return new MultimapFileLong(file.toFile(), READ_WRITE, 0, getBufferSize(estimatedSize, true));
|
||||
}
|
||||
|
||||
private static int getBufferSize(long totalSize, boolean write) {
|
||||
int defaultBig = 2<<23;
|
||||
if (totalSize > Integer.MAX_VALUE/WORD_SIZE) {
|
||||
return defaultBig;
|
||||
}
|
||||
else if (write && totalSize < 8*1024*1024) {
|
||||
return 8*1024*1024;
|
||||
}
|
||||
else {
|
||||
return (int) Math.min(totalSize, defaultBig);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public MultimapFileLong(File file,
|
||||
FileChannel.MapMode mode,
|
||||
long mapSize,
|
||||
int bufferSize) throws IOException {
|
||||
|
||||
this(new RandomAccessFile(file, translateToRAFMode(mode)), mode, mapSize, bufferSize);
|
||||
}
|
||||
|
||||
private static String translateToRAFMode(FileChannel.MapMode mode) {
|
||||
if (READ_ONLY.equals(mode)) {
|
||||
return "r";
|
||||
} else if (READ_WRITE.equals(mode)) {
|
||||
return "rw";
|
||||
}
|
||||
return "rw";
|
||||
}
|
||||
|
||||
|
||||
public MultimapFileLong(RandomAccessFile file,
|
||||
FileChannel.MapMode mode,
|
||||
long mapSizeBytes,
|
||||
int bufferSizeWords) throws IOException {
|
||||
this.mode = mode;
|
||||
this.bufferSize = bufferSizeWords;
|
||||
this.mapSize = mapSizeBytes;
|
||||
this.fileLength = file.length();
|
||||
|
||||
channel = file.getChannel();
|
||||
mappedSize = 0;
|
||||
|
||||
logger.trace("Creating multimap file size = {} / buffer size = {}, mode = {}",
|
||||
readableSize(mapSizeBytes), readableSize(8L*bufferSizeWords), mode);
|
||||
}
|
||||
|
||||
public MultimapSearcherBase createSearcher() {
|
||||
return new MultimapSearcherBase(this);
|
||||
}
|
||||
public MultimapSorter createSorter(Path tmpFile, int internalSortLimit, int minStepSize) {
|
||||
return new MultimapSorter(this, tmpFile, internalSortLimit, minStepSize);
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
public void advice(NativeIO.Advice advice) {
|
||||
this.defaultAdvice = advice;
|
||||
for (var buffer : mappedByteBuffers) {
|
||||
NativeIO.madvise(buffer, advice);
|
||||
}
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
public void advice0(NativeIO.Advice advice) {
|
||||
NativeIO.madvise(mappedByteBuffers.get(0), advice);
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
public void adviceRange(NativeIO.Advice advice, long startLongs, long lengthLongs) {
|
||||
long endLongs = (startLongs+lengthLongs);
|
||||
|
||||
if (endLongs >= mappedSize)
|
||||
grow(endLongs);
|
||||
|
||||
|
||||
int startIdx = (int)(startLongs / bufferSize);
|
||||
int endIdx = (int)(endLongs / bufferSize);
|
||||
|
||||
if (startIdx != endIdx) {
|
||||
long offsetStart = (startLongs % bufferSize) * WORD_SIZE;
|
||||
NativeIO.madviseRange(mappedByteBuffers.get(startIdx), advice, offsetStart, (int) (bufferSize * WORD_SIZE - offsetStart));
|
||||
for (int i = startIdx+1; i < endIdx; i++) {
|
||||
NativeIO.madviseRange(mappedByteBuffers.get(i), advice, 0, (int)(bufferSize * WORD_SIZE));
|
||||
}
|
||||
NativeIO.madviseRange(mappedByteBuffers.get(endIdx), advice, 0, (int)((endIdx % bufferSize) * WORD_SIZE));
|
||||
}
|
||||
else {
|
||||
var buff = mappedByteBuffers.get(startIdx);
|
||||
NativeIO.madviseRange(buff, advice, (startLongs % bufferSize) * WORD_SIZE, (int) (lengthLongs * WORD_SIZE));
|
||||
}
|
||||
}
|
||||
|
||||
public void pokeRange(long offset, long length) {
|
||||
for (long i = 0; i < length; i += 4096/8) {
|
||||
get(offset + i);
|
||||
}
|
||||
}
|
||||
|
||||
public void force() {
|
||||
logger.trace("Forcing");
|
||||
|
||||
for (MappedByteBuffer buffer: mappedByteBuffers) {
|
||||
buffer.force();
|
||||
}
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
public void grow(long posIdxRequired) {
|
||||
if (posIdxRequired*WORD_SIZE > mapSize && mode == READ_ONLY) {
|
||||
throw new IndexOutOfBoundsException(posIdxRequired + " (max " + mapSize + ")");
|
||||
}
|
||||
logger.trace("Growing to encompass {}i/{}b", posIdxRequired, posIdxRequired*WORD_SIZE);
|
||||
long start;
|
||||
if (buffers.isEmpty()) {
|
||||
start = 0;
|
||||
}
|
||||
else {
|
||||
start = (long) buffers.size() * bufferSize;
|
||||
}
|
||||
for (long posIdx = start; posIdxRequired >= posIdx; posIdx += bufferSize) {
|
||||
long posBytes = posIdx * WORD_SIZE;
|
||||
long bzBytes;
|
||||
if (mode == READ_ONLY) {
|
||||
bzBytes = Math.min(WORD_SIZE*bufferSize, mapSize - posBytes);
|
||||
}
|
||||
else {
|
||||
bzBytes = WORD_SIZE*bufferSize;
|
||||
}
|
||||
logger.trace("Allocating {}-{}", posBytes, posBytes+bzBytes);
|
||||
|
||||
var buffer = channel.map(mode, posBytes, bzBytes);
|
||||
|
||||
if (defaultAdvice != null) {
|
||||
NativeIO.madvise(buffer, defaultAdvice);
|
||||
}
|
||||
|
||||
buffers.add(buffer.asLongBuffer());
|
||||
mappedByteBuffers.add(buffer);
|
||||
|
||||
mappedSize += bzBytes/WORD_SIZE;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public long size() {
|
||||
return fileLength;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void put(long idx, long val) {
|
||||
if (idx >= mappedSize)
|
||||
grow(idx);
|
||||
|
||||
try {
|
||||
buffers.get((int)(idx / bufferSize)).put((int) (idx % bufferSize), val);
|
||||
}
|
||||
catch (IndexOutOfBoundsException ex) {
|
||||
logger.error("Index out of bounds {} -> {}:{} cap {}", idx, buffers.get((int)(idx / bufferSize)), idx % bufferSize,
|
||||
buffers.get((int)(idx / bufferSize)).capacity());
|
||||
throw new RuntimeException(ex);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public long get(long idx) {
|
||||
if (idx < 0)
|
||||
throw new IllegalArgumentException("get("+idx+")");
|
||||
|
||||
if (idx >= mappedSize)
|
||||
grow(idx);
|
||||
|
||||
try {
|
||||
return buffers.get((int)(idx / bufferSize)).get((int)(idx % bufferSize));
|
||||
}
|
||||
catch (IndexOutOfBoundsException ex) {
|
||||
logger.error("Index out of bounds {} -> {}:{} cap {}", idx, buffers.get((int)(idx / bufferSize)), idx % bufferSize,
|
||||
buffers.get((int)(idx / bufferSize)).capacity());
|
||||
throw new RuntimeException(ex);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public void read(long[] vals, long idx) {
|
||||
read(vals, vals.length, idx);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void read(long[] vals, int n, long idx) {
|
||||
if (idx+n >= mappedSize) {
|
||||
grow(idx+n);
|
||||
}
|
||||
|
||||
int iN = (int)((idx + n) / bufferSize);
|
||||
|
||||
for (int i = 0; i < n; ) {
|
||||
int i0 = (int)((idx + i) / bufferSize);
|
||||
int bufferOffset = (int) ((idx+i) % bufferSize);
|
||||
var buffer = buffers.get(i0);
|
||||
|
||||
final int l;
|
||||
|
||||
if (i0 < iN) l = bufferSize - bufferOffset;
|
||||
else l = Math.min(n - i, bufferSize - bufferOffset);
|
||||
|
||||
buffer.get(bufferOffset, vals, i, l);
|
||||
i+=l;
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public void read(LongBuffer vals, long idx) {
|
||||
int n = vals.limit() - vals.position();
|
||||
if (idx+n >= mappedSize) {
|
||||
grow(idx+n);
|
||||
}
|
||||
int iN = (int)((idx + n) / bufferSize);
|
||||
|
||||
for (int i = 0; i < n; ) {
|
||||
int i0 = (int)((idx + i) / bufferSize);
|
||||
|
||||
int bufferOffset = (int) ((idx+i) % bufferSize);
|
||||
var buffer = buffers.get(i0);
|
||||
|
||||
final int l;
|
||||
|
||||
if (i0 < iN) l = bufferSize - bufferOffset;
|
||||
else l = Math.min(n - i, bufferSize - bufferOffset);
|
||||
|
||||
vals.put(vals.position() + i, buffer, bufferOffset, l);
|
||||
i+=l;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public void write(long[] vals, long idx) {
|
||||
write(vals, vals.length, idx);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void write(long[] vals, int n, long idx) {
|
||||
if (idx+n >= mappedSize) {
|
||||
grow(idx+n);
|
||||
}
|
||||
|
||||
int iN = (int)((idx + n) / bufferSize);
|
||||
|
||||
for (int i = 0; i < n; ) {
|
||||
int i0 = (int)((idx + i) / bufferSize);
|
||||
int bufferOffset = (int) ((idx+i) % bufferSize);
|
||||
var buffer = buffers.get(i0);
|
||||
|
||||
final int l;
|
||||
|
||||
if (i0 < iN) l = bufferSize - bufferOffset;
|
||||
else l = Math.min(n - i, bufferSize - bufferOffset);
|
||||
|
||||
buffer.put(bufferOffset, vals, i, l);
|
||||
i+=l;
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public void write(LongBuffer vals, long idx) {
|
||||
int n = vals.limit() - vals.position();
|
||||
if (idx+n >= mappedSize) {
|
||||
grow(idx+n);
|
||||
}
|
||||
int iN = (int)((idx + n) / bufferSize);
|
||||
|
||||
for (int i = 0; i < n; ) {
|
||||
int i0 = (int)((idx + i) / bufferSize);
|
||||
|
||||
int bufferOffset = (int) ((idx+i) % bufferSize);
|
||||
var buffer = buffers.get(i0);
|
||||
|
||||
final int l;
|
||||
|
||||
if (i0 < iN) l = bufferSize - bufferOffset;
|
||||
else l = Math.min(n - i, bufferSize - bufferOffset);
|
||||
|
||||
buffer.put(bufferOffset, vals, vals.position() + i, l);
|
||||
i+=l;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public void write(LongBuffer vals, int n, long idx) {
|
||||
if (idx+n >= mappedSize) {
|
||||
grow(idx+n);
|
||||
}
|
||||
int iN = (int)((idx + n) / bufferSize);
|
||||
|
||||
for (int i = 0; i < n; ) {
|
||||
int i0 = (int)((idx + i) / bufferSize);
|
||||
|
||||
int bufferOffset = (int) ((idx+i) % bufferSize);
|
||||
var buffer = buffers.get(i0);
|
||||
|
||||
final int l;
|
||||
|
||||
if (i0 < iN) l = bufferSize - bufferOffset;
|
||||
else l = Math.min(n - i, bufferSize - bufferOffset);
|
||||
|
||||
buffer.put(bufferOffset, vals, vals.position() + i, l);
|
||||
i+=l;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public void swapn(int n, long idx1, long idx2) {
|
||||
for (int i = 0; i < n; i++)
|
||||
swap(idx1+i, idx2+i);
|
||||
}
|
||||
|
||||
private void swap(long idx1, long idx2) {
|
||||
LongBuffer buff1 = buffers.get((int)(idx1 / bufferSize));
|
||||
final int o1 = (int) (idx1 % bufferSize);
|
||||
|
||||
LongBuffer buff2 = buffers.get((int)(idx2 / bufferSize));
|
||||
final int o2 = (int) (idx2 % bufferSize);
|
||||
|
||||
long tmp = buff1.get(o1);
|
||||
buff1.put(o1, buff2.get(o2));
|
||||
buff2.put(o2, tmp);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void setRange(long idx, int n, long val) {
|
||||
if (n == 0) return;
|
||||
|
||||
if (idx+n >= mappedSize) {
|
||||
grow(idx+n);
|
||||
}
|
||||
int iN = (int)((idx + n) / bufferSize);
|
||||
|
||||
for (int i = 0; i < n; ) {
|
||||
int i0 = (int)((idx + i) / bufferSize);
|
||||
|
||||
int bufferOffset = (int) ((idx+i) % bufferSize);
|
||||
var buffer = buffers.get(i0);
|
||||
|
||||
final int l;
|
||||
|
||||
if (i0 < iN) l = bufferSize - bufferOffset;
|
||||
else l = Math.min(n - i, bufferSize - bufferOffset);
|
||||
|
||||
for (int p = 0; p < l; p++) {
|
||||
buffer.put(bufferOffset + p, val);
|
||||
}
|
||||
|
||||
i+=l;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public void transferFromFileChannel(FileChannel sourceChannel, long destOffset, long sourceStart, long sourceEnd) throws IOException {
|
||||
|
||||
int length = (int)(sourceEnd - sourceStart);
|
||||
|
||||
if (destOffset+length >= mappedSize) {
|
||||
grow(destOffset+length);
|
||||
}
|
||||
|
||||
int i0 = (int)((destOffset) / bufferSize);
|
||||
int iN = (int)((destOffset + length) / bufferSize);
|
||||
|
||||
int numBuffers = iN - i0 + 1;
|
||||
ByteBuffer[] buffers = new ByteBuffer[numBuffers];
|
||||
for (int i = 0; i < numBuffers; i++) {
|
||||
buffers[i] = mappedByteBuffers.get(i0 + i);
|
||||
buffers[i].clear();
|
||||
}
|
||||
if (i0 != iN) {
|
||||
int startBuf0 = (int) ((destOffset) % bufferSize) * 8;
|
||||
int endBuf0 = buffers[0].capacity() - (int) (destOffset % bufferSize) * 8;
|
||||
int endBufN = (int)((destOffset + length) % bufferSize)*8;
|
||||
buffers[0] = buffers[0].slice(startBuf0, endBuf0);
|
||||
buffers[numBuffers-1] = buffers[numBuffers-1].slice(0, endBufN);
|
||||
}
|
||||
else {
|
||||
buffers[0] = buffers[0].slice((int) (destOffset % bufferSize) * 8, 8*length);
|
||||
}
|
||||
|
||||
sourceChannel.position(sourceStart*8);
|
||||
|
||||
long twb = 0;
|
||||
while (twb < length * 8L) {
|
||||
long rb = sourceChannel.read(buffers, 0, buffers.length);
|
||||
if (rb < 0)
|
||||
throw new IOException();
|
||||
twb += rb;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public long binarySearchInternal(long key, long fromIndex, int step, long n, long mask) {
|
||||
if (fromIndex + n*step >= mappedSize)
|
||||
grow(fromIndex + n*step);
|
||||
|
||||
long low = 0;
|
||||
long high = n - 1;
|
||||
|
||||
if (isSameBuffer(fromIndex, fromIndex+step*n)) {
|
||||
int idx = (int)(fromIndex / bufferSize);
|
||||
var buffer = buffers.get(idx);
|
||||
|
||||
while (low <= high) {
|
||||
long mid = (low + high) >>> 1;
|
||||
long off = fromIndex + mid*step;
|
||||
long midVal = buffer.get((int)(off % bufferSize)) & mask;
|
||||
|
||||
if (midVal < key)
|
||||
low = mid + 1;
|
||||
else if (midVal > key)
|
||||
high = mid - 1;
|
||||
else
|
||||
return fromIndex + mid*step;
|
||||
}
|
||||
}
|
||||
else {
|
||||
while (low <= high) {
|
||||
long mid = (low + high) >>> 1;
|
||||
long off = fromIndex + mid*step;
|
||||
long midVal = buffers.get((int)(off / bufferSize)).get((int)(off % bufferSize)) & mask;
|
||||
|
||||
if (midVal < key)
|
||||
low = mid + 1;
|
||||
else if (midVal > key)
|
||||
high = mid - 1;
|
||||
else
|
||||
return fromIndex + mid*step;
|
||||
}
|
||||
}
|
||||
|
||||
return -1L-(fromIndex + high*step);
|
||||
}
|
||||
|
||||
@Override
|
||||
public long binarySearchInternal(long key, long fromIndex, long n, long mask) {
|
||||
if (fromIndex + n >= mappedSize)
|
||||
grow(fromIndex + n);
|
||||
|
||||
long low = 0;
|
||||
long high = n - 1;
|
||||
|
||||
if (isSameBuffer(fromIndex, fromIndex+n)) {
|
||||
int idx = (int)(fromIndex / bufferSize);
|
||||
var buffer = buffers.get(idx);
|
||||
|
||||
while (low <= high) {
|
||||
long mid = (low + high) >>> 1;
|
||||
long off = fromIndex + mid;
|
||||
long midVal = buffer.get((int)(off % bufferSize)) & mask;
|
||||
|
||||
if (midVal < key)
|
||||
low = mid + 1;
|
||||
else if (midVal > key)
|
||||
high = mid - 1;
|
||||
else
|
||||
return fromIndex + mid;
|
||||
}
|
||||
}
|
||||
else {
|
||||
while (low <= high) {
|
||||
long mid = (low + high) >>> 1;
|
||||
long off = fromIndex + mid;
|
||||
long midVal = buffers.get((int)(off / bufferSize)).get((int)(off % bufferSize)) & mask;
|
||||
|
||||
if (midVal < key)
|
||||
low = mid + 1;
|
||||
else if (midVal > key)
|
||||
high = mid - 1;
|
||||
else
|
||||
return fromIndex + mid;
|
||||
}
|
||||
}
|
||||
|
||||
return -1L-(fromIndex + high);
|
||||
}
|
||||
|
||||
|
||||
|
||||
@Override
|
||||
public long binarySearchInternal(long key, long fromIndex, long n) {
|
||||
if (fromIndex + n >= mappedSize)
|
||||
grow(fromIndex + n);
|
||||
|
||||
long low = 0;
|
||||
long high = n - 1;
|
||||
|
||||
if (isSameBuffer(fromIndex, fromIndex+n)) {
|
||||
int idx = (int)(fromIndex / bufferSize);
|
||||
var buffer = buffers.get(idx);
|
||||
|
||||
|
||||
while (low <= high) {
|
||||
long mid = (low + high) >>> 1;
|
||||
long off = fromIndex + mid;
|
||||
long midVal = buffer.get((int)(off % bufferSize));
|
||||
|
||||
if (midVal < key)
|
||||
low = mid + 1;
|
||||
else if (midVal > key)
|
||||
high = mid - 1;
|
||||
else
|
||||
return fromIndex + mid;
|
||||
}
|
||||
}
|
||||
else {
|
||||
while (low <= high) {
|
||||
long mid = (low + high) >>> 1;
|
||||
long off = fromIndex + mid;
|
||||
long midVal = buffers.get((int)(off / bufferSize)).get((int)(off % bufferSize));
|
||||
|
||||
if (midVal < key)
|
||||
low = mid + 1;
|
||||
else if (midVal > key)
|
||||
high = mid - 1;
|
||||
else
|
||||
return fromIndex + mid;
|
||||
}
|
||||
}
|
||||
|
||||
return -1L-(fromIndex + high);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public long binarySearchUpperInternal(long key, long fromIndex, long n) {
|
||||
if (fromIndex + n >= mappedSize)
|
||||
grow(fromIndex + n);
|
||||
|
||||
long low = 0;
|
||||
long high = n - 1;
|
||||
|
||||
if (isSameBuffer(fromIndex, fromIndex+n)) {
|
||||
int idx = (int)(fromIndex / bufferSize);
|
||||
var buffer = buffers.get(idx);
|
||||
|
||||
|
||||
while (low <= high) {
|
||||
long mid = (low + high) >>> 1;
|
||||
long off = fromIndex + mid;
|
||||
long midVal = buffer.get((int)(off % bufferSize));
|
||||
|
||||
if (midVal < key)
|
||||
low = mid + 1;
|
||||
else if (midVal > key)
|
||||
high = mid - 1;
|
||||
else
|
||||
return fromIndex + mid;
|
||||
}
|
||||
}
|
||||
else {
|
||||
while (low <= high) {
|
||||
long mid = (low + high) >>> 1;
|
||||
long off = fromIndex + mid;
|
||||
long midVal = buffers.get((int)(off / bufferSize)).get((int)(off % bufferSize));
|
||||
|
||||
if (midVal < key)
|
||||
low = mid + 1;
|
||||
else if (midVal > key)
|
||||
high = mid - 1;
|
||||
else
|
||||
return fromIndex + mid;
|
||||
}
|
||||
}
|
||||
|
||||
return fromIndex + low;
|
||||
}
|
||||
|
||||
private boolean isSameBuffer(long a, long b) {
|
||||
return a / bufferSize == b/bufferSize;
|
||||
}
|
||||
|
||||
@Override
|
||||
public long quickSortPartition(int wordSize, long low, long high) {
|
||||
if (high >= mappedSize)
|
||||
grow(high + wordSize - 1);
|
||||
|
||||
if (isSameBuffer(low, high + wordSize - 1)) {
|
||||
// Specialization that circumvents the need for expensive calls to
|
||||
// MultimapFileLong.get() in the most common scenario
|
||||
|
||||
return quickSortPartitionSameBuffer(wordSize, low, high);
|
||||
}
|
||||
else {
|
||||
return quickSortPartitionDifferentBuffers(wordSize, low, high);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void insertionSort(int wordSize, long start, int n) {
|
||||
if (start + n + wordSize - 1 >= mappedSize)
|
||||
grow(start + n + wordSize - 1);
|
||||
|
||||
if (n <= 1) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (isSameBuffer(start, start + (long)n*wordSize-1L)) {
|
||||
final var buffer = buffers.get((int) (start / bufferSize));
|
||||
int off = (int) (start % bufferSize);
|
||||
|
||||
for (int i = 1; i < n; i++) {
|
||||
long key = buffer.get(off + i * wordSize);
|
||||
|
||||
int j = i - 1;
|
||||
while (j >= 0 && buffer.get(off + wordSize*j) > key) {
|
||||
for (int w = 0; w < wordSize; w++) {
|
||||
long tmp = buffer.get(off+wordSize*j+w);
|
||||
buffer.put(off+wordSize*j+w, buffer.get(off+wordSize*(j+1)+w));
|
||||
buffer.put(off+wordSize*(j+1)+w, tmp);
|
||||
}
|
||||
j--;
|
||||
}
|
||||
buffer.put(off + (j+1) * wordSize, key);
|
||||
}
|
||||
}
|
||||
else for (int i = 1; i < n; i++) {
|
||||
long key = get(start + (long) i * wordSize);
|
||||
|
||||
int j = i - 1;
|
||||
while (j >= 0 && get(start + (long)wordSize*j) > key) {
|
||||
swapn(wordSize, start + (long)wordSize*j, start + (long)wordSize*(j+1));
|
||||
j--;
|
||||
}
|
||||
put(start + (long) (j+1) * wordSize, key);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private long quickSortPartitionDifferentBuffers(int wordSize, long low, long high) {
|
||||
|
||||
long pivotPoint = ((low + high) / (2L*wordSize)) * wordSize;
|
||||
long pivot = get(pivotPoint);
|
||||
|
||||
long i = low - wordSize;
|
||||
long j = high + wordSize;
|
||||
|
||||
for (;;) {
|
||||
do {
|
||||
i+=wordSize;
|
||||
} while (get(i) < pivot);
|
||||
|
||||
do {
|
||||
j-=wordSize;
|
||||
}
|
||||
while (get(j) > pivot);
|
||||
|
||||
if (i >= j) return j;
|
||||
else swapn(wordSize, i, j);
|
||||
}
|
||||
}
|
||||
|
||||
private long quickSortPartitionSameBuffer(int wordSize, long low, long high) {
|
||||
|
||||
final var buffer = buffers.get((int) (low / bufferSize));
|
||||
|
||||
final long pivotPointLong = ((low + high) / (2L*wordSize)) * wordSize;
|
||||
final int pivotPoint = (int) (pivotPointLong % bufferSize);
|
||||
|
||||
final long pivot = buffer.get(pivotPoint);
|
||||
|
||||
int j = (int) (high % bufferSize) + wordSize;
|
||||
int i = (int) (low % bufferSize) - wordSize;
|
||||
|
||||
long j0 = high + wordSize - j;
|
||||
|
||||
for (;;) {
|
||||
do {
|
||||
i+=wordSize;
|
||||
} while (buffer.get(i) < pivot);
|
||||
|
||||
do {
|
||||
j-=wordSize;
|
||||
}
|
||||
while (buffer.get(j) > pivot);
|
||||
|
||||
if (i >= j) return j0 + j;
|
||||
else {
|
||||
for (int w = 0; w < wordSize; w++) {
|
||||
long tmp = buffer.get(i+w);
|
||||
buffer.put(i+w, buffer.get(j+w));
|
||||
buffer.put(j+w, tmp);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
public void retain(BTreeQueryBuffer buffer, long boundary, long searchStart, long numEntries, long mask, int stepSize) {
|
||||
|
||||
final long end = searchStart + stepSize * numEntries;
|
||||
if (end < mappedSize) {
|
||||
grow(end);
|
||||
}
|
||||
|
||||
long bv = buffer.currentValue() & mask;
|
||||
long av = get(searchStart) & mask;
|
||||
long pos = searchStart;
|
||||
|
||||
int bi = (int)(searchStart / bufferSize);
|
||||
int bo = (int)(searchStart % bufferSize);
|
||||
|
||||
LongBuffer data = buffers.get(bi);
|
||||
|
||||
while (bv <= boundary && buffer.hasMore()) {
|
||||
if (bv < av) {
|
||||
if (!buffer.rejectAndAdvance()) break;
|
||||
bv = buffer.currentValue() & mask;
|
||||
continue;
|
||||
}
|
||||
else if (bv == av) {
|
||||
if (!buffer.retainAndAdvance()) break;
|
||||
bv = buffer.currentValue() & mask;
|
||||
continue;
|
||||
}
|
||||
|
||||
pos += stepSize;
|
||||
if (pos < end) {
|
||||
bo += stepSize;
|
||||
if (bo >= bufferSize) {
|
||||
data = buffers.get(++bi);
|
||||
bo = 0;
|
||||
}
|
||||
av = data.get(bo) & mask;
|
||||
}
|
||||
else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
public void reject(BTreeQueryBuffer buffer, long boundary, long searchStart, long numEntries, long mask, int stepSize) {
|
||||
|
||||
final long end = searchStart + stepSize * numEntries;
|
||||
if (end < mappedSize) {
|
||||
grow(end);
|
||||
}
|
||||
|
||||
long bv = buffer.currentValue() & mask;
|
||||
long av = get(searchStart) & mask;
|
||||
long pos = searchStart;
|
||||
|
||||
int bi = (int)(searchStart / bufferSize);
|
||||
int bo = (int)(searchStart % bufferSize);
|
||||
|
||||
LongBuffer data = buffers.get(bi);
|
||||
|
||||
while (bv <= boundary && buffer.hasMore()) {
|
||||
if (bv < av) {
|
||||
if (!buffer.retainAndAdvance()) break;
|
||||
bv = buffer.currentValue() & mask;
|
||||
continue;
|
||||
}
|
||||
else if (bv == av) {
|
||||
if (!buffer.rejectAndAdvance()) break;
|
||||
bv = buffer.currentValue() & mask;
|
||||
continue;
|
||||
}
|
||||
|
||||
pos += stepSize;
|
||||
if (pos < end) {
|
||||
bo += stepSize;
|
||||
if (bo >= bufferSize) {
|
||||
data = buffers.get(++bi);
|
||||
bo = 0;
|
||||
}
|
||||
av = data.get(bo) & mask;
|
||||
}
|
||||
else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() throws IOException {
|
||||
force();
|
||||
|
||||
mappedByteBuffers.clear();
|
||||
buffers.clear();
|
||||
|
||||
channel.close();
|
||||
|
||||
// I want to believe
|
||||
System.runFinalization();
|
||||
System.gc();
|
||||
}
|
||||
}
|
@ -1,120 +0,0 @@
package nu.marginalia.util.multimap;

import java.io.IOException;
import java.nio.LongBuffer;
import java.nio.channels.FileChannel;

public class MultimapFileLongOffsetSlice implements MultimapFileLongSlice {
    private final long off;
    private final MultimapFileLongSlice map;

    public MultimapFileLongOffsetSlice(MultimapFileLongSlice map, long off) {
        this.off = off;
        this.map = map;
    }

    @Override
    public long size() {
        return map.size() - off;
    }

    @Override
    public void put(long idx, long val) {
        map.put(off+idx, val);
    }

    @Override
    public void setRange(long idx, int n, long val) {
        map.setRange(off+idx, n, val);
    }

    @Override
    public long get(long idx) {
        return map.get(off+idx);
    }

    @Override
    public void read(long[] vals, long idx) {
        map.read(vals, idx+off);
    }

    @Override
    public void read(long[] vals, int n, long idx) {
        map.read(vals, n, idx+off);
    }

    @Override
    public void read(LongBuffer vals, long idx) { map.read(vals, idx+off); }

    @Override
    public void write(long[] vals, long idx) {
        map.write(vals, idx+off);
    }

    @Override
    public void write(long[] vals, int n, long idx) {
        map.write(vals, n, idx+off);
    }

    @Override
    public void write(LongBuffer vals, long idx) {
        map.write(vals, idx+off);
    }

    @Override
    public void write(LongBuffer vals, int n, long idx) {
        map.write(vals, n,idx+off);
    }

    @Override
    public void swapn(int n, long idx1, long idx2) {
        map.swapn(n, idx1+off, idx2+off);
    }


    @Override
    public void transferFromFileChannel(FileChannel sourceChannel, long destOffset, long sourceStart, long sourceEnd)
            throws IOException {
        map.transferFromFileChannel(sourceChannel, destOffset + off, sourceStart, sourceEnd);
    }

    @Override
    public MultimapFileLongSlice atOffset(long off) {
        // If we don't override this, the default implementation would build a pyramid of
        // MultimapFileLongSlice(MultimapFileLongSlice(MultimapFileLongSlice(MultimapFileLongSlice(MultimapFileLongSlice(...)))
        // if this is called iteratively (e.g. to walk over a file)

        return new MultimapFileLongOffsetSlice(map, this.off + off);
    }

    @Override
    public long binarySearchInternal(long key, long fromIndex, int step, long n, long mask) {
        throw new UnsupportedOperationException();
    }

    @Override
    public long binarySearchInternal(long key, long fromIndex, long n, long mask) {
        throw new UnsupportedOperationException();
    }

    @Override
    public long binarySearchInternal(long key, long fromIndex, long n) {
        throw new UnsupportedOperationException();
    }

    @Override
    public long binarySearchUpperInternal(long key, long fromIndex, long n) {
        throw new UnsupportedOperationException();

    }

    @Override
    public long quickSortPartition(int wordSize, long low, long highInclusive) {
        return map.quickSortPartition(wordSize, low+off, highInclusive+off);
    }

    @Override
    public void insertionSort(int wordSize, long start, int n) {
        map.insertionSort(wordSize, start+off, n);
    }
}
@ -1,47 +0,0 @@
package nu.marginalia.util.multimap;

import java.io.IOException;
import java.nio.LongBuffer;
import java.nio.channels.FileChannel;

public interface MultimapFileLongSlice {
    long size();

    void put(long idx, long val);

    void setRange(long idx, int n, long val);

    long get(long idx);

    void read(long[] vals, long idx);

    void read(long[] vals, int n, long idx);

    void read(LongBuffer vals, long idx);

    void write(long[] vals, long idx);

    void write(long[] vals, int n, long idx);

    void write(LongBuffer vals, long idx);

    void write(LongBuffer vals, int n, long idx);

    void swapn(int n, long idx1, long idx2);

    void transferFromFileChannel(FileChannel sourceChannel, long destOffset, long sourceStart, long sourceEnd) throws IOException;

    default MultimapFileLongSlice atOffset(long off) {
        return new MultimapFileLongOffsetSlice(this, off);
    }
    long binarySearchInternal(long key, long fromIndex, int step, long n, long mask);
    long binarySearchInternal(long key, long fromIndex, long n, long mask);

    long binarySearchInternal(long key, long fromIndex, long n);

    long binarySearchUpperInternal(long key, long fromIndex, long n);

    long quickSortPartition(int wordSize, long low, long highInclusive);

    void insertionSort(int wordSize, long start, int n);
}
@ -1,80 +0,0 @@
package nu.marginalia.util.multimap;

public interface MultimapSearcher {
    long binarySearchLower(long key, long fromIndex, long n);
    long binarySearch(long key, long fromIndex, long n);

    static MultimapSearcher forContext(MultimapFileLongSlice slice, long mask, int stepSize) {
        if (mask == ~0L && stepSize == 1) {
            return new SimpleMultimapSearcher(new MultimapSearcherBase(slice));
        }
        else if (stepSize == 1) {
            return new MaskedMultimapSearcher(new MultimapSearcherBase(slice), mask);
        }
        else {
            return new SteppingMaskedMultimapSearcher(new MultimapSearcherBase(slice), mask, stepSize);
        }
    }
}

class SimpleMultimapSearcher implements MultimapSearcher {
    private final MultimapSearcherBase base;

    SimpleMultimapSearcher(MultimapSearcherBase base) {
        this.base = base;
    }

    @Override
    public long binarySearchLower(long key, long fromIndex, long n) {
        return base.binarySearchLower(key, fromIndex, n);
    }

    @Override
    public long binarySearch(long key, long fromIndex, long n) {
        return base.binarySearch(key, fromIndex, n);
    }
}

class MaskedMultimapSearcher implements MultimapSearcher {
    private final MultimapSearcherBase base;
    private final long mask;

    MaskedMultimapSearcher(MultimapSearcherBase base, long mask) {
        this.base = base;
        this.mask = mask;
    }

    @Override
    public long binarySearchLower(long key, long fromIndex, long n) {
        return base.binarySearchLower(key, fromIndex, n, mask);
    }

    @Override
    public long binarySearch(long key, long fromIndex, long n) {
        return base.binarySearch(key, fromIndex, n, mask);
    }
}

class SteppingMaskedMultimapSearcher implements MultimapSearcher {
    private final MultimapSearcherBase base;
    private final long mask;
    private final int step;

    SteppingMaskedMultimapSearcher(MultimapSearcherBase base, long mask, int step) {
        this.base = base;
        this.mask = mask;
        this.step = step;
    }

    @Override
    public long binarySearchLower(long key, long fromIndex, long n) {
        return base.binarySearchLower(key, fromIndex, step, n, mask);
    }

    @Override
    public long binarySearch(long key, long fromIndex, long n) {
        return base.binarySearch(key, fromIndex, step, n, mask);
    }
}
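MultimapSearcher.forContext above is a small strategy selector: an all-ones mask with unit stride yields the plain searcher, masking alone yields the masked variant, and anything else pays for both masking and striding. A hedged usage fragment; slice, someKey and numberOfPairs are assumed variables, and the mask value is only an example:

```java
// Fragment, not runnable on its own: `slice` is some MultimapFileLongSlice over sorted entries.
long mask = 0x0000_FFFF_FFFF_FFFFL;   // example: compare only the low 48 bits of each word
int stepSize = 2;                     // example: entries stored as (key, payload) pairs

var searcher = MultimapSearcher.forContext(slice, mask, stepSize);

// With stepSize != 1 this resolves to SteppingMaskedMultimapSearcher, so the binary
// search strides over whole pairs and masks each key before comparing it.
long pos = searcher.binarySearch(someKey, /* fromIndex */ 0, /* n */ numberOfPairs);
```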
@ -1,86 +0,0 @@
package nu.marginalia.util.multimap;

import lombok.experimental.Delegate;

public class MultimapSearcherBase {
    @Delegate
    private final MultimapFileLongSlice mmf;

    public MultimapSearcherBase(MultimapFileLongSlice mmf) {
        this.mmf = mmf;
    }

    public boolean binarySearchTest(long key, long fromIndex, long n) {
        long low = 0;
        long high = n - 1;

        while (low <= high) {
            long mid = (low + high) >>> 1;
            long midVal = get(fromIndex + mid);

            if (midVal < key)
                low = mid + 1;
            else if (midVal > key)
                high = mid - 1;
            else
                return true;
        }
        return false;
    }

    public long binarySearchLower(long key, long fromIndex, long n) {
        return mmf.binarySearchUpperInternal(key, fromIndex, n);
    }

    public long binarySearchLower(long key, long fromIndex, long n, long mask) {
        long low = 0;
        long high = n - 1;

        while (low <= high) {
            long mid = (low + high) >>> 1;
            long midVal = get(fromIndex + mid) & mask;

            if (midVal < key)
                low = mid + 1;
            else if (midVal > key)
                high = mid - 1;
            else
                return fromIndex + mid;
        }
        return fromIndex + low;
    }

    public long binarySearchLower(long key, long fromIndex, int step, long n, long mask) {
        long low = 0;
        long high = n - 1;

        while (low <= high) {
            long mid = (low + high) >>> 1;
            long midVal = get(fromIndex + mid*step) & mask;

            if (midVal < key)
                low = mid + 1;
            else if (midVal > key)
                high = mid - 1;
            else
                return fromIndex + mid*step;
        }
        return fromIndex + low*step;
    }

    public long binarySearch(long key, long fromIndex, long n) {
        return mmf.binarySearchInternal(key, fromIndex, n);
    }

    public long binarySearch(long key, long fromIndex, long n, long mask) {
        return mmf.binarySearchInternal(key, fromIndex, n, mask);
    }

    public long binarySearch(long key, long fromIndex, int step, long n, long mask) {
        return mmf.binarySearchInternal(key, fromIndex, step, n, mask);
    }
}
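Note the contract of the masked binarySearchLower above: on a hit it returns the position of the match, and on a miss it returns fromIndex + low, i.e. the insertion point, rather than a negative sentinel. The same loop restated over a plain long[] (wordSize 1, no mask) as a runnable sketch:

```java
// Same search loop as MultimapSearcherBase.binarySearchLower, over an ordinary array.
final class LowerBoundSketch {
    static long lowerBound(long[] data, long key, long fromIndex, long n) {
        long low = 0;
        long high = n - 1;

        while (low <= high) {
            long mid = (low + high) >>> 1;
            long midVal = data[(int) (fromIndex + mid)];

            if (midVal < key)       low = mid + 1;
            else if (midVal > key)  high = mid - 1;
            else                    return fromIndex + mid;
        }
        return fromIndex + low;   // insertion point when the key is absent
    }

    public static void main(String[] args) {
        long[] data = {10, 20, 30, 40};
        System.out.println(lowerBound(data, 30, 0, data.length)); // 2 (exact hit)
        System.out.println(lowerBound(data, 25, 0, data.length)); // 2 (insertion point)
    }
}
```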
@ -1,151 +0,0 @@
|
||||
package nu.marginalia.util.multimap;
|
||||
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.RandomAccessFile;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.LongBuffer;
|
||||
import java.nio.channels.FileChannel;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
|
||||
import static nu.marginalia.util.multimap.MultimapFileLong.WORD_SIZE;
|
||||
|
||||
public class MultimapSorter {
|
||||
private final Path tmpFileDir;
|
||||
private final MultimapFileLongSlice multimapFileLong;
|
||||
private final LongBuffer buffer;
|
||||
private final int internalSortLimit;
|
||||
private final int wordSize;
|
||||
|
||||
private static final Logger logger = LoggerFactory.getLogger(MultimapSorter.class);
|
||||
|
||||
public MultimapSorter(MultimapFileLongSlice multimapFileLong, Path tmpFileDir, int internalSortLimit, int wordSize) {
|
||||
this.multimapFileLong = multimapFileLong;
|
||||
this.tmpFileDir = tmpFileDir;
|
||||
this.internalSortLimit = internalSortLimit;
|
||||
this.wordSize = wordSize;
|
||||
buffer = ByteBuffer.allocateDirect(internalSortLimit * wordSize * 8).asLongBuffer();
|
||||
}
|
||||
|
||||
public void sortRange(long start, long end) throws IOException {
|
||||
if (end - start < internalSortLimit) {
|
||||
quickSortLH(start, end - wordSize);
|
||||
}
|
||||
else {
|
||||
mergeSort(start, (int) (end - start));
|
||||
}
|
||||
|
||||
if (MultimapSorter.class.desiredAssertionStatus()) {
|
||||
for (long lp = start + wordSize; lp < end; lp += wordSize) {
|
||||
if (multimapFileLong.get(lp - wordSize) > multimapFileLong.get(lp)) {
|
||||
|
||||
logger.error("Sort contract breached [{}:{} ({}), ws={}, <isl={}, bc={}]",
|
||||
start, end,
|
||||
end - start,
|
||||
wordSize, end - start < internalSortLimit,
|
||||
buffer.capacity());
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public void mergeSort(long start, int lengthLongs) throws IOException {
|
||||
if (lengthLongs == 1)
|
||||
return;
|
||||
|
||||
if (lengthLongs < buffer.capacity()) {
|
||||
mergeSort(start, lengthLongs, buffer);
|
||||
}
|
||||
else {
|
||||
Path tmpFile = Files.createTempFile(tmpFileDir,"sort-"+start+"-"+(start+lengthLongs), ".dat");
|
||||
try (var raf = new RandomAccessFile(tmpFile.toFile(), "rw"); var channel = raf.getChannel()) {
|
||||
var workBuffer =
|
||||
channel.map(FileChannel.MapMode.READ_WRITE, 0, wordSize * lengthLongs * WORD_SIZE)
|
||||
.asLongBuffer();
|
||||
mergeSort(start, lengthLongs, workBuffer);
|
||||
}
|
||||
finally {
|
||||
tmpFile.toFile().delete();
|
||||
}
|
||||
}
|
||||
}
|
||||
private void mergeSort(long start, int length, LongBuffer workBuffer) throws IOException {
|
||||
int width = Math.min(Integer.highestOneBit(length), Integer.highestOneBit(buffer.capacity()));
|
||||
|
||||
// Do in-memory sorting up until internalSortLimit first
|
||||
for (int i = 0; i < length; i += width) {
|
||||
quickSort(start + i, Math.min(width, length-i));
|
||||
}
|
||||
|
||||
// Then finish with merge sort
|
||||
for (; width < length; width*=2) {
|
||||
|
||||
for (int i = 0; i < length; i += 2*width) {
|
||||
merge(start, i, Math.min(i+width, length), Math.min(i+2*width, length), workBuffer);
|
||||
}
|
||||
|
||||
workBuffer.clear();
|
||||
multimapFileLong.write(workBuffer, length, start);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
void merge(long offset, int left, int right, int end, LongBuffer workBuffer) {
|
||||
long idxL = left;
|
||||
long idxR = right;
|
||||
|
||||
for (int putPos = left; putPos < end; putPos+= wordSize) {
|
||||
final long bufferL = multimapFileLong.get(offset+idxL);
|
||||
final long bufferR = multimapFileLong.get(offset+idxR);
|
||||
|
||||
if (idxL < right && (idxR >= end || bufferL < bufferR)) {
|
||||
workBuffer.put(putPos, bufferL);
|
||||
for (int s = 1; s < wordSize; s++) {
|
||||
workBuffer.put(putPos + s, multimapFileLong.get(offset + idxL + s));
|
||||
}
|
||||
idxL+= wordSize;
|
||||
}
|
||||
else {
|
||||
workBuffer.put(putPos, bufferR);
|
||||
for (int s = 1; s < wordSize; s++) {
|
||||
workBuffer.put(putPos + s, multimapFileLong.get(offset + idxR + s));
|
||||
}
|
||||
idxR+= wordSize;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public void insertionSort(long start, int n) {
|
||||
multimapFileLong.insertionSort(wordSize, start, n);
|
||||
}
|
||||
|
||||
private void swap(long a, long b) {
|
||||
multimapFileLong.swapn(wordSize, a, b);
|
||||
}
|
||||
|
||||
public void quickSort(long start, long length) {
|
||||
quickSortLH(start, start + length - wordSize);
|
||||
|
||||
}
|
||||
public void quickSortLH(long low, long highInclusive) {
|
||||
|
||||
if (low >= 0 && highInclusive >= 0 && low < highInclusive) {
|
||||
|
||||
if (highInclusive - low < 32) {
|
||||
multimapFileLong.insertionSort(wordSize, low, (int) ((wordSize + highInclusive - low) / wordSize));
|
||||
}
|
||||
else {
|
||||
long p = multimapFileLong.quickSortPartition(wordSize, low, highInclusive);
|
||||
|
||||
quickSortLH(low, p);
|
||||
quickSortLH(p + wordSize, highInclusive);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
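MultimapSorter.sortRange above is a bottom-up external sort: ranges that fit the direct buffer are quicksorted in place, larger ranges are cut into buffer-sized runs that are sorted and then merged pairwise with doubling width through a work buffer (memory-mapped to a temp file when even that is too large), with an assertion pass checking the result when assertions are enabled. The same control flow on a plain long[] with wordSize 1, as an illustrative sketch:

```java
import java.util.Arrays;

// Toy restatement of MultimapSorter's run-sort-then-merge control flow (wordSize = 1).
final class BottomUpMergeSortSketch {
    static void sort(long[] data, int runWidth) {
        int length = data.length;

        // Sort each run in memory first (the real code quicksorts within the mapped file).
        for (int i = 0; i < length; i += runWidth) {
            Arrays.sort(data, i, Math.min(i + runWidth, length));
        }

        // Then merge neighbouring runs of doubling width through a work buffer.
        long[] work = new long[length];
        for (int width = runWidth; width < length; width *= 2) {
            for (int i = 0; i < length; i += 2 * width) {
                merge(data, i, Math.min(i + width, length), Math.min(i + 2 * width, length), work);
            }
            System.arraycopy(work, 0, data, 0, length);
        }
    }

    static void merge(long[] data, int left, int right, int end, long[] work) {
        int idxL = left, idxR = right;
        for (int putPos = left; putPos < end; putPos++) {
            if (idxL < right && (idxR >= end || data[idxL] <= data[idxR]))
                work[putPos] = data[idxL++];
            else
                work[putPos] = data[idxR++];
        }
    }

    public static void main(String[] args) {
        long[] data = {9, 3, 7, 1, 8, 2, 6, 4, 5, 0};
        sort(data, 4);
        System.out.println(Arrays.toString(data)); // [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
    }
}
```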
@ -4,17 +4,18 @@ import gnu.trove.list.TIntList;
|
||||
import gnu.trove.list.array.TIntArrayList;
|
||||
import gnu.trove.map.hash.TIntIntHashMap;
|
||||
import gnu.trove.map.hash.TIntObjectHashMap;
|
||||
import it.unimi.dsi.fastutil.ints.IntArrays;
|
||||
import it.unimi.dsi.fastutil.ints.IntComparator;
|
||||
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
|
||||
import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl;
|
||||
import org.roaringbitmap.RoaringBitmap;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.*;
|
||||
import java.util.Arrays;
|
||||
import java.util.Comparator;
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
import java.util.function.IntToDoubleFunction;
|
||||
import java.util.stream.IntStream;
|
||||
import it.unimi.dsi.fastutil.ints.IntArrays;
|
||||
|
||||
public abstract class RankingAlgorithm {
|
||||
protected final TIntObjectHashMap<RankingDomainData> domainsById = new TIntObjectHashMap<>();
|
||||
@ -154,7 +155,7 @@ public abstract class RankingAlgorithm {
|
||||
}
|
||||
|
||||
|
||||
public TIntList pageRank(int resultCount) {
|
||||
public RoaringBitmap pageRank(int resultCount) {
|
||||
RankVector rank = new RankVector(1.d / domainsById.size());
|
||||
|
||||
int iter_max = 100;
|
||||
@ -176,7 +177,7 @@ public abstract class RankingAlgorithm {
|
||||
return rank.getRanking(resultCount);
|
||||
}
|
||||
|
||||
public TIntList pageRankWithPeripheralNodes(int resultCount) {
|
||||
public RoaringBitmap pageRankWithPeripheralNodes(int resultCount) {
|
||||
RankVector rank = new RankVector(1.d / domainsById.size());
|
||||
|
||||
int iter_max = 100;
|
||||
@ -303,7 +304,7 @@ public abstract class RankingAlgorithm {
|
||||
return list;
|
||||
}
|
||||
|
||||
public TIntList getRanking(int numResults) {
|
||||
public RoaringBitmap getRanking(int numResults) {
|
||||
if (numResults < 0) {
|
||||
numResults = domainIdToIndex.size();
|
||||
}
|
||||
@ -311,7 +312,7 @@ public abstract class RankingAlgorithm {
|
||||
numResults = rank.length;
|
||||
}
|
||||
|
||||
TIntArrayList list = new TIntArrayList(numResults);
|
||||
RoaringBitmap list = new RoaringBitmap();
|
||||
|
||||
int[] nodes = new int[rank.length];
|
||||
Arrays.setAll(nodes, i->i);
|
||||
|
@ -2,7 +2,7 @@ package nu.marginalia.util.ranking;

import com.google.inject.Inject;
import com.zaxxer.hikari.HikariDataSource;
import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl;
import nu.marginalia.wmsa.edge.dbcommon.EdgeDomainBlacklistImpl;
import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

@ -87,9 +87,9 @@ public class RankingDomainFetcher {

    public void domainsByPattern(String pattern, IntConsumer idConsumer) {
        try (var conn = dataSource.getConnection();
             var stmt = conn.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME LIKE ?")) {
            stmt.setString(1, pattern);
            var rsp = stmt.executeQuery();
             var stmt = conn.createStatement()) {
            // This is sourced from a config file --v
            var rsp = stmt.executeQuery("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME LIKE '" + pattern + "'");
            while (rsp.next()) {
                idConsumer.accept(rsp.getInt(1));
            }
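For context, domainsByPattern streams the IDs of domains whose name matches a LIKE pattern into the supplied consumer; with this change the pattern is interpolated directly into the statement, on the grounds that it comes from the ranking configuration rather than user input. A hypothetical call site (the pattern and consumer are made-up examples):

```java
// Hypothetical usage; the pattern string would come from a ranking settings file.
fetcher.domainsByPattern("%.blogspot.com", domainId -> peripheralNodes.add(domainId));
```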
@ -14,7 +14,7 @@ import nu.marginalia.util.ranking.RankingAlgorithm;
|
||||
import nu.marginalia.util.ranking.RankingDomainData;
|
||||
import nu.marginalia.util.ranking.RankingDomainFetcher;
|
||||
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
|
||||
import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl;
|
||||
import nu.marginalia.wmsa.edge.dbcommon.EdgeDomainBlacklistImpl;
|
||||
import org.jetbrains.annotations.NotNull;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
@ -5,7 +5,7 @@ import lombok.SneakyThrows;
|
||||
import nu.marginalia.util.ranking.BuggyStandardPageRank;
|
||||
import nu.marginalia.util.ranking.RankingDomainFetcher;
|
||||
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
|
||||
import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl;
|
||||
import nu.marginalia.wmsa.edge.dbcommon.EdgeDomainBlacklistImpl;
|
||||
import org.mariadb.jdbc.Driver;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
@ -51,14 +51,14 @@ public class UpdateDomainRanksTool {
|
||||
rankMax = spr.size()*2;
|
||||
uploader.start();
|
||||
|
||||
spr.pageRankWithPeripheralNodes(rankMax).forEach(i -> {
|
||||
var rankData = spr.pageRankWithPeripheralNodes(rankMax);
|
||||
for (int i : rankData) {
|
||||
try {
|
||||
uploadQueue.put(i);
|
||||
} catch (InterruptedException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
return true;
|
||||
});
|
||||
}
|
||||
|
||||
long end = System.currentTimeMillis();
|
||||
running = false;
|
||||
|
@ -5,7 +5,7 @@ import lombok.SneakyThrows;
|
||||
import nu.marginalia.util.ranking.BetterReversePageRank;
|
||||
import nu.marginalia.util.ranking.RankingDomainFetcher;
|
||||
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
|
||||
import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl;
|
||||
import nu.marginalia.wmsa.edge.dbcommon.EdgeDomainBlacklistImpl;
|
||||
import org.mariadb.jdbc.Driver;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
@ -41,14 +41,14 @@ public class UpdateDomainRanksTool2 {
|
||||
rankMax = rpr.size();
|
||||
uploader.start();
|
||||
|
||||
rpr.pageRankWithPeripheralNodes(rankMax).forEach(i -> {
|
||||
var rankData = rpr.pageRankWithPeripheralNodes(rankMax);
|
||||
for (int i : rankData) {
|
||||
try {
|
||||
uploadQueue.put(i);
|
||||
} catch (InterruptedException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
return true;
|
||||
});
|
||||
}
|
||||
|
||||
long end = System.currentTimeMillis();
|
||||
running = false;
|
||||
|
@ -7,7 +7,7 @@ import gnu.trove.set.hash.TIntHashSet;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.util.AndCardIntSet;
|
||||
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
|
||||
import nu.marginalia.wmsa.edge.data.dao.EdgeDataStoreDaoImpl;
|
||||
import nu.marginalia.wmsa.edge.dbcommon.EdgeDataStoreDaoImpl;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeDomain;
|
||||
import nu.marginalia.wmsa.edge.model.id.EdgeId;
|
||||
import org.roaringbitmap.RoaringBitmap;
|
||||
|
@ -0,0 +1,246 @@
|
||||
package nu.marginalia.util.tool;
|
||||
|
||||
import it.unimi.dsi.fastutil.ints.IntOpenHashSet;
|
||||
import it.unimi.dsi.fastutil.ints.IntSet;
|
||||
import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.util.AndCardIntSet;
|
||||
import org.roaringbitmap.RoaringBitmap;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.*;
|
||||
import java.util.function.Consumer;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.IntStream;
|
||||
|
||||
import static nu.marginalia.util.AndCardIntSet.andCardinality;
|
||||
import static nu.marginalia.util.AndCardIntSet.weightedProduct;
|
||||
|
||||
public class EdgeWordWordConsineSimilarityMain {
|
||||
final Object2IntOpenHashMap<String> stringIds;
|
||||
final AndCardIntSet[] dToSMap;
|
||||
final float[] weights;
|
||||
final boolean useWeights = false;
|
||||
|
||||
enum Direction {
|
||||
S_TO_D,
|
||||
D_TO_S
|
||||
}
|
||||
|
||||
final Direction direction = Direction.D_TO_S;
|
||||
|
||||
public EdgeWordWordConsineSimilarityMain(Path dataFile) throws IOException {
|
||||
System.out.println("String IDs");
|
||||
stringIds = mapStringsToIds(dataFile);
|
||||
|
||||
System.out.println("DtoS Map");
|
||||
dToSMap = constructDtoSMap(dataFile, stringIds);
|
||||
|
||||
System.out.println("Weights");
|
||||
|
||||
if (useWeights) {
|
||||
weights = new float[stringIds.size()];
|
||||
for (int i = 0; i < stringIds.size(); i++) {
|
||||
weights[i] = getWeight(i);
|
||||
}
|
||||
}
|
||||
else {
|
||||
weights = null;
|
||||
}
|
||||
|
||||
System.out.println("Ready");
|
||||
}
|
||||
|
||||
private Object2IntOpenHashMap<String> mapStringsToIds(Path dataFile) throws IOException {
|
||||
Object2IntOpenHashMap<String> stringIds = new Object2IntOpenHashMap<>(15_000_000);
|
||||
|
||||
try (var lines = Files.lines(dataFile, StandardCharsets.UTF_8)) {
|
||||
lines.forEach(line -> {
|
||||
int tab = line.indexOf('\t');
|
||||
if (tab <= 0)
|
||||
return;
|
||||
|
||||
// direction doesn't matter here
|
||||
String from = line.substring(0, tab);
|
||||
String to = line.substring(tab + 1);
|
||||
|
||||
stringIds.putIfAbsent(from, stringIds.size());
|
||||
stringIds.putIfAbsent(to, stringIds.size());
|
||||
});
|
||||
}
|
||||
return stringIds;
|
||||
}
|
||||
|
||||
private AndCardIntSet[] constructDtoSMap(Path dataFile, Object2IntOpenHashMap<String> stringIds) throws IOException {
|
||||
Map<Integer, RoaringBitmap> tmpMap = new HashMap<>(15_000_000);
|
||||
|
||||
try (var lines = Files.lines(dataFile, StandardCharsets.UTF_8)) {
|
||||
lines.forEach(line -> {
|
||||
int tab = line.indexOf('\t');
|
||||
if (tab <= 0) return;
|
||||
|
||||
String from, to;
|
||||
if (direction == Direction.S_TO_D) {
|
||||
from = line.substring(0, tab);
|
||||
to = line.substring(tab + 1);
|
||||
}
|
||||
else {
|
||||
from = line.substring(tab + 1);
|
||||
to = line.substring(0, tab);
|
||||
}
|
||||
|
||||
tmpMap.computeIfAbsent(stringIds.getInt(to), this::createBitmapWithSelf).add(stringIds.getInt(from));
|
||||
});
|
||||
}
|
||||
|
||||
AndCardIntSet[] dToSMap = new AndCardIntSet[stringIds.size()];
|
||||
tmpMap.entrySet().stream()
|
||||
.filter(e -> isEligible(e.getValue()))
|
||||
.forEach(e -> dToSMap[e.getKey()] = AndCardIntSet.of(e.getValue()));
|
||||
|
||||
return dToSMap;
|
||||
}
|
||||
|
||||
private boolean isEligible(RoaringBitmap value) {
|
||||
int cardinality = value.getCardinality();
|
||||
|
||||
return cardinality > 50;
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
public void tryDomains(String... word) {
|
||||
|
||||
System.out.println(Arrays.toString(word));
|
||||
|
||||
int[] domainIds = Arrays.stream(word).mapToInt(stringIds::getInt).toArray();
|
||||
|
||||
long start = System.currentTimeMillis();
|
||||
findAdjacentDtoS(new IntOpenHashSet(domainIds), similarities -> {
|
||||
Set<Integer> ids = similarities.similarities().stream().map(Similarity::id).collect(Collectors.toSet());
|
||||
|
||||
Map<Integer, String> reveseIds = new HashMap<>(similarities.similarities.size());
|
||||
|
||||
stringIds.forEach((str, id) -> {
|
||||
if (ids.contains(id)) {
|
||||
reveseIds.put(id, str);
|
||||
}
|
||||
});
|
||||
|
||||
for (var similarity : similarities.similarities()) {
|
||||
System.out.println(reveseIds.get(similarity.id) + "\t" + dToSMap[similarity.id].getCardinality() + "\t" + prettyPercent(similarity.value));
|
||||
}
|
||||
});
|
||||
|
||||
System.out.println(System.currentTimeMillis() - start);
|
||||
}
|
||||
|
||||
private String prettyPercent(double val) {
|
||||
return String.format("%2.2f%%", 100. * val);
|
||||
}
|
||||
|
||||
|
||||
public RoaringBitmap createBitmapWithSelf(int val) {
|
||||
var bm = new RoaringBitmap();
|
||||
bm.add(val);
|
||||
return bm;
|
||||
}
|
||||
|
||||
double cosineSimilarity(AndCardIntSet a, AndCardIntSet b) {
|
||||
double andCardinality = andCardinality(a, b);
|
||||
andCardinality /= Math.sqrt(a.getCardinality());
|
||||
andCardinality /= Math.sqrt(b.getCardinality());
|
||||
return andCardinality;
|
||||
}
|
||||
|
||||
double expensiveCosineSimilarity(AndCardIntSet a, AndCardIntSet b) {
|
||||
return weightedProduct(weights, a, b) / Math.sqrt(a.mulAndSum(weights) * b.mulAndSum(weights));
|
||||
}
|
||||
|
||||
float getWeight(int i) {
|
||||
var vector = dToSMap[i];
|
||||
|
||||
if (vector == null) return 1.0f;
|
||||
return 1.0f / (float) Math.log(2+vector.getCardinality());
|
||||
}
|
||||
|
||||
record Similarities(int id, List<Similarity> similarities) {};
|
||||
record Similarity(int id, double value) {};
|
||||
|
||||
@SneakyThrows
|
||||
private void findAdjacentDtoS(IntSet ids, Consumer<Similarities> andThen) {
|
||||
|
||||
|
||||
AndCardIntSet[] vectors = ids.intStream().mapToObj(id -> dToSMap[id]).toArray(AndCardIntSet[]::new);
|
||||
for (var vector : vectors) {
|
||||
if (null == vector)
|
||||
return;
|
||||
}
|
||||
|
||||
var vector = Arrays.stream(vectors).reduce(AndCardIntSet::and).orElseThrow();
|
||||
|
||||
List<Similarity> similarities = IntStream.range(0, dToSMap.length).parallel().mapToObj(
|
||||
id -> vectorSimilarity(ids, vector, id))
|
||||
.filter(Objects::nonNull)
|
||||
.sorted(Comparator.comparing(Similarity::value))
|
||||
.toList();
|
||||
|
||||
|
||||
andThen.accept(new Similarities(0, similarities));
|
||||
}
|
||||
|
||||
double cardinalityLimit = 0.1;
|
||||
|
||||
private Similarity vectorSimilarity(IntSet ids, AndCardIntSet vector, int id) {
|
||||
|
||||
/* The minimum cardinality a vector can have so that
 *
 *    a (x) b
 *    ------- < k    is given by k^2
 *    |a||b|
 *
 */
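// In other words: since |a ∩ b| <= min(|a|, |b|), a similarity of at least k requires the
// other vector's cardinality to be at least k^2 times this vector's -- hence the k^2 factor below.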
|
||||
|
||||
final double cardMin = Math.min(2, cardinalityLimit * cardinalityLimit * vector.getCardinality());
|
||||
|
||||
if (ids.contains(id) || id >= dToSMap.length)
|
||||
return null;
|
||||
|
||||
var otherVec = dToSMap[id];
|
||||
if (otherVec == null || otherVec.getCardinality() < cardMin)
|
||||
return null;
|
||||
|
||||
double similarity = cosineSimilarity(vector, otherVec);
|
||||
|
||||
if (similarity > 0.1) {
|
||||
if (useWeights) {
|
||||
var recalculated = expensiveCosineSimilarity(vector, otherVec);
|
||||
if (recalculated > 0.1) {
|
||||
return new Similarity(id, recalculated);
|
||||
}
|
||||
}
|
||||
else {
|
||||
return new Similarity(id, similarity);
|
||||
}
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
public static void main(String[] args) throws IOException {
|
||||
|
||||
var main = new EdgeWordWordConsineSimilarityMain(Path.of(args[0]));
|
||||
|
||||
for (;;) {
|
||||
String line = System.console().readLine("Words> ");
|
||||
if (line == null || line.isBlank()) {
|
||||
break;
|
||||
}
|
||||
|
||||
main.tryDomains(line.split("\\s+"));
|
||||
}
|
||||
}
|
||||
|
||||
}
|
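The similarity used throughout this tool is set-overlap cosine, |A ∩ B| / (√|A| · √|B|), with an optional per-dimension reweighting of 1 / log(2 + cardinality) when useWeights is enabled. A self-contained sketch of the unweighted quantity over ordinary sets; AndCardIntSet only exists to make the intersection cardinality cheap to compute:

```java
import java.util.Set;

// Same quantity as cosineSimilarity() above, over plain integer sets.
final class SetCosineSketch {
    static double cosine(Set<Integer> a, Set<Integer> b) {
        long common = a.stream().filter(b::contains).count();
        return common / (Math.sqrt(a.size()) * Math.sqrt(b.size()));
    }

    public static void main(String[] args) {
        Set<Integer> a = Set.of(1, 2, 3, 4);
        Set<Integer> b = Set.of(3, 4, 5, 6, 7, 8);
        System.out.printf("%.3f%n", cosine(a, b)); // 2 / (2 * 2.449) ≈ 0.408
    }
}
```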
@ -0,0 +1,43 @@
|
||||
package nu.marginalia.util.tool;
|
||||
|
||||
import nu.marginalia.wmsa.edge.integration.wikipedia.WikipediaReader;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeDomain;
|
||||
import org.jsoup.Jsoup;
|
||||
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
|
||||
public class WikipediaInternalLinkExtractorMain {
|
||||
public static void main(String... args) throws InterruptedException {
|
||||
new WikipediaReader(args[0], new EdgeDomain("en.wikipedia.org"), wikipediaArticle -> {
|
||||
|
||||
|
||||
var doc = Jsoup.parse(wikipediaArticle.body);
|
||||
String path = wikipediaArticle.url.path.substring("/wiki/".length());
|
||||
|
||||
if (isIncluded(path)) {
|
||||
Set<String> seen = new HashSet<>(100);
|
||||
|
||||
for (var atag : doc.getElementsByTag("a")) {
|
||||
String href = atag.attr("href");
|
||||
|
||||
if (href.contains("#")) {
|
||||
href = href.substring(0, href.indexOf('#'));
|
||||
}
|
||||
|
||||
if (isIncluded(href) && href.length() > 2 && seen.add(href)) {
|
||||
System.out.println(path + "\t" + href);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}).join();
|
||||
}
|
||||
|
||||
private static boolean isIncluded(String href) {
|
||||
return !href.contains(":")
|
||||
&& !href.contains("/")
|
||||
&& !href.contains("%")
|
||||
&& !href.startsWith("#");
|
||||
}
|
||||
}
|
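The isIncluded filter above keeps only plain intra-wiki article links: anything with a namespace colon, a path separator, percent-encoding, or a bare fragment is dropped, and fragments on otherwise valid links are stripped before the check. Illustrative inputs and expected results, not taken from the commit:

```java
// Expected behaviour of the filter above on example hrefs (fragment, for illustration only).
isIncluded("Moon_landing");          // true  -- plain article link, kept
isIncluded("Category:Spaceflight");  // false -- namespace colon
isIncluded("wiki/Moon_landing");     // false -- contains a slash
isIncluded("Moon%20landing");        // false -- percent-encoding
isIncluded("#History");              // false -- bare fragment
```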
@ -2,6 +2,7 @@ package nu.marginalia.wmsa.api.model;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Getter;
|
||||
import nu.marginalia.wmsa.edge.index.model.EdgePageWordMetadata;
|
||||
import nu.marginalia.wmsa.edge.model.search.EdgeSearchResultKeywordScore;
|
||||
import nu.marginalia.wmsa.edge.model.search.EdgeUrlDetails;
|
||||
|
||||
@ -33,12 +34,12 @@ public class ApiSearchResult {
|
||||
for (var entries : bySet.values()) {
|
||||
List<ApiSearchResultQueryDetails> lst = new ArrayList<>();
|
||||
for (var entry : entries) {
|
||||
var metadata = entry.metadata();
|
||||
var metadata = new EdgePageWordMetadata(entry.encodedWordMetadata());
|
||||
if (metadata.isEmpty())
|
||||
continue outer;
|
||||
|
||||
Set<String> flags = metadata.flags().stream().map(Object::toString).collect(Collectors.toSet());
|
||||
lst.add(new ApiSearchResultQueryDetails(entry.keyword(), metadata.tfIdf(),metadata.count(), flags));
|
||||
Set<String> flags = metadata.flagSet().stream().map(Object::toString).collect(Collectors.toSet());
|
||||
lst.add(new ApiSearchResultQueryDetails(entry.keyword(), metadata.tfIdf(), metadata.count(), flags));
|
||||
}
|
||||
details.add(lst);
|
||||
}
|
||||
|
@ -14,7 +14,6 @@ import nu.marginalia.wmsa.podcasts.PodcastScraperMain;
|
||||
import nu.marginalia.wmsa.renderer.RendererMain;
|
||||
import nu.marginalia.wmsa.resource_store.ResourceStoreMain;
|
||||
import nu.marginalia.wmsa.smhi.scraper.SmhiScraperMain;
|
||||
import org.apache.logging.log4j.core.lookup.MainMapLookup;
|
||||
|
||||
import java.util.Map;
|
||||
import java.util.stream.Collectors;
|
||||
@ -78,7 +77,6 @@ public enum ServiceDescriptor {
|
||||
}
|
||||
|
||||
public static void main(String... args) {
|
||||
MainMapLookup.setMainArguments(args);
|
||||
Map<String, Command> functions = Stream.of(
|
||||
new ListCommand(),
|
||||
new StartCommand(),
|
||||
|
@ -4,7 +4,7 @@ import com.google.common.base.Strings;
|
||||
import com.google.inject.Inject;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.wmsa.edge.data.dao.EdgeDataStoreDao;
|
||||
import nu.marginalia.wmsa.edge.dbcommon.EdgeDataStoreDao;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeDomain;
|
||||
import nu.marginalia.wmsa.edge.model.id.EdgeId;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
|
@ -6,6 +6,7 @@ import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DocumentKeywor
|
||||
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DomainLink;
|
||||
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadProcessedDocument;
|
||||
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadProcessedDocumentWithError;
|
||||
import nu.marginalia.wmsa.edge.index.model.EdgePageDocumentsMetadata;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeDomain;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeUrl;
|
||||
import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState;
|
||||
@ -62,7 +63,7 @@ public class ConversionLog implements AutoCloseable, Interpreter {
|
||||
}
|
||||
|
||||
@Override
|
||||
public void loadKeywords(EdgeUrl url, DocumentKeywords[] words) {}
|
||||
public void loadKeywords(EdgeUrl url, EdgePageDocumentsMetadata metadata, DocumentKeywords words) {}
|
||||
|
||||
@Override
|
||||
public void loadDomainRedirect(DomainLink link) {}
|
||||
|
@ -27,7 +27,6 @@ public class ConverterModule extends AbstractModule {
|
||||
bind(Gson.class).toInstance(createGson());
|
||||
|
||||
bind(Double.class).annotatedWith(Names.named("min-document-quality")).toInstance(-15.);
|
||||
bind(Double.class).annotatedWith(Names.named("min-avg-document-quality")).toInstance(-25.);
|
||||
bind(Integer.class).annotatedWith(Names.named("min-document-length")).toInstance(250);
|
||||
bind(Integer.class).annotatedWith(Names.named("max-title-length")).toInstance(128);
|
||||
bind(Integer.class).annotatedWith(Names.named("max-summary-length")).toInstance(255);
|
||||
|
@ -8,6 +8,7 @@ import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DocumentKeywor
|
||||
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DomainLink;
|
||||
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadProcessedDocument;
|
||||
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadProcessedDocumentWithError;
|
||||
import nu.marginalia.wmsa.edge.index.model.EdgePageDocumentsMetadata;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeDomain;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeUrl;
|
||||
import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState;
|
||||
@ -120,7 +121,8 @@ public class LoadInstructionWriter {
|
||||
}
|
||||
|
||||
@Override
|
||||
public void loadKeywords(EdgeUrl url, DocumentKeywords[] words) {}
|
||||
public void loadKeywords(EdgeUrl url, EdgePageDocumentsMetadata metadata, DocumentKeywords words) {
|
||||
}
|
||||
|
||||
@Override
|
||||
public void loadDomainRedirect(DomainLink link) {}
|
||||
|
@ -8,6 +8,8 @@ import okhttp3.Request;
|
||||
import okhttp3.RequestBody;
|
||||
import okio.BufferedSink;
|
||||
import org.jetbrains.annotations.Nullable;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.net.URL;
|
||||
@ -15,9 +17,9 @@ import java.nio.charset.Charset;
|
||||
import java.sql.SQLException;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
import static nu.marginalia.wmsa.edge.index.EdgeIndexService.DYNAMIC_BUCKET_LENGTH;
|
||||
|
||||
public class ReindexTriggerMain {
|
||||
private static final Logger logger = LoggerFactory.getLogger(ReindexTriggerMain.class);
|
||||
|
||||
public static void main(String... args) throws IOException, SQLException {
|
||||
var db = new DatabaseModule();
|
||||
@ -28,6 +30,7 @@ public class ReindexTriggerMain {
|
||||
.followRedirects(true)
|
||||
.build();
|
||||
|
||||
logger.info("Updating statistics");
|
||||
var updateStatistics = new UpdateDomainStatistics(db.provideConnection());
|
||||
updateStatistics.run();
|
||||
|
||||
@ -45,15 +48,10 @@ public class ReindexTriggerMain {
|
||||
}
|
||||
};
|
||||
|
||||
logger.info("Repartitioning");
|
||||
client.newCall(new Request.Builder().post(rb).url(new URL("http", args[0], ServiceDescriptor.EDGE_INDEX.port, "/ops/repartition")).build()).execute();
|
||||
|
||||
if (!Boolean.getBoolean("no-preconvert")) {
|
||||
client.newCall(new Request.Builder().post(rb).url(new URL("http", args[0], ServiceDescriptor.EDGE_INDEX.port, "/ops/preconvert")).build()).execute();
|
||||
}
|
||||
|
||||
for (int i = 0; i < DYNAMIC_BUCKET_LENGTH+1; i++) {
|
||||
client.newCall(new Request.Builder().post(rb).url(new URL("http", args[0], ServiceDescriptor.EDGE_INDEX.port, "/ops/reindex/" + i)).build()).execute();
|
||||
}
|
||||
logger.info("Reindexing");
|
||||
client.newCall(new Request.Builder().post(rb).url(new URL("http", args[0], ServiceDescriptor.EDGE_INDEX.port, "/ops/reindex")).build()).execute();
|
||||
|
||||
}
|
||||
|
||||
|
@ -7,8 +7,6 @@ import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadProcessedD
|
||||
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadProcessedDocumentWithError;
|
||||
import nu.marginalia.wmsa.edge.converting.model.ProcessedDocument;
|
||||
import nu.marginalia.wmsa.edge.converting.processor.logic.HtmlFeature;
|
||||
import nu.marginalia.wmsa.edge.index.model.IndexBlockType;
|
||||
import nu.marginalia.wmsa.edge.model.crawl.EdgePageWords;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
@ -41,18 +39,8 @@ public class DocumentsCompiler {
|
||||
var words = doc.words;
|
||||
|
||||
if (words != null) {
|
||||
|
||||
var wordsArray = words.values().stream()
|
||||
.filter(this::filterNonTransients)
|
||||
.map(DocumentKeywords::new)
|
||||
.toArray(DocumentKeywords[]::new);
|
||||
|
||||
ret.add(new LoadKeywords(doc.url, wordsArray));
|
||||
ret.add(new LoadKeywords(doc.url, doc.details.metadata, new DocumentKeywords(words)));
|
||||
}
|
||||
}
|
||||
|
||||
private boolean filterNonTransients(EdgePageWords words) {
|
||||
return words.block.type != IndexBlockType.TRANSIENT;
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -4,6 +4,7 @@ import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DocumentKeywor
|
||||
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DomainLink;
|
||||
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadProcessedDocument;
|
||||
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadProcessedDocumentWithError;
|
||||
import nu.marginalia.wmsa.edge.index.model.EdgePageDocumentsMetadata;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeDomain;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeUrl;
|
||||
import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState;
|
||||
@ -18,7 +19,7 @@ public interface Interpreter {
|
||||
void loadProcessedDocument(LoadProcessedDocument loadProcessedDocument);
|
||||
void loadProcessedDocumentWithError(LoadProcessedDocumentWithError loadProcessedDocumentWithError);
|
||||
|
||||
void loadKeywords(EdgeUrl url, DocumentKeywords[] words);
|
||||
void loadKeywords(EdgeUrl url, EdgePageDocumentsMetadata metadata, DocumentKeywords words);
|
||||
|
||||
void loadDomainRedirect(DomainLink link);
|
||||
}
|
||||
|
@ -1,18 +1,16 @@
|
||||
package nu.marginalia.wmsa.edge.converting.interpreter.instruction;
|
||||
|
||||
import nu.marginalia.wmsa.edge.index.model.EdgePageWordMetadata;
|
||||
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
|
||||
import nu.marginalia.wmsa.edge.model.crawl.EdgePageWords;
|
||||
|
||||
import java.util.Arrays;
|
||||
|
||||
public record DocumentKeywords(IndexBlock block,
|
||||
public record DocumentKeywords(
|
||||
String[] keywords,
|
||||
long[] metadata) {
|
||||
|
||||
public DocumentKeywords(EdgePageWords words) {
|
||||
this(words.block,
|
||||
words.words.toArray(String[]::new),
|
||||
this(words.words.toArray(String[]::new),
|
||||
words.metadata.toArray());
|
||||
}
|
||||
|
||||
@ -20,7 +18,7 @@ public record DocumentKeywords(IndexBlock block,
|
||||
public String toString() {
|
||||
StringBuilder sb = new StringBuilder();
|
||||
sb.append(getClass().getSimpleName());
|
||||
sb.append('[').append(block).append(", ");
|
||||
sb.append('[');
|
||||
for (int i = 0; i < keywords.length; i++) {
|
||||
sb.append("\n\t ");
|
||||
if (metadata[i] != 0) {
|
||||
@ -42,6 +40,6 @@ public record DocumentKeywords(IndexBlock block,
|
||||
}
|
||||
|
||||
public DocumentKeywords subList(int start, int end) {
|
||||
return new DocumentKeywords(block, Arrays.copyOfRange(keywords, start, end), Arrays.copyOfRange(metadata, start, end));
|
||||
return new DocumentKeywords(Arrays.copyOfRange(keywords, start, end), Arrays.copyOfRange(metadata, start, end));
|
||||
}
|
||||
}
|
||||
|
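After this change DocumentKeywords is just two parallel arrays, keyword strings and their packed metadata longs, and subList slices both in lockstep so keyword i keeps its metadata. A small illustrative sketch of that invariant (the values are made up):

```java
// Hypothetical values; the point is only that both arrays are sliced together.
String[] keywords = {"marginalia", "search", "engine", "index"};
long[]   metadata = {0x11L, 0x22L, 0x33L, 0x44L};

var all   = new DocumentKeywords(keywords, metadata);
var chunk = all.subList(1, 3);  // keywords {"search", "engine"}, metadata {0x22, 0x33}
```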
@ -1,12 +1,10 @@
|
||||
package nu.marginalia.util;
|
||||
|
||||
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DocumentKeywords;
|
||||
package nu.marginalia.wmsa.edge.converting.interpreter.instruction;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
|
||||
public class ListChunker {
|
||||
public class KeywordListChunker {
|
||||
|
||||
/** Chops data into a list of lists of max length size
|
||||
*
|
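The renamed KeywordListChunker keeps the contract its javadoc states: chop a list into consecutive sub-lists of at most size elements. A generic sketch of that behaviour, written fresh here since the class body falls outside the shown diff context:

```java
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

// Generic illustration of "chops data into a list of lists of max length size".
final class ChunkSketch {
    static <T> List<List<T>> chopped(List<T> data, int size) {
        if (data.isEmpty())
            return Collections.emptyList();

        List<List<T>> ret = new ArrayList<>(1 + data.size() / size);
        for (int i = 0; i < data.size(); i += size) {
            ret.add(data.subList(i, Math.min(i + size, data.size())));
        }
        return ret;
    }

    public static void main(String[] args) {
        System.out.println(chopped(List.of(1, 2, 3, 4, 5), 2)); // [[1, 2], [3, 4], [5]]
    }
}
```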
@ -3,20 +3,19 @@ package nu.marginalia.wmsa.edge.converting.interpreter.instruction;
|
||||
import nu.marginalia.wmsa.edge.converting.interpreter.Instruction;
|
||||
import nu.marginalia.wmsa.edge.converting.interpreter.InstructionTag;
|
||||
import nu.marginalia.wmsa.edge.converting.interpreter.Interpreter;
|
||||
import nu.marginalia.wmsa.edge.index.model.EdgePageDocumentsMetadata;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeUrl;
|
||||
|
||||
import java.util.Arrays;
|
||||
|
||||
public record LoadKeywords(EdgeUrl url, DocumentKeywords... words) implements Instruction {
|
||||
public record LoadKeywords(EdgeUrl url, EdgePageDocumentsMetadata metadata, DocumentKeywords words) implements Instruction {
|
||||
|
||||
@Override
|
||||
public void apply(Interpreter interpreter) {
|
||||
interpreter.loadKeywords(url, words);
|
||||
interpreter.loadKeywords(url, metadata, words);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean isNoOp() {
|
||||
return words.length == 0;
|
||||
return false;
|
||||
}
|
||||
|
||||
@Override
|
||||
@ -26,7 +25,7 @@ public record LoadKeywords(EdgeUrl url, DocumentKeywords... words) implements In
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return getClass().getSimpleName()+"["+ Arrays.toString(words)+"]";
|
||||
return getClass().getSimpleName()+"["+ words+"]";
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -5,6 +5,7 @@ import lombok.SneakyThrows;
|
||||
import nu.marginalia.wmsa.configuration.server.Context;
|
||||
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DocumentKeywords;
|
||||
import nu.marginalia.wmsa.edge.index.client.EdgeIndexWriterClient;
|
||||
import nu.marginalia.wmsa.edge.index.model.EdgePageDocumentsMetadata;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeUrl;
|
||||
import nu.marginalia.wmsa.edge.model.id.EdgeId;
|
||||
import org.slf4j.Logger;
|
||||
@ -19,7 +20,7 @@ public class IndexLoadKeywords implements Runnable {
|
||||
private final LinkedBlockingQueue<InsertTask> insertQueue = new LinkedBlockingQueue<>(32);
|
||||
private final EdgeIndexWriterClient client;
|
||||
|
||||
private record InsertTask(int urlId, int domainId, DocumentKeywords wordSet) {}
|
||||
private record InsertTask(int urlId, int domainId, EdgePageDocumentsMetadata metadata, DocumentKeywords wordSet) {}
|
||||
|
||||
private final Thread runThread;
|
||||
private volatile boolean canceled = false;
|
||||
@ -38,7 +39,7 @@ public class IndexLoadKeywords implements Runnable {
|
||||
while (!canceled) {
|
||||
var data = insertQueue.poll(1, TimeUnit.SECONDS);
|
||||
if (data != null) {
|
||||
client.putWords(Context.internal(), new EdgeId<>(data.domainId), new EdgeId<>(data.urlId), data.wordSet, index);
|
||||
client.putWords(Context.internal(), new EdgeId<>(data.domainId), new EdgeId<>(data.urlId), data.metadata(), data.wordSet, index);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -48,7 +49,7 @@ public class IndexLoadKeywords implements Runnable {
|
||||
runThread.join();
|
||||
}
|
||||
|
||||
public void load(LoaderData loaderData, EdgeUrl url, DocumentKeywords[] words) throws InterruptedException {
|
||||
public void load(LoaderData loaderData, EdgeUrl url, EdgePageDocumentsMetadata metadata, DocumentKeywords words) throws InterruptedException {
|
||||
int domainId = loaderData.getDomainId(url.domain);
|
||||
int urlId = loaderData.getUrlId(url);
|
||||
|
||||
@ -57,8 +58,6 @@ public class IndexLoadKeywords implements Runnable {
|
||||
return;
|
||||
}
|
||||
|
||||
for (var ws : words) {
|
||||
insertQueue.put(new InsertTask(urlId, domainId, ws));
|
||||
}
|
||||
insertQueue.put(new InsertTask(urlId, domainId, metadata, words));
|
||||
}
|
||||
}
|
||||
|
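IndexLoadKeywords decouples the loader from the index client with a small bounded queue: load() now enqueues a single InsertTask per document (carrying the new document metadata), and the run loop polls with a one-second timeout so cancellation is noticed promptly. A stripped-down sketch of that producer/consumer shape; the types are simplified stand-ins, not the project's API:

```java
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.TimeUnit;

// Simplified stand-in for the queueing pattern in IndexLoadKeywords.
final class QueueLoaderSketch {
    record InsertTask(int urlId, int domainId, long metadata, String[] keywords) {}

    private final LinkedBlockingQueue<InsertTask> insertQueue = new LinkedBlockingQueue<>(32);
    private volatile boolean canceled = false;

    // Producer: blocks once the consumer falls more than 32 documents behind.
    void load(InsertTask task) throws InterruptedException {
        insertQueue.put(task);
    }

    // Consumer: poll with a timeout so the cancellation flag is checked at least once a second.
    void run() throws InterruptedException {
        while (!canceled) {
            InsertTask task = insertQueue.poll(1, TimeUnit.SECONDS);
            if (task != null) {
                // the real code hands the task off to the index writer client here
            }
        }
    }

    void stop() { canceled = true; }
}
```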
@ -5,6 +5,7 @@ import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DocumentKeywor
|
||||
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DomainLink;
|
||||
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadProcessedDocument;
|
||||
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadProcessedDocumentWithError;
|
||||
import nu.marginalia.wmsa.edge.index.model.EdgePageDocumentsMetadata;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeDomain;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeUrl;
|
||||
import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState;
|
||||
@ -108,8 +109,8 @@ public class Loader implements Interpreter {
|
||||
}
|
||||
|
||||
@Override
|
||||
public void loadKeywords(EdgeUrl url, DocumentKeywords[] words) {
|
||||
logger.debug("loadKeywords(#{})", words.length);
|
||||
public void loadKeywords(EdgeUrl url, EdgePageDocumentsMetadata metadata, DocumentKeywords words) {
|
||||
logger.debug("loadKeywords()");
|
||||
|
||||
// This is a bit of a bandaid safeguard against a bug in
|
||||
// in the converter, shouldn't be necessary in the future
|
||||
@ -124,7 +125,7 @@ public class Loader implements Interpreter {
|
||||
}
|
||||
|
||||
try {
|
||||
indexLoadKeywords.load(data, url, words);
|
||||
indexLoadKeywords.load(data, url, metadata, words);
|
||||
} catch (InterruptedException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
|
@ -2,7 +2,7 @@ package nu.marginalia.wmsa.edge.converting.model;
|
||||
|
||||
import lombok.ToString;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeUrl;
|
||||
import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet;
|
||||
import nu.marginalia.wmsa.edge.model.crawl.EdgePageWords;
|
||||
import nu.marginalia.wmsa.edge.model.crawl.EdgeUrlState;
|
||||
|
||||
import java.util.OptionalDouble;
|
||||
@ -12,7 +12,7 @@ public class ProcessedDocument {
|
||||
public EdgeUrl url;
|
||||
|
||||
public ProcessedDocumentDetails details;
|
||||
public EdgePageWordSet words;
|
||||
public EdgePageWords words;
|
||||
|
||||
public EdgeUrlState state;
|
||||
public String stateReason;
|
||||